In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
import nltk
import nltk.sentiment.vader as vader

In [4]:
import spacy
# Keep a handle on the loaded pipeline: the bare call threw the model away
# right after loading it, so no later cell could use it.
nlp = spacy.load("en_core_web_sm")
# Still show the pipeline object as the cell output.
nlp

<spacy.lang.en.English at 0x2369b2f4a90>

In [9]:
# Read the headerless, comma-separated training file and name its two columns.
train = pd.read_csv(
    '.././raw_data/fulltrain.csv',
    sep=',',
    header=None,
    names=['Verdict', 'Text'],
)
# Column handles: 'Verdict' as y_train and 'Text' as X_train.
y_train = train['Verdict']
X_train = train['Text']

In [10]:
# Display the loaded training frame as this cell's output.
train

Unnamed: 0,Verdict,Text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...
...,...,...
48849,4,The ruling Kuomintang (KMT) has claimed owners...
48850,4,The Taipei city government has encouraged the ...
48851,4,President Ma Ying-jeou said Friday that a park...
48852,4,The families of the four people who were kille...


In [23]:
def tokenize(df):
    """Add a 'tokens' column with the NLTK word tokens of each 'Text' entry.

    The frame is modified in place; nothing is returned.
    """
    token_lists = df['Text'].apply(nltk.word_tokenize)
    df['tokens'] = token_lists

In [27]:
def get_claim_length(df):
    """Add a 'claim_length' column with the number of tokens in each row.

    Calls `tokenize(df)` first when the frame has no 'tokens' column.
    The frame is modified in place; nothing is returned.
    """
    if 'tokens' not in df.columns:
        tokenize(df)
    # Series.apply forwards extra keyword arguments to the applied function, so
    # the former `axis=1` became `len(x, axis=1)` and raised TypeError.
    df['claim_length'] = df['tokens'].apply(len)

In [26]:
def get_named_entities(df):
    """Add a 'named_entities' column with NLTK's entity chunking of each row.

    Each row's tokens are POS-tagged with `nltk.pos_tag` and the tagged tokens
    are passed to `nltk.ne_chunk`; the chunker's result is stored as-is.
    Calls `tokenize(df)` first when the frame has no 'tokens' column.
    The frame is modified in place; nothing is returned.
    """
    if 'tokens' not in df.columns:
        tokenize(df)

    def chunk_entities(tokens):
        return nltk.ne_chunk(nltk.pos_tag(tokens))

    # No `axis=` here: Series.apply forwards unknown keyword arguments to the
    # applied function, which made the original call fail with TypeError.
    df['named_entities'] = df['tokens'].apply(chunk_entities)

In [25]:
def get_sentiment(df):
    """Add a 'sentiment' column with VADER's compound polarity score per row.

    The score is computed on the raw 'Text' string: VADER's `polarity_scores`
    takes a piece of text, not a list of tokens.
    The frame is modified in place; nothing is returned.
    """
    # Kept so the function still leaves a 'tokens' column on `df`, as it did
    # before, even though the score itself no longer reads it.
    if 'tokens' not in df.columns:
        tokenize(df)
    sid = vader.SentimentIntensityAnalyzer()
    # No `axis=` here: Series.apply would forward it to the lambda (TypeError).
    df['sentiment'] = df['Text'].apply(lambda text: sid.polarity_scores(text)["compound"])

In [29]:
# Coarse part-of-speech classes that get one ratio_/has_ feature each.
tags = [
    'IN', 'CC', 'JJ', 'MD', 'NN',
    'RB', 'VB', 'PR', 'UH', 'DT',
]

# Maps fine-grained tags (looked up for every tag returned by nltk.pos_tag)
# onto the coarse classes above; tags without an entry are not counted.
tag_mappings = {
    'TO': 'IN',
    'VBN': 'VB',
    'WP': 'PR',
    'UH': 'UH',
    'VBG': 'VB',
    'JJ': 'JJ',
    'VBZ': 'VB',
    'VBP': 'VB',
    'NN': 'NN',
    'DT': 'DT',
    'PRP': 'PR',
    'WP$': 'PR',
    'NNPS': 'NN',
    'PRP$': 'PR',
    'WDT': 'DT',
    'RB': 'RB',
    'RBR': 'RB',
    'RBS': 'RB',
    'VBD': 'VB',
    'IN': 'IN',
    'RP': 'IN',
    'JJR': 'JJ',
    'JJS': 'JJ',
    'PDT': 'DT',
    'MD': 'MD',
    'VB': 'VB',
    'WRB': 'RB',
    'NNP': 'NN',
    'NNS': 'NN',
    'CC': 'CC',
}

def get_pos_count(tokens):
    """Return, for each coarse tag in `tags`, its share of the given tokens.

    Tokens are tagged with `nltk.pos_tag`; each tag found in `tag_mappings` is
    counted under its coarse class, and every count is divided by the total
    number of tokens. Tags without a mapping still count towards the total.

    Returns a Counter with one entry per tag in `tags`. Every entry is 0 for
    an empty token list.
    """
    total_count = len(tokens)
    if total_count == 0:
        # Nothing to tag, and dividing by the length would raise ZeroDivisionError.
        return Counter({tag: 0 for tag in tags})

    tag_count = Counter()
    for _word, tag in nltk.pos_tag(tokens):
        if tag in tag_mappings:
            tag_count[tag_mappings[tag]] += 1

    # Normalise exactly once. The original divided each entry and then tried
    # `tag_count / total_count`, which Counter does not support (TypeError).
    return Counter({tag: tag_count[tag] / total_count for tag in tags})

def pos_tag(df):
    """Add 'ratio_<tag>' and 'has_<tag>' columns for every tag in `tags`.

    'ratio_<tag>' is the value `get_pos_count` reports for that tag on the
    row's tokens; 'has_<tag>' is 1 when that value is above zero, else 0.
    Calls `tokenize(df)` first when the frame has no 'tokens' column.
    The frame is modified in place; nothing is returned.
    """
    if 'tokens' not in df.columns:
        tokenize(df)
    # No `axis=` here: Series.apply forwards unknown keyword arguments to the
    # applied function, so `axis=1` reached get_pos_count and raised TypeError.
    tag_count = df['tokens'].apply(get_pos_count)

    for tag in tags:
        # The lambdas run immediately inside this iteration, so closing over
        # the loop variable `tag` is safe.
        df[f'ratio_{tag}'] = tag_count.apply(lambda counts: counts[tag])
        df[f'has_{tag}'] = tag_count.apply(lambda counts: 1 if counts[tag] > 0 else 0)


In [5]:
! pip install py-readability-metrics
! python -m nltk.downloader punkt

Collecting py-readability-metrics
  Downloading py_readability_metrics-1.4.5-py3-none-any.whl.metadata (8.8 kB)
Downloading py_readability_metrics-1.4.5-py3-none-any.whl (26 kB)
Installing collected packages: py-readability-metrics
Successfully installed py-readability-metrics-1.4.5


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
from readability import Readability

def get_readability(df):
    """Add a 'readability' column with the Flesch-Kincaid score of each 'Text' entry.

    The frame is modified in place; nothing is returned.

    NOTE(review): py-readability-metrics is understood to raise for very short
    texts (roughly under 100 words) — confirm, and decide how such rows should
    be handled before running this on the full data.
    """
    # No `axis=` here: Series.apply forwards unknown keyword arguments to the
    # applied function, which made the original lambda call fail with TypeError.
    df['readability'] = df['Text'].apply(lambda text: Readability(text).flesch_kincaid().score)


In [32]:
# Run every feature extractor on `train` in order, printing a marker as each
# one finishes so progress is visible.
_pipeline_steps = [
    (tokenize, 'tokenised'),
    (get_claim_length, 'length'),
    (get_named_entities, 'entities'),
    (get_sentiment, 'sentiment'),
    (pos_tag, 'pos'),
    (get_readability, 'readability'),
]
for _extract, _done_label in _pipeline_steps:
    _extract(train)
    print(_done_label)

KeyboardInterrupt: 

In [None]:
def get_features(df, features: list):
    """Collect the named columns of every row into one NumPy array per row.

    Returns the result of a row-wise `df.apply`: one array per row, holding
    that row's values for `features` in the given order.
    """
    def row_to_vector(row):
        return np.array([row[name] for name in features])

    return df.apply(row_to_vector, axis=1)

In [None]:
# Columns that make up each row's feature vector, in order: length,
# sentiment, the ten POS ratios, then the ten POS presence flags.
_feature_columns = [
    'claim_length', 'sentiment',
    'ratio_IN', 'ratio_CC', 'ratio_JJ', 'ratio_MD', 'ratio_NN',
    'ratio_RB', 'ratio_VB', 'ratio_PR', 'ratio_UH', 'ratio_DT',
    'has_IN', 'has_CC', 'has_JJ', 'has_MD', 'has_NN',
    'has_RB', 'has_VB', 'has_PR', 'has_UH', 'has_DT',
]
get_features(train, _feature_columns)