In [10]:
import re
import nltk
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from fast_ml.model_development import train_valid_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shivamarora/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

remove duplicates \
drop rows with null values \
convert to string and then unicode \
one hot encoder \
tokenization \
remove stop words \
position tagging \
filter out verbs adjectives nouns \
lemmatization (or stemming) \
test train valid split


how to consider multiple columns as single document for tfidf? one approach - ignore numerical values and just append all text together into a single column \
what is the numerical data? is it important for classification or should we get rid of it \
what pipeline are they talking about - explore results with and without pipeline

In [11]:
# Reading Data and integrating into one dataframe so we can implement preprocessing and split later

df = pd.read_csv('../data/train.tsv', sep='\t')
df1 = pd.read_csv('../data/test.tsv', sep='\t')
df2 = pd.read_csv('../data/valid.tsv', sep='\t')
columns = ["ID","label","statement","subject","speaker","jobtitle","state","party","barely","false","half","mostly","pantsfire","context"]

df = pd.DataFrame(np.vstack([df.columns, df]))
df1 = pd.DataFrame(np.vstack([df1.columns, df1]))
df2 = pd.DataFrame(np.vstack([df2.columns, df2]))

df.columns = columns
df1.columns = columns
df2.columns = columns

df = df.append(df1, ignore_index = True)
df = df.append(df2, ignore_index = True)

df = df.drop_duplicates()
df = df.dropna()
df = df.reset_index(drop=True)

df['label'] = df['label'].map({'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true' : 3, 'mostly-true' : 4, 'true' : 5})


#df['statement'] = df['statement'] + " " + df['context']
#df = df.drop('context', axis=1)


#labelencoder = LabelEncoder()
#df['speaker'] = labelencoder.fit_transform(df['speaker'])
#df['subject'] = labelencoder.fit_transform(df['subject'])
#df['jobtitle'] = labelencoder.fit_transform(df['jobtitle'])
#df['state'] = labelencoder.fit_transform(df['state'])
#df['party'] = labelencoder.fit_transform(df['party'])
#df['context'] = labelencoder.fit_transform(df['context'])

In [12]:
# Remove punctuation, convert to lowercase, remove stopwords, filter nouns adjectives verbs, stemming

def preprocess(df, col):
    porter = PorterStemmer()
    filterpos = ["JJ","JJR","JJS","NN","NNS","NNP","NNPS","POS","VB","VBD","VBG","VBN","VBP","VBZ","FW"]
    df.loc[:, col] = df[col].apply(lambda x : str.lower(x))
    df.loc[:,col] = df[col].apply(lambda x : " ".join(re.findall('[\w]+',x)))
    stop = stopwords.words('english')
    df[col] = df[col].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    df[col] = df[col].apply(lambda x: ' '.join([word for word in x.split() if (nltk.pos_tag([word]))[0][1] in filterpos]))
    df[col] = df[col].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))
    return df


In [13]:
df.head()

Unnamed: 0,ID,label,statement,subject,speaker,jobtitle,state,party,barely,false,half,mostly,pantsfire,context
0,2635.json,1,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer
1,10540.json,3,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1.0,1.0,0.0,a floor speech.
2,324.json,4,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160.0,163.0,9.0,Denver
3,12465.json,5,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2.0,5.0,1.0,a an online opinion-piece
4,153.json,3,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70,71,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."


In [16]:
df

Unnamed: 0,ID,label,statement,subject,speaker,jobtitle,state,party,barely,false,half,mostly,pantsfire,context
0,2635.json,1,say anni list polit group support third trimes...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer
1,10540.json,3,declin coal start start natur ga took start be...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
2,324.json,4,hillari clinton agre john mccain vote give geo...,foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
3,12465.json,5,chicago bear start quarterback last year total...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2,5,1,a an online opinion-piece
4,153.json,3,person stage work last year pass russ feingold...,ethics,barack-obama,President,Illinois,democrat,70,71,160,163,9,"a Democratic debate in Philadelphia, Pa."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8433,7013.json,2,say u rep charl bass want privat social secur,social-security,ann-mclane-kuster,Attorney,New Hampshire,democrat,2,1,3,0,0,"an ad, “Janice,” released September 6, 2012"
8434,2661.json,0,past year democrat spent money countri spent l...,"federal-budget,history",eric-cantor,House Majority Leader,Virginia,republican,9,6,4,4,4,an interview on Comedy Central's Daily Show wi...
8435,3419.json,3,time decad import account half oil consum,"energy,oil-spill,trade",barack-obama,President,Illinois,democrat,70,71,160,163,9,a press conference
8436,12548.json,4,say donald trump bankrupt compani time,candidates-biography,hillary-clinton,Presidential candidate,New York,democrat,40,29,69,76,7,a speech on the economy


In [14]:
df = preprocess(df, 'statement')

In [15]:
df.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,jobtitle,state,party,barely,false,half,mostly,pantsfire,context
0,2635.json,1,say anni list polit group support third trimes...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer
1,10540.json,3,declin coal start start natur ga took start be...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1.0,1.0,0.0,a floor speech.
2,324.json,4,hillari clinton agre john mccain vote give geo...,foreign-policy,barack-obama,President,Illinois,democrat,70,71,160.0,163.0,9.0,Denver
3,12465.json,5,chicago bear start quarterback last year total...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2.0,5.0,1.0,a an online opinion-piece
4,153.json,3,person stage work last year pass russ feingold...,ethics,barack-obama,President,Illinois,democrat,70,71,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."


In [8]:
df['statement'][0]

'says annies list political group supports third trimester abortions demand'

In [13]:
df = preprocess(df, 'subject')
df = preprocess(df, 'speaker')
df = preprocess(df, 'jobtitle')
df = preprocess(df, 'state')
df = preprocess(df, 'party')
df = preprocess(df, 'context')

In [14]:
df.head()

Unnamed: 0,ID,label,statement,subject,speaker,jobtitle,state,party,barely,false,half,mostly,pantsfire,context
0,2635.json,1,say anni list polit group support third trimes...,abort,dwayn bohac,state repres,texa,republican,0,1,0.1,0.2,0.3,mailer
1,10540.json,3,declin coal start start natur ga took start be...,energi histori job accomplish,scott surovel,state deleg,virginia,democrat,0,0,1.0,1.0,0.0,floor speech
2,324.json,4,hillari clinton agre john mccain vote give geo...,foreign polici,barack obama,presid,illinoi,democrat,70,71,160.0,163.0,9.0,denver
3,12465.json,5,chicago bear start quarterback last year total...,educ,robin vo,wisconsin speaker,wisconsin,republican,0,3,2.0,5.0,1.0,onlin opinion piec
4,153.json,3,person stage work last year pass russ feingold...,ethic,barack obama,presid,illinoi,democrat,70,71,160.0,163.0,9.0,democrat debat philadelphia pa


In [16]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(df, target = 'label', train_size=0.8, valid_size=0.1, test_size=0.1)

In [17]:
print(y_test.value_counts())

1    187
3    185
5    146
4    145
2    133
0     48
Name: label, dtype: int64


In [18]:
def identity_tokenizer(text):
    return text
tfidf_vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False, max_features=3500)
tfidf_vectorizer.fit(df['statement'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=False, max_df=1.0, max_features=3500,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function identity_tokenizer at 0x00000214A54BB558>,
                use_idf=True, vocabulary=None)

In [19]:
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train['statement'])
tfidf_valid_vectors = tfidf_vectorizer.fit_transform(X_valid['statement'])
tfidf_test_vectors = tfidf_vectorizer.fit_transform(X_test['statement'])

In [20]:
tfidf_train_vectors

<6750x36 sparse matrix of type '<class 'numpy.float64'>'
	with 108940 stored elements in Compressed Sparse Row format>

In [21]:
tfidf_test_vectors

<844x30 sparse matrix of type '<class 'numpy.float64'>'
	with 13561 stored elements in Compressed Sparse Row format>

In [22]:
tfidf_valid_vectors

<844x32 sparse matrix of type '<class 'numpy.float64'>'
	with 13615 stored elements in Compressed Sparse Row format>

In [23]:
transformer = FeatureUnion([
                ('statement_tfidf', 
                  Pipeline([('extract_field',
                              FunctionTransformer(lambda x: x['statement'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('subject_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['subject'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('speaker_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['speaker'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('jobtitle_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['jobtitle'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('state_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['state'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('party_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['party'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('context_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['context'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())]))

])

In [24]:
tfidf_train_vectors = transformer.fit(X_train)
Xt = transformer.transform(X_train)

In [25]:
tfidf_test_vectors = transformer.fit(X_test)
transformer.transform(X_test)

<844x3815 sparse matrix of type '<class 'numpy.float64'>'
	with 17705 stored elements in Compressed Sparse Row format>

In [26]:
tfidf_valid_vectors = transformer.fit(X_valid)
transformer.transform(X_valid)

<844x3962 sparse matrix of type '<class 'numpy.float64'>'
	with 17765 stored elements in Compressed Sparse Row format>

In [27]:
Xt

<6750x10531 sparse matrix of type '<class 'numpy.float64'>'
	with 142787 stored elements in Compressed Sparse Row format>

In [28]:
Xt_df_sparse = pd.DataFrame.sparse.from_spmatrix(Xt)

In [29]:
classifier = XGBClassifier()
print("fitting classifier")
classifier.fit(Xt_df_sparse, y_train)
print("classifier fit")
yt_proba = classifier.predict_proba(Xt_df_sparse)

fitting classifier
classifier fit


In [30]:
print(yt_proba)

[[0.02302412 0.19292921 0.20823325 0.16269194 0.2540677  0.15905383]
 [0.04053952 0.07868341 0.4150491  0.15522408 0.20587987 0.10462398]
 [0.02194933 0.07161867 0.05847688 0.6051747  0.1315207  0.11125972]
 ...
 [0.04736154 0.09333981 0.29707366 0.26894635 0.1840483  0.10923032]
 [0.04038815 0.11033293 0.5937673  0.0872489  0.10282087 0.06544182]
 [0.0528703  0.13778262 0.069204   0.06464471 0.09895712 0.57654124]]


In [31]:
y_pred = np.argmax(yt_proba, axis=1)

In [42]:
y_pred

array([4, 2, 3, ..., 2, 2, 5], dtype=int64)

In [44]:
y_true = np.array(y_train.values)
y_true

array([2, 2, 3, ..., 2, 2, 5], dtype=int64)

In [45]:
# print accuracy
print("Accuracy: ", accuracy_score(y_true, y_pred))

# print precision, recall, F1-score per each class/tag
print(classification_report(y_true, y_pred))

# print confusion matrix, check documentation for sorting rows/columns
print(confusion_matrix(y_true, y_pred))

Accuracy:  0.8121481481481482
              precision    recall  f1-score   support

           0       0.99      0.70      0.82       433
           1       0.77      0.84      0.80      1292
           2       0.90      0.73      0.81      1059
           3       0.82      0.84      0.83      1420
           4       0.75      0.87      0.81      1407
           5       0.83      0.80      0.81      1139

    accuracy                           0.81      6750
   macro avg       0.84      0.80      0.81      6750
weighted avg       0.82      0.81      0.81      6750

[[ 302   44    9   28   28   22]
 [   1 1082   20   66   74   49]
 [   1   88  773   62   99   36]
 [   0   69   19 1193   96   43]
 [   2   64   23   59 1222   37]
 [   0   60   12   53  104  910]]
