In [1]:
#import libraries
import os
import pickle
import warnings

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import RegexpTokenizer

from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report,accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn import svm
from sklearn.preprocessing import LabelEncoder

import gensim
from gensim.models.word2vec import Word2Vec

import matplotlib.pyplot as plt

import category_encoders as ce

import torch
from torchtext import data, datasets, vocab, utils
import torch.nn as nn
import torch.nn.functional as F
import torchsummary
import torch.optim as optim

warnings.filterwarnings("ignore")

In [2]:
#!pip3 install --upgrade gensim==3.8.3
#!pip3 install category_encoders

In [3]:
# Locations of the three dataset splits (tab-separated files)
trainFilePath = '../dataset/train2.tsv'
testFilePath = '../dataset/test2.tsv'
validationFilePath = '../dataset/val2.tsv'

# Column names applied to every split. Because `names=` is passed without a
# `header=` argument, pandas reads the first line of each file as data.
COLUMN_NAMES = [
    "ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
    "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt",
    "Context", "Justification",
]

df_train = pd.read_csv(trainFilePath, delimiter='\t', names=COLUMN_NAMES)
df_test = pd.read_csv(testFilePath, delimiter='\t', names=COLUMN_NAMES)
df_validation = pd.read_csv(validationFilePath, delimiter='\t', names=COLUMN_NAMES)

In [4]:
# Tag every row with the split it came from so the frames can be merged now
# and separated again later: 0 = train, 1 = test, 2 = validation.
for split_code, split_frame in enumerate((df_train, df_test, df_validation)):
    split_frame["train-test-val"] = split_code

In [5]:
# Merge the three splits into one frame with a fresh 0..n-1 index; they are
# separated again through the "train-test-val" column when training the models.
df_all = pd.concat([df_train, df_test, df_validation], ignore_index=True)

In [6]:
df_all.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Job Title,State,Party,Barely True Cnt,False Cnt,Half True Cnt,Mostly True Cnt,Pants on Fire Cnt,Context,Justification,train-test-val
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...,0
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe...",0
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...,0
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...,0
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start...",0


### Data Preprocessing

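Remove the rows that are missing the ID or any of the count columns (other fields, such as Job Title or State, may still be empty). Clean the text fields. Encode the multi-class and binary labels.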

In [7]:
def dataCleaning(df,field):
    """Normalise the text stored in column ``field`` of ``df``.

    The column is rewritten in place on ``df`` and ``df`` is returned, so both
    ``dataCleaning(df, f)`` and ``df = dataCleaning(df, f)`` work.

    Steps, in order:
      1. remove URLs (``http`` followed by non-space characters),
      2. remove @mentions (``@`` followed by non-space characters),
      3. turn any remaining lone ``@`` into the word "at",
      4. replace every character that is not an ASCII letter or digit by a space,
      5. remove any leftover literal "http",
      6. lower-case the result.

    Missing values (NaN/None) are left as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean.
    field : str
        Name of a text column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``df[field]`` cleaned.
    """
    text = df[field]
    # The URL and @ rules have to run while the punctuation is still present;
    # once step 4 has replaced ':', '/', '.' and '@' by spaces they can no
    # longer match and URL fragments would be left in the text.
    # regex=True is passed explicitly because the default of Series.str.replace
    # differs between pandas versions.
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"@", "at", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9]", " ", regex=True)
    text = text.str.replace(r"http", "", regex=True)
    df[field] = text.str.lower()
    return df

def dataPreprocessing(df):
    """Filter, clean and label the merged dataset.

    * Drops rows whose ``ID`` or any of the five count columns is missing.
    * Keeps only the part of ``ID`` before the first "." (e.g. "2635.json" -> "2635").
    * Cleans every text column with ``dataCleaning``.
    * Adds ``Multi Class Label`` (integer code of ``Label`` from a
      ``LabelEncoder``) and prints the label -> code mapping.
    * Adds ``Binary Label``: 1 for 'false', 'pants-fire' and 'barely-true',
      0 for every other label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns used below; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, processed frame. The index of the input is kept, so it may
        contain gaps where rows were dropped.
    """
    required_columns = ['ID', 'Barely True Cnt', 'False Cnt', 'Mostly True Cnt',
                        'Pants on Fire Cnt', 'Half True Cnt']
    text_columns = ('Statement', 'Subject', 'Speaker', 'Job Title',
                    'State', 'Party', 'Context', 'Justification')

    # Work on an explicit copy: the columns assigned below must not be written
    # onto a filtered view of the caller's frame.
    df = df.dropna(subset=required_columns).copy()

    # split(...) returns a list per row; .str[0] takes the piece before the
    # first ".". (expand=True would return a two-column frame, which cannot be
    # assigned to the single 'ID' column.)
    df['ID'] = df['ID'].str.split(".", n = 1).str[0]

    for column in text_columns:
        df = dataCleaning(df, column)

    le_multi = LabelEncoder()
    df['Multi Class Label'] = le_multi.fit_transform(df.Label)
    print("Label assignments: " + str({l: i for i, l in enumerate(le_multi.classes_)}))   

    df['Binary Label'] = df['Label'].isin(['false','pants-fire','barely-true']).astype(int)

    return df

In [8]:
df_all = dataPreprocessing(df_all)

Label assignments: {'barely-true': 0, 'false': 1, 'half-true': 2, 'mostly-true': 3, 'pants-fire': 4, 'true': 5}


In [9]:
df_all.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Job Title,State,Party,Barely True Cnt,False Cnt,Half True Cnt,Mostly True Cnt,Pants on Fire Cnt,Context,Justification,train-test-val,Multi Class Label,Binary Label
0,2635,false,says the annies list political group supports ...,abortion,dwayne bohac,state representative,texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,that s a premise that he fails to back up ann...,0,1,1
1,10540,half-true,when did the decline of coal start it started...,energy history job accomplishments,scott surovell,state delegate,virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech,surovell said the decline of coal started whe...,0,2,0
2,324,mostly-true,hillary clinton agrees with john mccain by vo...,foreign policy,barack obama,president,illinois,democrat,70.0,71.0,160.0,163.0,9.0,denver,obama said he would have voted against the ame...,0,3,0
3,1123,false,health care reform legislation is likely to ma...,health care,blog posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,the release may have a point that mikulskis co...,0,1,1
4,9028,half-true,the economic turnaround started at the end of ...,economy jobs,charlie crist,,florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on cnn,crist said that the economic turnaround start...,0,2,0


In [10]:
# Extract out the required fields.
# .copy() makes `df` an independent frame, so adding the "Unigrams" column to
# it afterwards does not write into a slice of df_all.
df = df_all[['Statement','train-test-val']].copy()

In [11]:
# Transform each Statement into a list of unigram tokens (runs of \w characters)
tokenizer = RegexpTokenizer(r'\w+')
df["Unigrams"] = df["Statement"].map(tokenizer.tokenize)

In [12]:
# UNCOMMENT to download pretrained word2vec 

# import gensim.downloader as api
# path = api.load("word2vec-google-news-300", return_path=True)
# print(path)

In [13]:
# Location of the pretrained Google News word2vec vectors.
# Both paths previously hard-coded here had the form
# <home>/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz, so the
# path is now built from the current user's home directory. Set the
# WORD2VEC_PATH environment variable to load the file from somewhere else.
word2vec_path = os.environ.get(
    'WORD2VEC_PATH',
    os.path.join(os.path.expanduser('~'), 'gensim-data',
                 'word2vec-google-news-300', 'word2vec-google-news-300.gz'),
)
embedding_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [14]:
#Convert word representation using word embeddings
def get_word2vec(unigrams, generate_missing=False, k=300, model=None):
    """Return the average word vector of a token list.

    Parameters
    ----------
    unigrams : sequence of str
        Tokens of one document.
    generate_missing : bool, default False
        How to represent tokens that are not in the embedding: ``True`` uses a
        fresh ``np.random.rand(k)`` vector per token (no seed is set here, so
        results then vary between runs); ``False`` uses a zero vector.
    k : int, default 300
        Vector length used for the zero/random fallback vectors and for the
        result of an empty token list. It must equal the length of the
        embedding's vectors, otherwise the vectors cannot be summed.
    model : mapping-like, optional
        Anything supporting ``word in model`` and ``model[word]``. When
        omitted, the notebook-level ``embedding_model`` is used, which is the
        original behaviour.

    Returns
    -------
    numpy.ndarray
        Vector of length ``k``: the element-wise mean over all tokens (fallback
        vectors included), or all zeros when ``unigrams`` is empty.
    """
    if len(unigrams)<1:
        return np.zeros(k)
    if model is None:
        # Fall back to the globally loaded embedding only when a lookup is needed
        model = embedding_model
    if generate_missing:
        make_missing = lambda: np.random.rand(k)
    else:
        make_missing = lambda: np.zeros(k)
    vectorized = [model[word] if word in model else make_missing() for word in unigrams]
    summed = np.sum(vectorized, axis=0)
    # Out-of-vocabulary tokens count towards the divisor as well
    return np.divide(summed, len(vectorized))

def get_word2vec_embeddings(df, generate_missing=False):
    """Compute one averaged word vector per row of ``df``.

    Each entry of ``df['Unigrams']`` is passed to ``get_word2vec`` together
    with ``generate_missing``; the results are returned as a list in row order.
    """
    return [
        get_word2vec(tokens, generate_missing=generate_missing)
        for tokens in df['Unigrams']
    ]

In [15]:
df_embedded_words = pd.DataFrame.from_records(get_word2vec_embeddings(df))

In [16]:
# Copy the split flag over by position: to_numpy() strips the index of `df`,
# so the values are assigned row by row instead of being aligned on index labels.
df_embedded_words["train-test-val"] = df["train-test-val"].to_numpy()

In [17]:
df_embedded_words.shape

(12789, 301)

### Binary Classification

In [18]:
# Divide the dataset into training, validation and testing parts
# Also, divide into input features and target labels
# (split codes in 'train-test-val': 0 = train, 1 = test, 2 = validation)

df_y = df_all[['Binary Label','train-test-val']]

# drop(columns=...) returns a new frame, so the split column is removed
# without mutating a filtered slice in place.
x_train = df_embedded_words[df_embedded_words['train-test-val']==0].drop(columns=['train-test-val'])
x_test = df_embedded_words[df_embedded_words['train-test-val']==1].drop(columns=['train-test-val'])
x_val = df_embedded_words[df_embedded_words['train-test-val']==2].drop(columns=['train-test-val'])

# Targets stay single-column DataFrames, as before.
y_train = df_y[df_y['train-test-val']==0].drop(columns=['train-test-val'])
y_test = df_y[df_y['train-test-val']==1].drop(columns=['train-test-val'])
y_val = df_y[df_y['train-test-val']==2].drop(columns=['train-test-val'])

In [19]:
def printResults(y_test,y_predict):
    """Print the classification report, confusion matrix and accuracy.

    ``y_test`` holds the true labels and ``y_predict`` the predicted ones; both
    are passed unchanged to the scikit-learn metric functions. Nothing is
    returned.
    """
    sections = (
        (None, classification_report),
        ("Confusion Matrix", confusion_matrix),
        ("\n Accuracy", accuracy_score),
    )
    for heading, metric in sections:
        # The classification report is printed without a heading
        if heading is not None:
            print(heading)
        print(metric(y_test, y_predict))

#### Logistic Regression

In [20]:
# Class-weighted logistic regression on the averaged word2vec features
model = LogisticRegression(class_weight='balanced', solver='newton-cg',
                           multi_class='multinomial', random_state=30, C=1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target,
# so it is flattened explicitly instead of relying on an implicit conversion.
model.fit(x_train, y_train.to_numpy().ravel())
y_pred = model.predict(x_test)
# Results on testing data
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.65      0.61      0.63       714
           1       0.54      0.59      0.56       553

    accuracy                           0.60      1267
   macro avg       0.59      0.60      0.59      1267
weighted avg       0.60      0.60      0.60      1267

Confusion Matrix
[[433 281]
 [229 324]]

 Accuracy
0.5974743488555643


In [21]:
# Results on validation data
# (uses the logistic-regression `model` fitted above; `y_pred` is overwritten)
y_pred = model.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.65      0.60      0.62       668
           1       0.60      0.64      0.62       616

    accuracy                           0.62      1284
   macro avg       0.62      0.62      0.62      1284
weighted avg       0.62      0.62      0.62      1284

Confusion Matrix
[[403 265]
 [221 395]]

 Accuracy
0.6214953271028038


#### SVM-rbf

In [22]:
# Support vector classifier with an RBF kernel
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target,
# so it is flattened explicitly instead of relying on an implicit conversion.
rbf.fit(x_train, y_train.to_numpy().ravel())
y_pred = rbf.predict(x_test)

# Results on testing data
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.59      0.90      0.71       714
           1       0.60      0.20      0.29       553

    accuracy                           0.59      1267
   macro avg       0.60      0.55      0.50      1267
weighted avg       0.59      0.59      0.53      1267

Confusion Matrix
[[642  72]
 [445 108]]

 Accuracy
0.5919494869771112


In [23]:
# Results on validation data
# (uses the RBF-kernel `rbf` classifier fitted above; `y_pred` is overwritten)
y_pred = rbf.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.56      0.91      0.69       668
           1       0.69      0.22      0.33       616

    accuracy                           0.58      1284
   macro avg       0.62      0.56      0.51      1284
weighted avg       0.62      0.58      0.52      1284

Confusion Matrix
[[607  61]
 [480 136]]

 Accuracy
0.5786604361370716


#### SVM-poly

In [24]:
# Support vector classifier with a degree-3 polynomial kernel
poly = svm.SVC(kernel='poly', degree=3, C=1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target,
# so it is flattened explicitly instead of relying on an implicit conversion.
poly.fit(x_train, y_train.to_numpy().ravel())

# Results on testing data
y_pred = poly.predict(x_test)
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.61      0.84      0.71       714
           1       0.60      0.31      0.41       553

    accuracy                           0.61      1267
   macro avg       0.60      0.57      0.56      1267
weighted avg       0.60      0.61      0.58      1267

Confusion Matrix
[[597 117]
 [381 172]]

 Accuracy
0.6069455406471981


In [25]:
# Results on validation data
# (uses the polynomial-kernel `poly` classifier fitted above; `y_pred` is overwritten)

y_pred = poly.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.59      0.86      0.70       668
           1       0.71      0.36      0.48       616

    accuracy                           0.62      1284
   macro avg       0.65      0.61      0.59      1284
weighted avg       0.65      0.62      0.59      1284

Confusion Matrix
[[576  92]
 [394 222]]

 Accuracy
0.6214953271028038


### Multi class classification

In [26]:
# Divide the dataset into training, validation and testing parts
# Also, divide into input features and target labels
# (split codes in 'train-test-val': 0 = train, 1 = test, 2 = validation)

df_y = df_all[['Multi Class Label','train-test-val']]

# drop(columns=...) returns a new frame, so the split column is removed
# without mutating a filtered slice in place.
x_train = df_embedded_words[df_embedded_words['train-test-val']==0].drop(columns=['train-test-val'])
x_test = df_embedded_words[df_embedded_words['train-test-val']==1].drop(columns=['train-test-val'])
x_val = df_embedded_words[df_embedded_words['train-test-val']==2].drop(columns=['train-test-val'])

# Targets stay single-column DataFrames, as before.
y_train = df_y[df_y['train-test-val']==0].drop(columns=['train-test-val'])
y_test = df_y[df_y['train-test-val']==1].drop(columns=['train-test-val'])
y_val = df_y[df_y['train-test-val']==2].drop(columns=['train-test-val'])

#### Logistic Regression

In [27]:
# Class-weighted multinomial logistic regression on the six-class labels
model = LogisticRegression(class_weight='balanced', solver='newton-cg',
                           multi_class='multinomial', random_state=30, C=1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target,
# so it is flattened explicitly instead of relying on an implicit conversion.
model.fit(x_train, y_train.to_numpy().ravel())
y_pred = model.predict(x_test)
# Results on testing data
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.17      0.17      0.17       212
           1       0.26      0.18      0.21       249
           2       0.28      0.20      0.24       265
           3       0.27      0.29      0.28       241
           4       0.15      0.40      0.22        92
           5       0.27      0.25      0.26       208

    accuracy                           0.23      1267
   macro avg       0.23      0.25      0.23      1267
weighted avg       0.25      0.23      0.23      1267

Confusion Matrix
[[36 30 32 33 51 30]
 [50 45 27 44 57 26]
 [50 39 54 58 35 29]
 [33 23 42 71 27 45]
 [17  9 10  7 37 12]
 [23 30 25 46 32 52]]

 Accuracy
0.23283346487766376


In [28]:
# Results on validation data
# (uses the logistic-regression `model` fitted above; `y_pred` is overwritten)
y_pred = model.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.23      0.19      0.21       237
           1       0.19      0.14      0.16       263
           2       0.27      0.17      0.21       248
           3       0.26      0.24      0.25       251
           4       0.19      0.45      0.27       116
           5       0.23      0.33      0.27       169

    accuracy                           0.23      1284
   macro avg       0.23      0.25      0.23      1284
weighted avg       0.23      0.23      0.22      1284

Confusion Matrix
[[45 40 29 44 48 31]
 [47 36 31 30 83 36]
 [38 45 43 53 34 35]
 [37 38 25 59 30 62]
 [18 19  3  6 52 18]
 [13 14 27 37 23 55]]

 Accuracy
0.22585669781931464


#### SVM-rbf

In [29]:
# Support vector classifier with an RBF kernel on the six-class labels
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target,
# so it is flattened explicitly instead of relying on an implicit conversion.
rbf.fit(x_train, y_train.to_numpy().ravel())
y_pred = rbf.predict(x_test)

# Results on testing data
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       212
           1       0.24      0.39      0.30       249
           2       0.22      0.65      0.32       265
           3       0.29      0.08      0.13       241
           4       0.00      0.00      0.00        92
           5       0.00      0.00      0.00       208

    accuracy                           0.23      1267
   macro avg       0.13      0.19      0.13      1267
weighted avg       0.15      0.23      0.15      1267

Confusion Matrix
[[  0  72 133   7   0   0]
 [  0  97 145   7   0   0]
 [  0  83 172  10   0   0]
 [  0  48 173  20   0   0]
 [  0  45  43   4   0   0]
 [  0  57 131  20   0   0]]

 Accuracy
0.2280978689818469


In [30]:
# Results on validation data
# (uses the RBF-kernel `rbf` classifier fitted above; `y_pred` is overwritten)
y_pred = rbf.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       237
           1       0.28      0.43      0.34       263
           2       0.21      0.66      0.31       248
           3       0.31      0.10      0.16       251
           4       0.00      0.00      0.00       116
           5       0.00      0.00      0.00       169

    accuracy                           0.24      1284
   macro avg       0.13      0.20      0.13      1284
weighted avg       0.16      0.24      0.16      1284

Confusion Matrix
[[  0  73 156   8   0   0]
 [  0 112 139  12   0   0]
 [  0  67 164  17   0   0]
 [  0  58 167  26   0   0]
 [  0  59  55   2   0   0]
 [  0  36 115  18   0   0]]

 Accuracy
0.235202492211838


#### SVM-poly

In [31]:
# Support vector classifier with a degree-3 polynomial kernel on the six-class labels
poly = svm.SVC(kernel='poly', degree=3, C=1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target,
# so it is flattened explicitly instead of relying on an implicit conversion.
poly.fit(x_train, y_train.to_numpy().ravel())

# Results on testing data
y_pred = poly.predict(x_test)
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.26      0.13      0.18       212
           1       0.28      0.28      0.28       249
           2       0.24      0.48      0.32       265
           3       0.26      0.32      0.29       241
           4       0.18      0.04      0.07        92
           5       0.21      0.07      0.11       208

    accuracy                           0.25      1267
   macro avg       0.24      0.22      0.21      1267
weighted avg       0.25      0.25      0.23      1267

Confusion Matrix
[[ 28  44  91  35   3  11]
 [ 24  70  85  53   5  12]
 [ 25  46 127  56   4   7]
 [  6  25 108  77   3  22]
 [ 11  27  33  12   4   5]
 [ 12  40  78  60   3  15]]

 Accuracy
0.2533543804262036


In [32]:
# Results on validation data
# (uses the polynomial-kernel `poly` classifier fitted above; `y_pred` is overwritten)

y_pred = poly.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.22      0.09      0.13       237
           1       0.30      0.29      0.29       263
           2       0.23      0.52      0.32       248
           3       0.23      0.24      0.24       251
           4       0.27      0.06      0.10       116
           5       0.23      0.11      0.15       169

    accuracy                           0.24      1284
   macro avg       0.25      0.22      0.21      1284
weighted avg       0.25      0.24      0.22      1284

Confusion Matrix
[[ 22  45 112  41   5  12]
 [ 28  76  94  40   9  16]
 [ 17  36 129  55   0  11]
 [ 12  45 110  61   1  22]
 [ 12  38  45  10   7   4]
 [ 10  17  66  53   4  19]]

 Accuracy
0.24454828660436137
