In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import RegexpTokenizer

from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report,accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn import svm

import gensim
from gensim.models.word2vec import Word2Vec

import matplotlib.pyplot as plt

import category_encoders as ce

warnings.filterwarnings("ignore")

In [2]:
#!pip3 install --upgrade gensim==3.8.3
#!pip3 install category_encoders

In [3]:
# Locations of the three pre-split dataset files (tab-separated, no header row).
trainFilePath = 'dataset/train2.tsv'
testFilePath = 'dataset/test2.tsv'
validationFilePath = 'dataset/val2.tsv'

# Column names applied to every split. Declared once so the three frames cannot
# drift apart if a column is ever renamed or added.
DATASET_COLUMNS = ["ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
                   "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt",
                   "Context", "Justification"]


def _load_split(path):
    """Read one tab-separated split file and label its columns with DATASET_COLUMNS."""
    return pd.read_csv(path, delimiter='\t', names=DATASET_COLUMNS)


df_train = _load_split(trainFilePath)
df_test = _load_split(testFilePath)
df_validation = _load_split(validationFilePath)

In [4]:
# Tag every frame with a numeric split code (0 = train, 1 = test, 2 = validation)
# so the rows can be told apart again once the frames are concatenated.
for split_code, split_frame in enumerate((df_train, df_test, df_validation)):
    split_frame["train-test-val"] = split_code

In [5]:
# Stack the three splits into one frame with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test, df_validation], ignore_index=True)

In [6]:
def dataCleaning(df,field):
    """Normalise the text in column ``field`` of ``df`` in place and return ``df``.

    Steps, in order:
      1. remove URLs (``http`` followed by non-space characters) and any
         leftover ``http`` fragment,
      2. remove ``@mentions``; a remaining lone ``@`` becomes the word ``at``,
      3. replace every character that is not an ASCII letter or digit with a space,
      4. lower-case the result.

    URL and ``@`` handling run *before* punctuation is stripped; once ``:``, ``/``
    and ``@`` have been turned into spaces those patterns can no longer match.
    Every pattern is passed with ``regex=True`` explicitly because the default of
    ``Series.str.replace`` differs between pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean; the column is overwritten.
    field : str
        Name of a string column. Missing values stay missing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    text = df[field]

    # URLs first, while their punctuation is still intact.
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=True)

    # Mentions next; a lone "@" (followed by whitespace) is read as "at".
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"@", "at", regex=True)

    # Everything that is not a letter or digit (punctuation, quotes, underscores,
    # newlines, ...) becomes a space.
    text = text.str.replace(r"[^A-Za-z0-9]", " ", regex=True)

    df[field] = text.str.lower()
    return df

def dataPreprocessing(df):
    """Drop incomplete rows, shorten the ID and clean every text column.

    Rows with a missing ``ID`` or a missing value in any of the five count
    columns are removed. ``ID`` is then cut at its first ``"."`` and each text
    column is passed through ``dataCleaning``.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined dataset with the columns referenced below. It is not modified;
        the function works on a filtered copy.

    Returns
    -------
    pandas.DataFrame
        The filtered, cleaned copy. The index is not reset, so it keeps gaps
        where rows were dropped.
    """
    required_columns = ['ID', 'Barely True Cnt', 'False Cnt', 'Mostly True Cnt',
                        'Pants on Fire Cnt', 'Half True Cnt']
    text_columns = ('Statement', 'Subject', 'Speaker', 'Job Title',
                    'State', 'Party', 'Context', 'Justification')

    # Explicit copy: the columns are overwritten below, and writing into a bare
    # filtered selection is what triggers pandas' SettingWithCopyWarning.
    df = df.dropna(subset=required_columns).copy()

    # Keep only the text before the first "." (presumably IDs look like
    # "<number>.json" - confirm against the data). `.str[0]` always yields one
    # Series; `expand=True` returned a DataFrame, and assigning a multi-column
    # frame to the single ID column raises on recent pandas versions.
    df['ID'] = df['ID'].str.split(".", n = 1).str[0]

    for field in text_columns:
        df = dataCleaning(df, field)

    return df

In [7]:
# Filter and clean the combined frame. NOTE: this rebinds df_all, so re-running
# the cell cleans the already-cleaned frame rather than the raw concatenation.
df_all = dataPreprocessing(df_all)

In [8]:
# Targets: the label plus the split flag, which is needed later to separate the splits.
df_y = df_all[['Label','train-test-val']]

# Integer code for each label name, from 0 ('pants-fire') up to 5 ('true').
label_codes = {'pants-fire':0,'false':1,'barely-true':2,'half-true':3,'mostly-true':4,'true':5}

encoder = ce.OrdinalEncoder(
    cols=['Label'],
    return_df=True,
    mapping=[{'col':'Label', 'mapping':label_codes}],
)
df_y = encoder.fit_transform(df_y)

In [9]:
# Tally how many statements carry each encoded label. value_counts() counts in
# one vectorised pass, replacing a Python loop that indexed the frame row by row.
label_counts = df_y['Label'].value_counts()

pf = int(label_counts.get(0, 0))  # pants-fire
f = int(label_counts.get(1, 0))   # false
bt = int(label_counts.get(2, 0))  # barely-true
ht = int(label_counts.get(3, 0))  # half-true
mt = int(label_counts.get(4, 0))  # mostly-true
t = int(label_counts.get(5, 0))   # true
# Everything else: rows whose label is none of the six codes, including
# missing values (which value_counts() leaves out).
o = len(df_y) - (pf + f + bt + ht + mt + t)

print("PANTS: "+str(pf))
print("F: "+str(f))
print("BT: "+str(bt))
print("HT: "+str(ht))
print("MT: "+str(mt))
print("T: "+str(t))
print("OTHER: "+str(o))

PANTS: 1047
F: 2505
BT: 2103
HT: 2627
MT: 2454
T: 2053
OTHER: 0


In [10]:
# Work on an explicit copy: a "Unigrams" column is added to this frame next, and
# writing into a bare column selection of df_all triggers SettingWithCopyWarning.
df = df_all[['Label','Statement','train-test-val']].copy()

In [11]:
# Transform each Statement into a list of unigram tokens (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
df["Unigrams"] = df["Statement"].map(tokenizer.tokenize)

In [12]:
# Build the vocabulary: every distinct token seen in any statement, sorted.
allUnigrams = [token for tokens in df['Unigrams'] for token in tokens]
vocabulary = sorted(set(allUnigrams))
print("Vocabulary Size: "+str(len(vocabulary)))

Vocabulary Size: 13572


In [13]:
# UNCOMMENT to download pretrained word2vec 

# import gensim.downloader as api
# path = api.load("word2vec-google-news-300", return_path=True)
# print(path)

In [14]:
# Locate the pretrained vectors in gensim's download directory instead of a
# user-specific absolute path. gensim.downloader stores models under
# $GENSIM_DATA_DIR, defaulting to ~/gensim-data, which covers both the Linux and
# the Windows location this notebook was previously run with.
gensim_data_dir = os.environ.get('GENSIM_DATA_DIR',
                                 os.path.join(os.path.expanduser('~'), 'gensim-data'))
word2vec_path = os.path.join(gensim_data_dir, 'word2vec-google-news-300',
                             'word2vec-google-news-300.gz')
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [15]:
def get_word2vec(unigrams, generate_missing=False, k=300):
    """Average the word vectors of ``unigrams`` into one vector of length ``k``.

    Vectors are looked up in the module-level ``model``. A token that ``model``
    does not contain contributes a random vector (``np.random.rand(k)``) when
    ``generate_missing`` is true and a zero vector otherwise; either way it
    still counts towards the average. An empty token list yields ``np.zeros(k)``.

    Parameters
    ----------
    unigrams : sequence of str
        Tokens of one statement.
    generate_missing : bool, default False
        Use random instead of zero vectors for tokens missing from ``model``.
    k : int, default 300
        Length of the placeholder vectors; assumed to equal the length of the
        vectors stored in ``model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-token vectors.
    """
    if len(unigrams) < 1:
        return np.zeros(k)

    # Factory for the placeholder used when a token has no vector.
    make_placeholder = np.random.rand if generate_missing else np.zeros

    token_vectors = []
    for token in unigrams:
        if token in model:
            token_vectors.append(model[token])
        else:
            token_vectors.append(make_placeholder(k))

    total = np.sum(token_vectors, axis=0)
    return np.divide(total, len(token_vectors))

def get_word2vec_embeddings(df, generate_missing=False):
    """Embed every row of ``df['Unigrams']`` with ``get_word2vec``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Unigrams`` column holding one token list per row.
    generate_missing : bool, default False
        Forwarded unchanged to ``get_word2vec``.

    Returns
    -------
    list
        One embedding per row, in row order. As a side effect the type of the
        intermediate pandas object is printed.
    """
    def _embed(tokens):
        return get_word2vec(tokens, generate_missing=generate_missing)

    embeddings = df['Unigrams'].apply(_embed)
    print(type(embeddings))
    return list(embeddings)

In [16]:
# One averaged word2vec vector per statement; tokens without a vector count as zeros
# (generate_missing is left at its default of False).
embeddings = get_word2vec_embeddings(df)

<class 'pandas.core.series.Series'>


In [17]:
# One row per statement, one column per embedding dimension.
df_embedded_words = pd.DataFrame.from_records(embeddings) 

In [18]:
# Attach the split flag by position: .to_numpy() drops df's index, so pandas does not
# try to align it with df_embedded_words' own freshly numbered index.
df_embedded_words["train-test-val"] = df["train-test-val"].to_numpy()

In [19]:
# Sanity check: (number of statements, embedding dimensions + 1 split-flag column).
df_embedded_words.shape

(12789, 301)

In [20]:
def _select_split(frame, split_code):
    """Return the rows of ``frame`` whose 'train-test-val' flag equals ``split_code``,
    without the flag column.

    ``drop`` returns a new frame, so nothing is modified in place on a filtered
    selection (the pattern that triggers pandas' SettingWithCopyWarning).
    """
    return frame[frame['train-test-val'] == split_code].drop(columns=['train-test-val'])


# Split codes: 0 = train, 1 = test, 2 = validation.
x_train = _select_split(df_embedded_words, 0)
x_test = _select_split(df_embedded_words, 1)
x_val = _select_split(df_embedded_words, 2)

y_train = _select_split(df_y, 0)
y_test = _select_split(df_y, 1)
y_val = _select_split(df_y, 2)

In [21]:
# Report the row count of every split; each X/Y pair must have matching sizes.
for size_label, split_part in (
    ("X-Train: ", x_train),
    ("Y-Train: ", y_train),
    ("X-Test: ", x_test),
    ("Y-Test: ", y_test),
    ("X-val: ", x_val),
    ("Y-val: ", y_val),
):
    print(size_label + str(len(split_part)))

X-Train: 10238
Y-Train: 10238
X-Test: 1267
Y-Test: 1267
X-val: 1284
Y-val: 1284


In [22]:
# NOTE(review): this rebinds `model`, which until now referred to the word2vec vectors
# that get_word2vec() looks up. Re-running the embedding cell after this one will not
# work until the word2vec loading cell has been run again.
model = LogisticRegression(class_weight='balanced', solver='newton-cg',
                         multi_class='multinomial', random_state=30)
# scikit-learn expects a 1-D target; y_train is a one-column DataFrame, so flatten it
# rather than relying on the implicit conversion (and its suppressed warning).
model.fit(x_train, y_train.values.ravel())
y_pred = model.predict(x_test)

In [23]:
def printResults(y_test,y_predict):
    """Print the classification report, confusion matrix and accuracy.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predict : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # (heading printed before the metric or None, metric function), in output order.
    sections = (
        (None, classification_report),
        ("Confusion Matrix", confusion_matrix),
        ("\n Accuracy", accuracy_score),
    )
    for heading, metric in sections:
        if heading is not None:
            print(heading)
        print(metric(y_test, y_predict))

In [24]:
# Logistic regression, test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.15      0.40      0.22        92
           1       0.26      0.18      0.21       249
           2       0.17      0.17      0.17       212
           3       0.28      0.20      0.24       265
           4       0.27      0.29      0.28       241
           5       0.27      0.25      0.26       208

    accuracy                           0.23      1267
   macro avg       0.23      0.25      0.23      1267
weighted avg       0.25      0.23      0.23      1267

Confusion Matrix
[[37  9 17 10  7 12]
 [57 45 50 27 44 26]
 [51 30 36 32 33 30]
 [35 39 50 54 58 29]
 [27 23 33 42 71 45]
 [32 30 23 25 46 52]]

 Accuracy
0.23283346487766376


In [25]:
# Logistic regression, validation split. y_pred is overwritten with the new predictions.
y_pred = model.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.19      0.45      0.27       116
           1       0.19      0.14      0.16       263
           2       0.23      0.19      0.21       237
           3       0.27      0.17      0.21       248
           4       0.26      0.24      0.25       251
           5       0.23      0.33      0.27       169

    accuracy                           0.23      1284
   macro avg       0.23      0.25      0.23      1284
weighted avg       0.23      0.23      0.22      1284

Confusion Matrix
[[52 19 18  3  6 18]
 [83 36 47 31 30 36]
 [48 40 45 29 44 31]
 [34 45 38 43 53 35]
 [30 38 37 25 59 62]
 [23 14 13 27 37 55]]

 Accuracy
0.22585669781931464


In [26]:
# Two support-vector classifiers to compare: one with an RBF kernel and one with a
# degree-3 polynomial kernel. Hyper-parameters are fixed by hand here, not tuned.
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1)
poly = svm.SVC(kernel='poly', degree=3, C=1)

In [27]:
# scikit-learn expects a 1-D target; y_train is a one-column DataFrame, so flatten it
# rather than relying on the implicit conversion (and its suppressed warning).
rbf.fit(x_train, y_train.values.ravel())
y_pred = rbf.predict(x_test)

In [28]:
# RBF-kernel SVM, test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        92
           1       0.24      0.39      0.30       249
           2       0.00      0.00      0.00       212
           3       0.22      0.65      0.32       265
           4       0.29      0.08      0.13       241
           5       0.00      0.00      0.00       208

    accuracy                           0.23      1267
   macro avg       0.13      0.19      0.13      1267
weighted avg       0.15      0.23      0.15      1267

Confusion Matrix
[[  0  45   0  43   4   0]
 [  0  97   0 145   7   0]
 [  0  72   0 133   7   0]
 [  0  83   0 172  10   0]
 [  0  48   0 173  20   0]
 [  0  57   0 131  20   0]]

 Accuracy
0.2280978689818469


In [29]:
# RBF-kernel SVM, validation split. y_pred is overwritten with the new predictions.
y_pred = rbf.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       116
           1       0.28      0.43      0.34       263
           2       0.00      0.00      0.00       237
           3       0.21      0.66      0.31       248
           4       0.31      0.10      0.16       251
           5       0.00      0.00      0.00       169

    accuracy                           0.24      1284
   macro avg       0.13      0.20      0.13      1284
weighted avg       0.16      0.24      0.16      1284

Confusion Matrix
[[  0  59   0  55   2   0]
 [  0 112   0 139  12   0]
 [  0  73   0 156   8   0]
 [  0  67   0 164  17   0]
 [  0  58   0 167  26   0]
 [  0  36   0 115  18   0]]

 Accuracy
0.235202492211838


In [30]:
# scikit-learn expects a 1-D target; y_train is a one-column DataFrame, so flatten it
# rather than relying on the implicit conversion (and its suppressed warning).
poly.fit(x_train, y_train.values.ravel())
y_pred = poly.predict(x_test)

In [31]:
# Polynomial-kernel SVM, test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.15      0.04      0.07        92
           1       0.28      0.28      0.28       249
           2       0.26      0.12      0.17       212
           3       0.24      0.48      0.32       265
           4       0.26      0.32      0.29       241
           5       0.21      0.07      0.11       208

    accuracy                           0.25      1267
   macro avg       0.23      0.22      0.21      1267
weighted avg       0.24      0.25      0.23      1267

Confusion Matrix
[[  4  27  11  33  12   5]
 [  7  70  22  85  53  12]
 [  4  45  26  91  35  11]
 [  4  47  24 127  56   7]
 [  3  25   6 108  77  22]
 [  5  39  12  77  60  15]]

 Accuracy
0.2517758484609313


In [32]:
# Polynomial-kernel SVM, validation split. y_pred is overwritten with the new predictions.
y_pred = poly.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.27      0.07      0.11       116
           1       0.30      0.30      0.30       263
           2       0.23      0.09      0.13       237
           3       0.23      0.52      0.32       248
           4       0.24      0.24      0.24       251
           5       0.23      0.11      0.15       169

    accuracy                           0.25      1284
   macro avg       0.25      0.22      0.21      1284
weighted avg       0.25      0.25      0.22      1284

Confusion Matrix
[[  8  40  10  45   9   4]
 [ 11  78  24  94  40  16]
 [  6  45  21 112  41  12]
 [  0  36  17 129  55  11]
 [  1  45  12 110  61  22]
 [  4  18   9  66  53  19]]

 Accuracy
0.24610591900311526
