In [1]:
import os
import pickle
import warnings

import category_encoders as ce
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

warnings.filterwarnings("ignore")

In [2]:
#!pip3 install --upgrade gensim==3.8.3
#!pip3 install category_encoders

In [3]:
# Paths of the three dataset splits.
trainFilePath = 'dataset/train2.tsv'
testFilePath = 'dataset/test2.tsv'
validationFilePath = 'dataset/val2.tsv'

# Column names supplied to every split when it is read.
column_names = [
    "ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
    "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt",
    "Context", "Justification",
]

# Read the tab-separated files in train / test / validation order.
df_train, df_test, df_validation = (
    pd.read_csv(split_path, sep='\t', names=column_names)
    for split_path in (trainFilePath, testFilePath, validationFilePath)
)

In [4]:
# Mark every row with the split it belongs to: 0 = train, 1 = test, 2 = validation.
for split_id, split_frame in enumerate((df_train, df_test, df_validation)):
    split_frame["train-test-val"] = split_id

In [5]:
# Stack the three splits into one frame with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test, df_validation], ignore_index=True)

In [6]:
def dataCleaning(df, field):
    """Normalise the text column ``field`` of ``df`` and return ``df``.

    Steps, in order: remove URLs, remove ``@mention`` tokens, spell a lone
    ``@`` as "at", replace every remaining non-alphanumeric character with a
    space, lower-case the text, and turn missing values into empty strings.

    Args:
        df: DataFrame holding the column to clean. The column is reassigned
            on this frame, so the caller's object is modified.
        field: Name of a string column in ``df``.

    Returns:
        The same ``df`` object with ``field`` cleaned.
    """
    # URL and mention patterns must run before punctuation is stripped:
    # stripping first removes the ':', '/' and '@' characters they rely on.
    substitutions = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"@", "at"),
        (r"[^A-Za-z0-9]", " "),
    )
    cleaned = df[field]
    for pattern, replacement in substitutions:
        # regex=True is spelled out because pandas >= 2.0 treats the pattern
        # as a literal string by default, which would disable every step.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection, which is chained assignment and is not guaranteed to
    # reach the parent frame.
    df[field] = cleaned.str.lower().fillna('')
    return df

def dataPreprocessing(df):
    """Drop incomplete rows, shorten the IDs and clean every text column.

    Args:
        df: Combined dataset frame with the columns named in this function.

    Returns:
        A new DataFrame in which rows lacking an ID or any of the five count
        columns are removed, ``ID`` holds only the text before its first ".",
        and the text columns have been passed through ``dataCleaning``.
    """
    required_columns = ['ID', 'Barely True Cnt', 'False Cnt', 'Mostly True Cnt',
                        'Pants on Fire Cnt', 'Half True Cnt']
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the caller's data.
    df = df.dropna(subset=required_columns).copy()

    # Keep the part of the ID before the first ".". Taking .str[0] yields a
    # Series; expand=True would yield a multi-column frame, which cannot be
    # assigned to the single 'ID' column.
    df['ID'] = df['ID'].str.split(".", n=1).str[0]

    text_columns = ('Statement', 'Subject', 'Speaker', 'Job Title',
                    'State', 'Party', 'Context', 'Justification')
    for text_column in text_columns:
        df = dataCleaning(df, text_column)

    return df

In [7]:
# Rebind df_all to the preprocessed frame (incomplete rows dropped, text columns cleaned).
df_all = dataPreprocessing(df_all)

In [8]:
# Preview the first rows of the preprocessed frame.
df_all.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Job Title,State,Party,Barely True Cnt,False Cnt,Half True Cnt,Mostly True Cnt,Pants on Fire Cnt,Context,Justification,train-test-val
0,2635,false,says the annies list political group supports ...,abortion,dwayne bohac,state representative,texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,that s a premise that he fails to back up ann...,0
1,10540,half-true,when did the decline of coal start it started...,energy history job accomplishments,scott surovell,state delegate,virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech,surovell said the decline of coal started whe...,0
2,324,mostly-true,hillary clinton agrees with john mccain by vo...,foreign policy,barack obama,president,illinois,democrat,70.0,71.0,160.0,163.0,9.0,denver,obama said he would have voted against the ame...,0
3,1123,false,health care reform legislation is likely to ma...,health care,blog posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,the release may have a point that mikulskis co...,0
4,9028,half-true,the economic turnaround started at the end of ...,economy jobs,charlie crist,,florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on cnn,crist said that the economic turnaround start...,0


In [9]:
# Collapse the six truthfulness labels into a binary target:
# 0 for pants-fire / false / barely-true, 1 for half-true / mostly-true / true.
label_to_binary = {'pants-fire': 0, 'false': 0, 'barely-true': 0,
                   'half-true': 1, 'mostly-true': 1, 'true': 1}
label_mapping = [{'col': 'Label', 'mapping': label_to_binary}]
encoder = ce.OrdinalEncoder(cols=['Label'], return_df=True, mapping=label_mapping)

# The split marker is carried along so the targets can be divided later.
df_y = encoder.fit_transform(df_all[['Label', 'train-test-val']])

In [10]:
# Working frame for feature extraction. .copy() makes it independent of
# df_all, so the token columns added to it later are written to this frame
# itself and not to a selection of df_all.
df = df_all[['Label', 'Statement', 'Justification', 'train-test-val']].copy()

In [11]:
# Transform Statement and Justification into lists of unigram tokens
# (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
unigram_columns = (
    ("Statement", "Statement-Unigrams"),
    ("Justification", "Justification-Unigrams"),
)
for source_column, token_column in unigram_columns:
    df[token_column] = df[source_column].apply(tokenizer.tokenize)

In [12]:
# Uncomment the lines below to download the pretrained word2vec-google-news-300 vectors and print where they were saved.

# import gensim.downloader as api
# path = api.load("word2vec-google-news-300", return_path=True)
# print(path)

In [13]:
# Pretrained Google News vectors under the current user's home directory.
# expanduser resolves "~" for whichever account and OS runs the notebook,
# replacing the per-user absolute paths that had to be edited by hand.
# Presumably this is where gensim's downloader stores the file by default; confirm.
word2vec_path = os.path.expanduser('~/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz')

# NOTE: get_word2vec reads these vectors through the global name `model`.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [14]:
def get_word2vec(unigrams, generate_missing=False, k=300):
    """Return the mean embedding of a token list.

    Each token is looked up in the global ``model``. A token that is not in
    ``model`` contributes a length-``k`` vector of zeros, or of
    ``np.random.rand`` values when ``generate_missing`` is true; either way it
    still counts towards the average.

    Args:
        unigrams: Sequence of tokens.
        generate_missing: Use random vectors instead of zeros for unknown tokens.
        k: Length of the fallback vectors and of the empty-input result;
            presumably equal to the vector size of ``model`` (TODO confirm).

    Returns:
        A 1-D array: the element-wise mean of the token vectors, or ``k``
        zeros when ``unigrams`` is empty.
    """
    if len(unigrams) == 0:
        return np.zeros(k)

    # Pick the generator for unknown tokens once instead of branching twice.
    fallback = np.random.rand if generate_missing else np.zeros
    token_vectors = [model[token] if token in model else fallback(k) for token in unigrams]
    return np.divide(np.sum(token_vectors, axis=0), len(token_vectors))

def get_word2vec_embeddings(df,field, generate_missing=False):
    """Compute one averaged embedding per row of ``df[field]``.

    Args:
        df: DataFrame whose ``field`` column holds token lists.
        field: Name of the token-list column.
        generate_missing: Forwarded to ``get_word2vec``.

    Returns:
        A list with one ``get_word2vec`` result per row, in row order.
    """
    return [
        get_word2vec(tokens, generate_missing=generate_missing)
        for tokens in df[field]
    ]

In [15]:
# One averaged word2vec vector per statement (unknown tokens count as zeros).
embeddings_statement = get_word2vec_embeddings(df,'Statement-Unigrams')

In [16]:
# One averaged word2vec vector per justification (unknown tokens count as zeros).
embeddings_justification = get_word2vec_embeddings(df,'Justification-Unigrams')

In [47]:
# Turn each list of embedding vectors into a frame with one row per record.
# The columns are prefixed per source: without a prefix both frames use the
# same integer labels, so the concatenated frame would carry every label
# twice and the two feature groups could not be told apart.
df_embedded_statement = pd.DataFrame.from_records(embeddings_statement).add_prefix("statement_")
df_embedded_justification = pd.DataFrame.from_records(embeddings_justification).add_prefix("justification_")

# Side-by-side feature matrix: statement features followed by justification features.
df_embedded = pd.concat([df_embedded_statement, df_embedded_justification], axis=1)

In [48]:
# Copy the split marker across by position: .to_numpy() discards df's index, so
# values are assigned row by row instead of being aligned on index labels.
# Assumes df and df_embedded hold the same rows in the same order (df_embedded
# is built from embeddings computed over df's columns).
df_embedded["train-test-val"] = df["train-test-val"].to_numpy()

In [49]:
def _select_split(frame, split_id):
    """Return the rows of ``frame`` marked ``split_id``, without the marker column."""
    # drop() returns a new frame, so nothing is mutated in place on a filtered
    # selection (the original inplace drops acted on slices of the source frame).
    return frame.loc[frame['train-test-val'] == split_id].drop(columns=['train-test-val'])


# Split ids: 0 = train, 1 = test, 2 = validation.
x_train, x_test, x_val = (_select_split(df_embedded, split_id) for split_id in (0, 1, 2))
y_train, y_test, y_val = (_select_split(df_y, split_id) for split_id in (0, 1, 2))

In [50]:
# Report the row count of every feature / target split.
split_sizes = (
    ("X-Train: ", x_train),
    ("Y-Train: ", y_train),
    ("X-Test: ", x_test),
    ("Y-Test: ", y_test),
    ("X-val: ", x_val),
    ("Y-val: ", y_val),
)
for caption, split_frame in split_sizes:
    print(caption + str(len(split_frame)))

X-Train: 10238
Y-Train: 10238
X-Test: 1267
Y-Test: 1267
X-val: 1284
Y-val: 1284


In [51]:
# Class-weighted logistic regression on the embedding features.
# NOTE: this rebinds the global `model`, which until this cell held the
# word2vec vectors that get_word2vec reads. Re-running the embedding cells
# after this one requires reloading the vectors first.
model = LogisticRegression(class_weight='balanced', solver='newton-cg',
                           multi_class='multinomial', random_state=30)

# y_train is a one-column DataFrame; flatten it to the 1-D (n_samples,) shape
# scikit-learn expects for targets instead of relying on implicit conversion.
model.fit(x_train, y_train.to_numpy().ravel())
y_pred = model.predict(x_test)

In [52]:
def printResults(y_test,y_predict):
    """Print the classification report, confusion matrix and accuracy.

    Args:
        y_test: True labels.
        y_predict: Predicted labels, in the same order as ``y_test``.
    """
    # (heading, metric) pairs; the report is printed without a heading.
    sections = (
        (None, classification_report),
        ("Confusion Matrix", confusion_matrix),
        ("\n Accuracy", accuracy_score),
    )
    for heading, metric in sections:
        if heading is not None:
            print(heading)
        print(metric(y_test, y_predict))

In [53]:
# Metrics for the logistic regression predictions on the test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.54      0.59      0.56       553
           1       0.66      0.61      0.63       714

    accuracy                           0.60      1267
   macro avg       0.60      0.60      0.60      1267
weighted avg       0.61      0.60      0.60      1267

Confusion Matrix
[[328 225]
 [282 432]]

 Accuracy
0.5998421468034728


In [54]:
# Same logistic regression, scored on the validation split (y_pred is rebound).
y_pred = model.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.61      0.63      0.62       616
           1       0.65      0.63      0.64       668

    accuracy                           0.63      1284
   macro avg       0.63      0.63      0.63      1284
weighted avg       0.63      0.63      0.63      1284

Confusion Matrix
[[390 226]
 [245 423]]

 Accuracy
0.633177570093458


In [55]:
# Two support-vector classifiers: an RBF kernel (gamma=0.5, C=0.1) and a
# degree-3 polynomial kernel (C=1). Neither sets class_weight, unlike the
# logistic regression, which uses class_weight='balanced'.
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1)
poly = svm.SVC(kernel='poly', degree=3, C=1)

In [56]:
# Fit the RBF-kernel SVM. y_train is a one-column DataFrame, so it is
# flattened to the 1-D (n_samples,) shape scikit-learn expects for targets.
rbf.fit(x_train, y_train.to_numpy().ravel())
y_pred = rbf.predict(x_test)

In [57]:
# Metrics for the RBF-kernel SVM predictions on the test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.58      0.15      0.24       553
           1       0.58      0.91      0.71       714

    accuracy                           0.58      1267
   macro avg       0.58      0.53      0.48      1267
weighted avg       0.58      0.58      0.51      1267

Confusion Matrix
[[ 85 468]
 [ 62 652]]

 Accuracy
0.5816890292028414


In [58]:
# RBF-kernel SVM scored on the validation split (y_pred is rebound).
y_pred = rbf.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.69      0.21      0.32       616
           1       0.56      0.92      0.69       668

    accuracy                           0.58      1284
   macro avg       0.62      0.56      0.50      1284
weighted avg       0.62      0.58      0.51      1284

Confusion Matrix
[[127 489]
 [ 56 612]]

 Accuracy
0.5755451713395638


In [59]:
# Fit the polynomial-kernel SVM. y_train is a one-column DataFrame, so it is
# flattened to the 1-D (n_samples,) shape scikit-learn expects for targets.
poly.fit(x_train, y_train.to_numpy().ravel())
y_pred = poly.predict(x_test)

In [60]:
# Metrics for the polynomial-kernel SVM predictions on the test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.61      0.34      0.44       553
           1       0.62      0.83      0.71       714

    accuracy                           0.62      1267
   macro avg       0.61      0.59      0.57      1267
weighted avg       0.62      0.62      0.59      1267

Confusion Matrix
[[189 364]
 [121 593]]

 Accuracy
0.6172059984214681


In [61]:
# Polynomial-kernel SVM scored on the validation split (y_pred is rebound).
y_pred = poly.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.70      0.38      0.49       616
           1       0.60      0.85      0.70       668

    accuracy                           0.62      1284
   macro avg       0.65      0.61      0.59      1284
weighted avg       0.64      0.62      0.60      1284

Confusion Matrix
[[232 384]
 [101 567]]

 Accuracy
0.6222741433021807
