In [1]:
import os
import pickle
import warnings

import category_encoders as ce
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report,accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

warnings.filterwarnings("ignore")

In [2]:
#!pip3 install --upgrade gensim==3.8.3
#!pip3 install category_encoders

In [3]:
# Paths of the three dataset splits (tab-separated files).
trainFilePath = 'dataset/train2.tsv'
testFilePath = 'dataset/test2.tsv'
validationFilePath = 'dataset/val2.tsv'

# Column names applied to every split when it is read.
columnNames = [
    "ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
    "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt",
    "Context", "Justification",
]

# Read the splits in train / test / validation order with identical settings.
df_train, df_test, df_validation = (
    pd.read_csv(splitPath, delimiter='\t', names=columnNames)
    for splitPath in (trainFilePath, testFilePath, validationFilePath)
)

In [4]:
# Tag every row with the split it came from: 0 = train, 1 = test, 2 = validation.
for split_id, split_frame in enumerate((df_train, df_test, df_validation)):
    split_frame["train-test-val"] = split_id

In [5]:
# Stack the three splits into one frame with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test, df_validation], ignore_index=True)

In [6]:
def dataCleaning(df,field):
    """Normalise one text column of ``df`` in place and return ``df``.

    Steps, in order:
      1. remove whole URLs (``http...`` up to the next whitespace),
      2. remove ``@mentions``,
      3. turn any remaining lone ``@`` into the word ``at``,
      4. replace every character that is not a letter or digit with a space,
      5. remove any leftover ``http`` fragment,
      6. lower-case the text.

    URLs, mentions and ``@`` are handled *before* the punctuation strip;
    once punctuation has been turned into spaces those patterns can no
    longer match. ``regex`` is passed explicitly because the default of
    ``Series.str.replace`` is not the same across pandas versions.

    Args:
        df: DataFrame holding the column to clean.
        field: name of a string column in ``df``; missing values stay missing.

    Returns:
        The same ``df`` object, with ``df[field]`` overwritten.
    """
    text = df[field]
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace("@", "at", regex=False)
    text = text.str.replace(r"[^A-Za-z0-9]", " ", regex=True)
    text = text.str.replace("http", "", regex=False)
    df[field] = text.str.lower()
    return df

def dataPreprocessing(df):
    """Drop incomplete rows, shorten the ID and clean every text column.

    Rows with a missing ``ID`` or a missing value in any of the five count
    columns are removed. ``ID`` keeps only the text before its first ".".
    Each free-text column is then passed through ``dataCleaning``.

    Args:
        df: DataFrame containing the ID, count and text columns used below.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    required_columns = ['ID', 'Barely True Cnt', 'False Cnt', 'Mostly True Cnt',
                        'Pants on Fire Cnt', 'Half True Cnt']
    text_columns = ('Statement', 'Subject', 'Speaker', 'Job Title',
                    'State', 'Party', 'Context', 'Justification')

    # Copy so the column assignments below write to an independent frame
    # rather than to a filtered view of the caller's data.
    df = df.dropna(subset=required_columns).copy()

    # Keep only the part before the first ".". With expand=True the split
    # yields a multi-column DataFrame, which cannot be stored in one column.
    df['ID'] = df['ID'].str.split(".", n = 1).str[0]

    for field in text_columns:
        df = dataCleaning(df, field)

    return df

In [7]:
# Replace the combined frame with its preprocessed version
# (incomplete rows dropped, text columns cleaned by dataPreprocessing).
df_all = dataPreprocessing(df_all)

In [8]:
# Collapse the six ratings into a binary target:
# pants-fire / false / barely-true -> 0, half-true / mostly-true / true -> 1.
label_to_binary = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1,
}
encoder = ce.OrdinalEncoder(
    cols=['Label'],
    return_df=True,
    mapping=[{'col': 'Label', 'mapping': label_to_binary}],
)
# Only the label and the split marker are kept in the target frame.
df_y = encoder.fit_transform(df_all[['Label', 'train-test-val']])

In [9]:
# Class balance of the encoded target, counted with vectorized comparisons
# instead of a row-by-row Python loop.
encoded_labels = df_y['Label']
f = int((encoded_labels == 0).sum())   # rows encoded as 0
t = int((encoded_labels == 1).sum())   # rows encoded as 1
o = len(encoded_labels) - f - t        # anything that is neither 0 nor 1

print("F: "+str(f))
print("T: "+str(t))
print("OTHER: "+str(o))

F: 5655
T: 7134
OTHER: 0


In [10]:
# Take an independent copy: a new "Unigrams" column is assigned to this frame
# afterwards, and writing to a bare selection of df_all triggers pandas'
# SettingWithCopyWarning.
df = df_all[['Label','Statement','train-test-val']].copy()

In [11]:
# Transform each Statement into a list of unigram tokens.
# The pattern \w+ makes every run of word characters one token.
tokenizer = RegexpTokenizer(r'\w+')
df["Unigrams"] = df["Statement"].apply(tokenizer.tokenize)

In [12]:
# Build the vocabulary: flatten all token lists, then keep the sorted unique tokens.
allUnigrams = [token for token_list in df['Unigrams'] for token in token_list]
vocabulary = sorted(set(allUnigrams))
print("Vocabulary Size: {}".format(len(vocabulary)))

Vocabulary Size: 13572


In [13]:
# UNCOMMENT to download pretrained word2vec 

# import gensim.downloader as api
# path = api.load("word2vec-google-news-300", return_path=True)
# print(path)

In [14]:
# Location of the pretrained word2vec archive. Set the WORD2VEC_PATH environment
# variable to override it; the default is the "gensim-data" folder in the current
# user's home directory, so no user-specific absolute path is baked into the notebook.
word2vec_path = os.environ.get(
    "WORD2VEC_PATH",
    os.path.join(os.path.expanduser("~"), "gensim-data",
                 "word2vec-google-news-300", "word2vec-google-news-300.gz"),
)
# binary=True: the file is read in the binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [15]:
def get_word2vec(unigrams, generate_missing=False, k=300):
    """Average the word vectors of a token list into one vector of length ``k``.

    Vectors are looked up in the module-level ``model``. A token that is not
    in ``model`` contributes a random vector (``generate_missing=True``) or a
    zero vector (default). An empty token list yields a zero vector.

    Args:
        unigrams: sequence of tokens.
        generate_missing: use ``np.random.rand(k)`` for unknown tokens
            instead of zeros.
        k: vector length used for the zero / random fillers.

    Returns:
        numpy array: the element-wise mean over all tokens.
    """
    if len(unigrams) == 0:
        return np.zeros(k)

    # Factory for the stand-in vector of an unknown token; called once per
    # unknown token so random fillers differ from token to token.
    make_filler = np.random.rand if generate_missing else np.zeros

    token_vectors = []
    for token in unigrams:
        token_vectors.append(model[token] if token in model else make_filler(k))

    return np.divide(np.sum(token_vectors, axis=0), len(token_vectors))

def get_word2vec_embeddings(df, generate_missing=False):
    """Embed every token list in ``df['Unigrams']`` with ``get_word2vec``.

    Prints the type of the intermediate pandas result, then returns the
    per-row vectors as a plain list.

    Args:
        df: DataFrame with a ``Unigrams`` column of token lists.
        generate_missing: forwarded to ``get_word2vec``.

    Returns:
        list with one averaged vector per row of ``df``.
    """
    def embed(tokens):
        return get_word2vec(tokens, generate_missing=generate_missing)

    row_vectors = df['Unigrams'].apply(embed)
    print(type(row_vectors))
    return row_vectors.tolist()

In [16]:
# Embed every statement. generate_missing keeps its default (False), so tokens
# missing from the word2vec model contribute zero vectors to the average.
embeddings = get_word2vec_embeddings(df)

<class 'pandas.core.series.Series'>


In [17]:
# Turn the list of per-statement vectors into a frame: one row per statement.
df_embedded_words = pd.DataFrame.from_records(embeddings) 

In [18]:
# Copy the split marker over by position. .to_numpy() drops df's index so the
# values are not re-aligned against the freshly built frame's own index.
df_embedded_words["train-test-val"] = df["train-test-val"].to_numpy()

In [19]:
# Sanity check: (rows, embedding columns + the split-marker column).
df_embedded_words.shape

(12789, 301)

In [20]:
SPLIT_COLUMN = 'train-test-val'


def _take_split(frame, split_id):
    """Return the rows of ``frame`` tagged ``split_id``, without the split-marker column.

    A new frame is returned (no ``inplace`` mutation of a filtered slice).
    """
    return frame[frame[SPLIT_COLUMN] == split_id].drop(columns=[SPLIT_COLUMN])


# Split ids: 0 = train, 1 = test, 2 = validation.
x_train, x_test, x_val = (_take_split(df_embedded_words, split_id) for split_id in (0, 1, 2))
y_train, y_test, y_val = (_take_split(df_y, split_id) for split_id in (0, 1, 2))

In [21]:
# Report the number of rows in each feature / target split.
split_sizes = (
    ("X-Train", x_train),
    ("Y-Train", y_train),
    ("X-Test", x_test),
    ("Y-Test", y_test),
    ("X-val", x_val),
    ("Y-val", y_val),
)
for split_label, split_data in split_sizes:
    print("{}: {}".format(split_label, len(split_data)))

X-Train: 10238
Y-Train: 10238
X-Test: 1267
Y-Test: 1267
X-val: 1284
Y-val: 1284


In [22]:
# NOTE: this rebinds the global name `model`, which until now held the word2vec
# vectors that get_word2vec() looks up. Re-running the embedding cells after
# this cell requires reloading the word2vec vectors first.
model = LogisticRegression(class_weight='balanced', solver='newton-cg', 
                         multi_class='multinomial', random_state=30,C=1)
# y_train is a one-column DataFrame; pass the flat 1-D label array instead of
# a column vector.
model.fit(x_train, y_train.to_numpy().ravel())
y_pred = model.predict(x_test)

In [23]:
def printResults(y_test,y_predict):
    """Print the classification report, confusion matrix and accuracy.

    Args:
        y_test: true labels.
        y_predict: predicted labels for the same rows.
    """
    # (heading, metric) pairs, printed in this order; the report has no heading.
    sections = (
        (None, classification_report),
        ("Confusion Matrix", confusion_matrix),
        ("\n Accuracy", accuracy_score),
    )
    for heading, metric in sections:
        if heading is not None:
            print(heading)
        print(metric(y_test, y_predict))

In [24]:
# Metrics for the logistic-regression predictions on the test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.54      0.59      0.56       553
           1       0.65      0.61      0.63       714

    accuracy                           0.60      1267
   macro avg       0.59      0.60      0.59      1267
weighted avg       0.60      0.60      0.60      1267

Confusion Matrix
[[324 229]
 [281 433]]

 Accuracy
0.5974743488555643


In [26]:
# Score the validation split with the already fitted classifier and report the metrics.
y_pred = model.predict(x_val)
printResults(y_val,y_pred)

              precision    recall  f1-score   support

           0       0.60      0.64      0.62       616
           1       0.65      0.60      0.62       668

    accuracy                           0.62      1284
   macro avg       0.62      0.62      0.62      1284
weighted avg       0.62      0.62      0.62      1284

Confusion Matrix
[[395 221]
 [265 403]]

 Accuracy
0.6214953271028038


In [27]:
# Two support-vector classifiers to compare:
# an RBF kernel (gamma=0.5, C=0.1) and a degree-3 polynomial kernel (C=1).
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1)
poly = svm.SVC(kernel='poly', degree=3, C=1)

In [28]:
# y_train is a one-column DataFrame; pass the flat 1-D label array instead of
# a column vector.
rbf.fit(x_train, y_train.to_numpy().ravel())
y_pred = rbf.predict(x_test)

In [29]:
# Metrics for the RBF-kernel SVM predictions on the test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.60      0.20      0.29       553
           1       0.59      0.90      0.71       714

    accuracy                           0.59      1267
   macro avg       0.60      0.55      0.50      1267
weighted avg       0.59      0.59      0.53      1267

Confusion Matrix
[[108 445]
 [ 72 642]]

 Accuracy
0.5919494869771112


In [30]:
# y_train is a one-column DataFrame; pass the flat 1-D label array instead of
# a column vector.
poly.fit(x_train, y_train.to_numpy().ravel())
y_pred = poly.predict(x_test)

In [31]:
# Metrics for the polynomial-kernel SVM predictions on the test split.
printResults(y_test,y_pred)

              precision    recall  f1-score   support

           0       0.60      0.31      0.41       553
           1       0.61      0.84      0.71       714

    accuracy                           0.61      1267
   macro avg       0.60      0.57      0.56      1267
weighted avg       0.60      0.61      0.58      1267

Confusion Matrix
[[172 381]
 [117 597]]

 Accuracy
0.6069455406471981
