In [1]:
import os
import pickle
import warnings

import pandas as pd
import numpy as np
import seaborn as sns

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
import nltk.corpus

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn import svm

import gensim
from gensim.models.word2vec import Word2Vec

import matplotlib.pyplot as plt

import category_encoders as ce

warnings.filterwarnings("ignore")

In [2]:
#!pip3 install --upgrade gensim==3.8.3
#!pip3 install category_encoders

In [3]:
# Paths of the three tab-separated dataset splits.
trainFilePath = 'dataset/train2.tsv'
testFilePath = 'dataset/test2.tsv'
validationFilePath = 'dataset/val2.tsv'

# Column names are supplied explicitly (and only once), so pandas treats the
# first line of every file as data rather than as a header row.
column_names = ["ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
                "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt",
                "Pants on Fire Cnt", "Context", "Justification"]

# Read the three splits with identical parsing options.
df_train, df_test, df_validation = (
    pd.read_csv(split_path, delimiter='\t', names=column_names)
    for split_path in (trainFilePath, testFilePath, validationFilePath)
)

In [4]:
# Tag every row with the split it came from (0 = train, 1 = test,
# 2 = validation) so the frames can be concatenated and separated again later.
for split_code, split_frame in enumerate((df_train, df_test, df_validation)):
    split_frame["train-test-val"] = split_code

In [5]:
# Stack the three splits into one frame with a fresh 0..n-1 row index.
df_all = pd.concat([df_train, df_test, df_validation], ignore_index=True)

In [6]:
def dataCleaning(df,field):
    """Normalise the text column ``field`` of ``df`` in place and return ``df``.

    The rules are applied in this order:

    1. remove URLs (``http`` followed by non-space characters) and any
       left-over bare ``http``;
    2. remove ``@mentions``; a remaining stand-alone ``@`` becomes ``at``;
    3. replace every character that is not an ASCII letter or digit with a
       space;
    4. lower-case the result.

    URLs and mentions have to be handled before step 3: once punctuation has
    been turned into spaces there is no ``@`` left to match and a URL has
    already been broken into separate words.  Step 3 also covers the
    individual punctuation characters (``,!?'`"_`` and newlines), so no
    separate rule is needed for them.

    Every pattern is passed with ``regex=True`` so that it is interpreted as a
    regular expression regardless of the pandas default.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    field : str
        Name of a string column.  Missing values stay missing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    text = df[field]
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"@", "at", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9]", " ", regex=True)
    df[field] = text.str.lower()
    return df

def dataPreprocessing(df):
    """Drop incomplete rows, shorten the ID and clean all text columns.

    * Rows with a missing ``ID`` or a missing value in any of the five
      ``... Cnt`` columns are removed.  The surviving rows keep their original
      index labels (the index is not reset).
    * ``ID`` is reduced to the part before its first ``.``
      (presumably values look like ``"<number>.json"`` -- TODO confirm).
    * ``dataCleaning`` is applied to each free-text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the dataset columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    required_columns = ['ID', 'Barely True Cnt', 'False Cnt', 'Mostly True Cnt',
                        'Pants on Fire Cnt', 'Half True Cnt']
    # Work on an explicit copy so the column assignments below never write
    # into (or warn about) a slice of the caller's frame.
    df = df.dropna(subset=required_columns).copy()

    # Split without expand=True: that variant returns a two-column frame,
    # which cannot be stored in the single 'ID' column.  `.str[0]` takes the
    # text before the first dot.
    df['ID'] = df['ID'].str.split(".", n=1).str[0]

    text_columns = ('Statement', 'Subject', 'Speaker', 'Job Title',
                    'State', 'Party', 'Context', 'Justification')
    for text_column in text_columns:
        df = dataCleaning(df, text_column)

    return df

In [7]:
# Remove rows with a missing ID or a missing "... Cnt" value and normalise the
# text columns.  Rows are filtered without resetting the index, so df_all's
# index can contain gaps from here on.
# NOTE(review): df_all is rebound to its own cleaned version, so re-running
# this cell applies the preprocessing a second time.
df_all = dataPreprocessing(df_all)

In [8]:
# Ordinal-encode the truthfulness label (0 = least truthful ... 5 = most
# truthful), keeping the split tag alongside it.
df_y = df_all[['Label','train-test-val']]

# The key for rank 2 is 'barely-true', matching the dataset's
# "Barely True Cnt" column.  A key that does not occur in the data (such as
# 'mostly-false') leaves that whole class unmapped, and the encoder then
# assigns it its "unknown" code instead of 2.
# NOTE(review): confirm the spellings against df_all['Label'].unique().
label_order = {'pants-fire':0,'false':1,'barely-true':2,'half-true':3,'mostly-true':4,'true':5}

encoder= ce.OrdinalEncoder(cols=['Label'],return_df=True,
                           mapping=[{'col':'Label', 'mapping':label_order}])
df_y = encoder.fit_transform(df_y)

In [9]:
# Working frame for the text features.  Take an explicit copy: a new column
# is added to `df` further down, and writing into a bare column selection of
# df_all is a chained assignment (SettingWithCopyWarning).
df = df_all[['Label','Statement','train-test-val']].copy()

In [10]:
# Transform each statement into its list of unigram tokens
# (maximal runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
statement_tokens = df["Statement"].apply(tokenizer.tokenize)
df["Unigrams"] = statement_tokens

In [11]:
# Build the vocabulary: gather every token occurrence across all statements,
# then keep the distinct tokens in sorted order.
allUnigrams = [unigram for unigrams in df['Unigrams'] for unigram in unigrams]
vocabulary = sorted(set(allUnigrams))
print("Vocabulary Size: "+str(len(vocabulary)))

Vocabulary Size: 13572


In [12]:
# UNCOMMENT to download pretrained word2vec 

# import gensim.downloader as api
# path = api.load("word2vec-google-news-300", return_path=True)
# print(path)

In [13]:
# Location of the pretrained Google News word2vec file.  The default points
# into the current user's ~/gensim-data directory instead of one specific
# user's home, so the notebook also runs on other machines (the commented-out
# download cell prints the actual path it stored the file under).  Set the
# WORD2VEC_PATH environment variable to load the file from somewhere else.
default_word2vec_path = os.path.join(
    os.path.expanduser('~'), 'gensim-data', 'word2vec-google-news-300',
    'word2vec-google-news-300.gz')
word2vec_path = os.environ.get('WORD2VEC_PATH', default_word2vec_path)

# Load the vectors from the binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [14]:
def get_word2vec(unigrams, generate_missing=False, k=300, vectors=None):
    """Return the mean word vector of a token list.

    Parameters
    ----------
    unigrams : sequence of str
        Tokens of one statement.
    generate_missing : bool, default False
        How tokens absent from ``vectors`` are represented: a random vector
        from ``np.random.rand(k)`` when True, a zero vector when False.
        Either way the token still counts towards the mean.
    k : int, default 300
        Dimensionality of the vectors; must match the vectors in ``vectors``.
    vectors : mapping-like, optional
        Any object supporting ``word in vectors`` and ``vectors[word]``.
        When omitted, the notebook-level ``model`` is used.  Passing the
        vectors explicitly keeps this function working even if ``model`` is
        later rebound to something else.

    Returns
    -------
    numpy.ndarray
        Vector of length ``k``; all zeros for an empty token list.
    """
    if len(unigrams) < 1:
        return np.zeros(k)
    if vectors is None:
        # Resolved at call time, so the default follows the notebook global.
        vectors = model

    make_missing = np.random.rand if generate_missing else np.zeros
    vectorized = [vectors[word] if word in vectors else make_missing(k)
                  for word in unigrams]
    return np.mean(vectorized, axis=0)

def get_word2vec_embeddings(df, generate_missing=False):
    """Embed every row of ``df['Unigrams']`` with ``get_word2vec``.

    ``generate_missing`` is forwarded unchanged to ``get_word2vec``.
    Returns a plain list with one embedding per row, in row order.
    """
    def embed(tokens):
        return get_word2vec(tokens, generate_missing=generate_missing)

    per_row_vectors = df['Unigrams'].apply(embed)
    return list(per_row_vectors)

In [15]:
# One averaged word2vec vector per statement, in the row order of `df`.
# generate_missing keeps its default (False), so tokens that are not in the
# pretrained vocabulary contribute zero vectors to the average.
embeddings = get_word2vec_embeddings(df)

In [16]:
# One row per statement, one column per embedding dimension.
# NOTE(review): no index is passed, so this frame gets a fresh 0..n-1 index;
# it only matches df's index labels if no rows were dropped from df_all.
df_embedded_words = pd.DataFrame.from_records(embeddings) 

In [17]:
# Attach the split tag by POSITION, not by index label.  df_embedded_words has
# a fresh 0..n-1 index, whereas df keeps the index labels left after row
# filtering.  Assigning the Series directly would align on those labels:
# tags would shift relative to the embeddings and some rows would get NaN,
# pairing features with the wrong split and the wrong label.
assert len(df_embedded_words) == len(df), "expected one embedding row per statement"
df_embedded_words["train-test-val"] = df["train-test-val"].to_numpy()

In [18]:
# Separate the embedded features and the encoded labels back into the train
# (0), test (1) and validation (2) partitions; the split tag itself is dropped
# from every resulting frame.
split_column = 'train-test-val'

x_train, x_test, x_val = (
    df_embedded_words[df_embedded_words[split_column] == split_code].drop([split_column], axis=1)
    for split_code in (0, 1, 2)
)

y_train, y_test, y_val = (
    df_y[df_y[split_column] == split_code].drop([split_column], axis=1)
    for split_code in (0, 1, 2)
)

In [19]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression on the averaged word2vec features.
# The estimator is deliberately not called `model`: that name holds the
# word2vec vectors read by get_word2vec(), and rebinding it would break any
# re-run of the embedding cells.
classifier = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg',
                                multi_class='multinomial', random_state=13)

# y_train is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn expects.
classifier.fit(x_train, np.ravel(y_train))
y_predict = classifier.predict(x_test)

In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Returns the tuple ``(accuracy, precision, recall, f1)``.
    """
    # Keyword arguments shared by the three averaged scores.
    weighted = dict(pos_label=None, average='weighted')

    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, **weighted)
    recall = recall_score(y_test, y_predicted, **weighted)
    f1 = f1_score(y_test, y_predicted, **weighted)
    return accuracy, precision, recall, f1

In [21]:
# Evaluate the test-set predictions and print the four scores to three decimals.
accuracy, precision, recall, F1 = get_metrics(y_test, y_predict)
report_template = "accuracy = %.3f, precision = %.3f, recall = %.3f, F1 = %.3f"
print(report_template % (accuracy, precision, recall, F1))

accuracy = 0.168, precision = 0.184, recall = 0.168, F1 = 0.172
