In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
import nltk.corpus

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import  LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn import svm

from gensim.models.word2vec import Word2Vec

import matplotlib.pyplot as plt
import pickle

import warnings
warnings.filterwarnings("ignore")



In [2]:
# Locations of the three tab-separated dataset splits.
trainFilePath = 'dataset/train2.tsv'
testFilePath = 'dataset/test2.tsv'
validationFilePath = 'dataset/val2.tsv'

# Column names applied to every split. They are passed via `names=` (with no
# `header=` argument), so pandas does not take the names from the files.
columnNames = [
    "ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
    "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt",
    "Context", "Justification",
]

# Identical parsing options for all three files.
readOptions = dict(delimiter='\t', names=columnNames)

df_train = pd.read_csv(trainFilePath, **readOptions)
df_test = pd.read_csv(testFilePath, **readOptions)
df_validation = pd.read_csv(validationFilePath, **readOptions)

In [3]:
def dataCleaning(df,field):
    """Normalise the free text stored in column ``field`` of ``df``.

    The column is rewritten in place on ``df`` (the same object is returned),
    applying these steps in order:

    1. remove ``@mention`` tokens (``@`` followed by non-space characters);
    2. remove URLs (``http`` followed by non-space characters) and any bare
       ``http`` left over;
    3. spell out a remaining stand-alone ``@`` as ``at``;
    4. replace every character that is not an ASCII letter or digit with a
       space;
    5. lower-case the result.

    Steps 1-3 must run before step 4: once punctuation has been blanked out
    there is no ``@`` left to translate and a URL has already been broken into
    separate words that ``http\\S+`` can no longer match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean. It is modified in place.
    field : str
        Name of a string-valued column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[field]`` cleaned. Missing values in
        the column stay missing.
    """
    text = df[field]

    # `regex=` is spelled out on every call: the default of Series.str.replace
    # switched from True to False in pandas 2.0, which would otherwise make the
    # patterns below match literally (i.e. never).
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace("http", "", regex=False)
    text = text.str.replace("@", "at", regex=False)

    # Everything that is not alphanumeric (punctuation, quotes, underscores,
    # newlines, ...) becomes a single space.
    text = text.str.replace(r"[^A-Za-z0-9]", " ", regex=True)

    df[field] = text.str.lower()
    return df

def dataPreprocessing(df):
    """Filter incomplete rows, trim the ID and clean every text column.

    Steps:

    1. drop rows in which ``ID`` or any of the five count columns is missing;
    2. keep only the part of ``ID`` that precedes its first ``"."``;
    3. run ``dataCleaning`` on each free-text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns referenced below. ``ID`` must be
        string-valued (the ``.str`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    requiredColumns = ['ID', 'Barely True Cnt', 'False Cnt', 'Mostly True Cnt',
                       'Pants on Fire Cnt', 'Half True Cnt']
    # One dropna call replaces the chain of per-column notna() filters. The
    # explicit copy guarantees that the column assignments below operate on an
    # independent frame rather than on a slice of the caller's data.
    df = df.dropna(subset=requiredColumns).copy()

    # Take the first piece of the split as a Series. Assigning the two-column
    # DataFrame produced by `expand=True` to a single column is rejected by
    # recent pandas versions ("Columns must be same length as key").
    df['ID'] = df['ID'].str.split(".", n=1).str[0]

    textColumns = ('Statement', 'Subject', 'Speaker', 'Job Title',
                   'State', 'Party', 'Context', 'Justification')
    for field in textColumns:
        df = dataCleaning(df, field)

    return df

In [4]:
# Run the preprocessing defined above on the training split: rows with a
# missing ID or count are dropped, the ID is trimmed and the text columns are
# cleaned. NOTE: this rebinds `df_train`, so re-running the cell feeds the
# already-processed frame back through dataPreprocessing.
df_train = dataPreprocessing(df_train)

In [5]:
# Preview the first rows of the preprocessed training data.
df_train.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Job Title,State,Party,Barely True Cnt,False Cnt,Half True Cnt,Mostly True Cnt,Pants on Fire Cnt,Context,Justification
0.0,2635,false,says the annies list political group supports ...,abortion,dwayne bohac,state representative,texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,that s a premise that he fails to back up ann...
1.0,10540,half-true,when did the decline of coal start it started...,energy history job accomplishments,scott surovell,state delegate,virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech,surovell said the decline of coal started whe...
2.0,324,mostly-true,hillary clinton agrees with john mccain by vo...,foreign policy,barack obama,president,illinois,democrat,70.0,71.0,160.0,163.0,9.0,denver,obama said he would have voted against the ame...
3.0,1123,false,health care reform legislation is likely to ma...,health care,blog posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,the release may have a point that mikulskis co...
4.0,9028,half-true,the economic turnaround started at the end of ...,economy jobs,charlie crist,,florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on cnn,crist said that the economic turnaround start...


In [6]:
# Let's start with the S condition (only the Label and Statement columns are used)

In [7]:
# Columns kept for this condition: the target label and the statement text.
cols_select = ['Label','Statement']
# Take an explicit copy: new columns are added to df_train_S further down, and
# doing that on a bare selection of df_train is a chained assignment
# (SettingWithCopyWarning) whose effect depends on the pandas version.
df_train_S = df_train[cols_select].copy()

In [8]:
# Preview the Label/Statement subset.
df_train_S.head()

Unnamed: 0,Label,Statement
0.0,false,says the annies list political group supports ...
1.0,half-true,when did the decline of coal start it started...
2.0,mostly-true,hillary clinton agrees with john mccain by vo...
3.0,false,health care reform legislation is likely to ma...
4.0,half-true,the economic turnaround started at the end of ...


In [9]:
# Transform each Statement into a list of unigram tokens.
# The tokenizer is built with the pattern \w+, i.e. each token is a run of
# word characters.
tokenizer = RegexpTokenizer(r'\w+')
# Store the token list of every statement in a new "Unigrams" column.
df_train_S["Unigrams"] = df_train_S["Statement"].apply(tokenizer.tokenize)
df_train_S.head()

Unnamed: 0,Label,Statement,Unigrams
0.0,false,says the annies list political group supports ...,"[says, the, annies, list, political, group, su..."
1.0,half-true,when did the decline of coal start it started...,"[when, did, the, decline, of, coal, start, it,..."
2.0,mostly-true,hillary clinton agrees with john mccain by vo...,"[hillary, clinton, agrees, with, john, mccain,..."
3.0,false,health care reform legislation is likely to ma...,"[health, care, reform, legislation, is, likely..."
4.0,half-true,the economic turnaround started at the end of ...,"[the, economic, turnaround, started, at, the, ..."


In [10]:
# Build the vocabulary: flatten the per-statement token lists into one list,
# then keep the distinct tokens in sorted order.
allUnigrams = [token for tokens in df_train_S['Unigrams'] for token in tokens]
vocabulary = sorted(set(allUnigrams))
print("Vocabulary Size: " + str(len(vocabulary)))

Vocabulary Size: 12249


In [14]:
# UNCOMMENT the lines below to download the pretrained word2vec model with gensim's downloader and print the path it was saved to

# import gensim.downloader as api
# path = api.load("word2vec-google-news-300", return_path=True)
# print(path)

/home/kalit/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
