In [197]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# # The following line is needed to show plots inline in notebooks
# %matplotlib inline 
# from sklearn.preprocessing import StandardScaler, LabelBinarizer
# from sklearn.linear_model import LogisticRegression

# from sklearn.model_selection import KFold
# from sklearn.metrics import make_scorer, confusion_matrix
# from sklearn.model_selection import learning_curve
# import seaborn as sns
# sns.set_style('whitegrid')
# import statsmodels.api as sm
# from sklearn.linear_model import LinearRegression
# from sklearn.feature_selection import RFE
# import warnings
# warnings.filterwarnings("ignore")

# # Import the necessary libraries first
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# from sklearn.decomposition import PCA

In [177]:
# `abspath` of a bare file name resolves against the current working
# directory, so `basePath` is the directory the kernel is running in.
basePath = os.path.dirname(os.path.abspath("train.json"))
# Label encoding -- 0: false, 1: partly true, 2: true
# Read through a context manager so the file handle is closed as soon as the
# JSON has been parsed instead of being left open for the kernel's lifetime.
with open(basePath + "/train/train.json", "r", encoding="utf8") as train_file:
    claim = pd.read_json(train_file)
claim.head(5)

Unnamed: 0,claim,claimant,date,id,label,related_articles
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,0,"[122094, 122580, 130685, 134765]"
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,1,2,"[106868, 127320, 128060]"
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,4,1,"[132130, 132132, 149722]"
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,5,2,"[123254, 123418, 127464]"
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,6,2,"[41099, 89899, 72543, 82644, 95344, 88361]"


In [178]:
# Path prefix (with trailing slash) under which the article .txt files live.
txtPath = "".join((basePath, "/train/train_articles/"))

In [179]:
def appendArticles(articleList, basePath):
    """Concatenate the text of several article files into a single string.

    Each article is read from ``basePath + str(articleNumber) + ".txt"`` and
    is followed by a ``";"`` separator. Articles end up in the *reverse* of
    their order in ``articleList`` (each newly read article is placed in
    front of the ones read before it).

    Parameters
    ----------
    articleList : iterable
        Article identifiers; each one is converted with ``str()`` to build
        the file name.
    basePath : str
        Path prefix. It is joined by plain string concatenation, so it must
        already end with a path separator.

    Returns
    -------
    str
        The joined article texts, or ``""`` when ``articleList`` is empty.

    Raises
    ------
    OSError
        If an article file cannot be opened (e.g. ``FileNotFoundError``).
    """
    pieces = []
    for articleNumber in articleList:
        # Explicit UTF-8 (matching how train.json is read) so decoding does
        # not depend on the platform's default locale encoding; `with`
        # guarantees the handle is closed even if read() raises.
        with open(basePath + str(articleNumber) + ".txt", "r", encoding="utf8") as f:
            pieces.append(f.read() + ";")
    # Collect-then-join avoids quadratic string copying; reversing keeps the
    # "latest article first" order produced by prepending.
    pieces.reverse()
    return "".join(pieces)

In [180]:
# For every claim, read and concatenate the text of its related articles.
claim['articleText'] = claim['related_articles'].map(
    lambda article_ids: appendArticles(article_ids, txtPath)
)

In [181]:
def assignLength(row, colName):
    """Return ``len()`` of the value stored under ``colName`` in ``row``.

    ``row`` may be anything that supports ``row[colName]`` (for example a
    pandas Series representing one DataFrame row, or a dict).
    """
    cell = row[colName]
    return len(cell)

In [183]:
# Length (in characters) of each claim's concatenated article text.
claim['articleLength'] = claim.apply(assignLength, axis=1, args=('articleText',))
claim.head(3)

Unnamed: 0,claim,claimant,date,id,label,related_articles,articleText,articleLength
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,0,"[122094, 122580, 130685, 134765]",1984 by George Orwell\n1984 is a dystopian nov...,7043
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,1,2,"[106868, 127320, 128060]",Maine candidate apologizes after calling Parkl...,9447
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,4,1,"[132130, 132132, 149722]",About Kennedy Space Center Visitor Complex\nDe...,16891


In [184]:
# Separate the target from the features. The guard keeps this cell
# re-runnable: after the first execution `claim` no longer has a `label`
# column, and an unguarded `claim['label']` would raise KeyError on a re-run.
if 'label' in claim.columns:
    y = claim['label']
    # Drop the `label` column
    claim = claim.drop("label", axis=1)
# Make training and test sets (33% held out; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    claim['articleText'], y, test_size=0.33, random_state=53
)

In [185]:
# Bag-of-words encoder: raw token counts with English stop words removed.
count_vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary on the training split only and encode that split.
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test split with the vocabulary learned from the training split.
count_test = count_vectorizer.transform(X_test)

In [186]:
# Show the last ten vocabulary terms learned by `count_vectorizer`.
# `get_feature_names()` was removed in scikit-learn 1.2; its replacement
# `get_feature_names_out()` (scikit-learn >= 1.0) returns an ndarray, so wrap
# the slice in `list` to keep the printed form a plain list of strings.
print(list(count_vectorizer.get_feature_names_out()[-10:]))

['ﻼد', 'ﻼﺗﻪ', '𝑩𝒓𝒂𝒕𝒎𝒂𝒏', '𝑫𝒐𝒖𝒈', '𝔸𝕡𝕖𝕝', '𝕁ℙ', '𝕄𝕔𝔾𝕝𝕠𝕟𝕖', '𝕋𝕙𝕖𝕣𝕖𝕤𝕖', '𝖑𝖎𝖑𝖆', '𝖗𝖔𝖘𝖎𝖊']


In [187]:
# TF-IDF encoder: English stop words removed, and terms that occur in more
# than 70% of the training documents ignored (max_df=0.7).
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")

# Learn vocabulary and IDF weights on the training split and encode it.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Encode the test split with the vocabulary/IDF learned from the training split.
tfidf_test = tfidf_vectorizer.transform(X_test)

In [188]:
# Show the last ten vocabulary terms learned by `tfidf_vectorizer`.
# `get_feature_names()` was removed in scikit-learn 1.2; its replacement
# `get_feature_names_out()` (scikit-learn >= 1.0) returns an ndarray, so wrap
# the slice in `list` to keep the printed form a plain list of strings.
print(list(tfidf_vectorizer.get_feature_names_out()[-10:]))

['ﻼد', 'ﻼﺗﻪ', '𝑩𝒓𝒂𝒕𝒎𝒂𝒏', '𝑫𝒐𝒖𝒈', '𝔸𝕡𝕖𝕝', '𝕁ℙ', '𝕄𝕔𝔾𝕝𝕠𝕟𝕖', '𝕋𝕙𝕖𝕣𝕖𝕤𝕖', '𝖑𝖎𝖑𝖆', '𝖗𝖔𝖘𝖎𝖊']


In [None]:
# Dense DataFrame view of the count matrix (one column per vocabulary term).
# WARNING: this materialises a documents x vocabulary dense array, which can
# need a very large amount of memory for a big vocabulary.
# `.toarray()` is the documented way to densify a SciPy sparse matrix, and
# `get_feature_names_out()` replaces `get_feature_names()`, which was removed
# in scikit-learn 1.2.
count_df = pd.DataFrame(
    count_train.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

In [None]:
# Dense DataFrame view of the TF-IDF matrix (one column per vocabulary term).
# WARNING: this materialises a documents x vocabulary dense array, which can
# need a very large amount of memory for a big vocabulary.
# `.toarray()` is the documented way to densify a SciPy sparse matrix, and
# `get_feature_names_out()` replaces `get_feature_names()`, which was removed
# in scikit-learn 1.2.
tfidf_df = pd.DataFrame(
    tfidf_train.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [None]:
# Terms that are columns of the count frame but not of the TF-IDF frame.
difference = set(count_df.columns).difference(tfidf_df.columns)
difference

In [None]:
# Print whether the two feature frames are identical.
# NOTE(review): one frame holds term counts and the other TF-IDF weights, so
# this presumably prints False -- confirm what this check is meant to show.
print(count_df.equals(tfidf_df))

Bag-of-words features

In [201]:
# New, unfitted multinomial naive Bayes classifier (constructor defaults).
clf = MultinomialNB() 

In [203]:
# Train on the bag-of-words counts, predict the held-out split, and report
# plain accuracy. `fit` returns the estimator, so the calls can be chained.
pred = clf.fit(count_train, y_train).predict(count_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.580


TF-IDF features

In [193]:
# New, unfitted multinomial naive Bayes classifier (constructor defaults);
# rebinding `clf` replaces whatever estimator the name referred to before.
clf = MultinomialNB() 

In [200]:
# Train on the TF-IDF features, predict the held-out split, and report
# plain accuracy. `fit` returns the estimator, so the calls can be chained.
pred = clf.fit(tfidf_train, y_train).predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.590
