In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import re
import nltk
from nltk.stem import *
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import recall_score # not used
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [2]:
# The training data is in the same directory as the program but not in git
# get the dataset from https://www.kaggle.com/competitions/fake-news/data
raw_news_data = pd.read_csv("train.csv")

In [3]:
# Fill the lack of a certain feature with an empty string

data = raw_news_data.fillna("")
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Next we stem the titles and authors
# For stemming look https://www.nltk.org/howto/stem.html
# Also if pip does not install stopwords, run nltk.download("stopwords")

stemmer = SnowballStemmer("english") # Note the language

def stem_title(title):
    stemmed = re.sub('[^a-zA-Z]',' ', title) # remove special characters
    stemmed_list = stemmed.lower().split() # lower and split
    stemmed_list = [stemmer.stem(i) for i in stemmed_list if not i in stopwords.words("english")] # remove common words
    stemmed_title = " ".join(stemmed_list) # make a string
    return stemmed_title

def stem_author(author):
    stemmed = re.sub('[^a-zA-Z]',' ', author)
    stemmed_author = " ".join(stemmed.lower().split())
    return stemmed_author

data["title"] = data["title"].apply(stem_title)
data["author"] = data["author"].apply(stem_author)
data.head()

Unnamed: 0,id,title,author,text,label
0,0,hous dem aid even see comey letter jason chaff...,darrell lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,flynn hillari clinton big woman campus breitbart,daniel j flynn,Ever get the feeling your life circles the rou...,0
2,2,truth might get fire,consortiumnews com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,civilian kill singl us airstrik identifi,jessica purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,iranian woman jail fiction unpublish stori wom...,howard portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
# Creating the training data by concatting title and author strings, easier to transform 
data["training"] = data["title"] + " " + data["author"]
data.head()

Unnamed: 0,id,title,author,text,label,training
0,0,hous dem aid even see comey letter jason chaff...,darrell lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,hous dem aid even see comey letter jason chaff...
1,1,flynn hillari clinton big woman campus breitbart,daniel j flynn,Ever get the feeling your life circles the rou...,0,flynn hillari clinton big woman campus breitba...
2,2,truth might get fire,consortiumnews com,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fire consortiumnews com
3,3,civilian kill singl us airstrik identifi,jessica purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,civilian kill singl us airstrik identifi jessi...
4,4,iranian woman jail fiction unpublish stori wom...,howard portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jail fiction unpublish stori wom...


In [6]:
# Dividing the features and labels
X = data["training"].values
y = data["label"].values

# Sanity check
X.shape == y.shape

True

In [7]:
vec = TfidfVectorizer() # Vectorize the textual features to a feature vector
vec.fit(X)
X = vec.transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=239)

In [8]:
# Training and predicting using Logistic Regression in stage 1

lr = LogisticRegression(solver="liblinear")
lr.fit(X_train, y_train)
svc = svm.SVC()
svc.fit(X_train, y_train)

In [9]:
# Validate the model using 7-fold cross validation
lr_scores = cross_validate(lr, X_train, y_train, scoring='precision_macro', cv=7,return_estimator=True)
svc_scores = cross_validate(svc, X_train, y_train, scoring='precision_macro', cv=7,return_estimator=True)
print(lr_scores['test_score'])
print(svc_scores['test_score'])
lr_y_pred = lr.predict(X_val)
svc_y_pred = svc.predict(X_val)


print(f"The accuracy of the logistic regression model is {accuracy_score(y_val, lr_y_pred)}")
print(f"The accuracy of the support vector machine model is {accuracy_score(y_val, svc_y_pred)}")

[0.97073808 0.97130583 0.96809511 0.97063757 0.97674193 0.97626366
 0.97984458]
[0.98486329 0.98627748 0.9816024  0.98485481 0.98896858 0.99005195
 0.99244052]
The accuracy of the logistic regression model is 0.9705128205128205
The accuracy of the support vector machine model is 0.9854166666666667


In [10]:
confusion_matrix(y_val, lr_y_pred)
# From the confusion matrix and accuracy_score we can deduce 
# that logistic regression gives good predictions about fake news

array([[3003,  154],
       [  30, 3053]], dtype=int64)

In [11]:
confusion_matrix(y_val, svc_y_pred)

array([[3090,   67],
       [  24, 3059]], dtype=int64)