# Fake News Prediction By: Mridul Gulati

## Import the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re  #Regular Expression
from nltk.corpus import stopwords  #filter out filler words(stopwords)
from nltk.stem.porter import PorterStemmer   #Used to stem Root words
from sklearn.feature_extraction.text import TfidfVectorizer    #Create text as feature vectors
from sklearn.model_selection import train_test_split       #To split data into train and test
from sklearn.linear_model import LogisticRegression     #Binary classification
from sklearn.metrics import accuracy_score    #For evaluation of accuracy

In [2]:
import nltk
# Fetch the NLTK stopword corpus that stopwords.words() reads; per the log below
# this does nothing further when the package is already up to date.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/mg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stopwords.words("english") # Preview of the English stopwords that the stemming step filters out of the text

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## About the Dataset
* id: unique id for a news article
* title: the title of a news article
* author: author of the news article
* text: the text of the article; could be incomplete
* label: a label that marks the article as potentially unreliable
  * 1: unreliable
  * 0: reliable

## Load the Dataset

In [4]:
# Load the training articles from the dataset folder (path is relative to the notebook).
news_data = pd.read_csv("fake-news-dataset/train.csv")

In [5]:
# Remove the `id` column (a unique article identifier that is not used as a feature).
# Rebinding the result instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: the in-place version raised KeyError on a second
# execution because `id` had already been dropped. `axis=1` was redundant next to
# `columns=` and has been removed.
news_data = news_data.drop(columns="id", errors="ignore")

In [6]:
# Show the frame after dropping `id`; pandas truncates the rendering to the first and last rows.
news_data

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


### Data Cleaning

In [7]:
# (number of rows, number of columns) of the loaded data.
news_data.shape

(20800, 4)

In [8]:
# Count missing values per column to decide how they should be handled.
news_data.isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
# Work on a copy in which every missing value is an empty string, so the later
# string concatenation of author and text never produces NaN.
clean = news_data.fillna(value='')

### Merging Author and Text

In [10]:
# Combine the author name and the article body into a single text field,
# separated by one space; this is the text the model is trained on.
author_column = clean["author"]
text_column = clean["text"]
clean["Content"] = author_column + ' ' + text_column

In [11]:
# Inspect the frame with the new `Content` column appended.
clean

Unnamed: 0,title,author,text,label,Content
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Daniel J. Flynn Ever get the feeling your life...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss Videos 15 Civilians Killed In ...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Print \nAn Iranian woman has be...
...,...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Jerome Hudson Rapper T. I. unloaded on black c...
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,Benjamin Hoffman When the Green Bay Packers lo...
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Michael J. de la Merced and Rachel Abrams The ...
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"Alex Ansary NATO, Russia To Hold Parallel Exer..."


## Stemming
Reducing words to their root form (stem).

e.g. acting, acted, acts -> act

In [12]:
# Single Porter stemmer instance, reused by stemming() for every token.
port_stem = PorterStemmer()

In [13]:
# Anything that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Filled on first use by _english_stopwords(); kept at notebook level so the
# stopword list is converted to a set only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stopwords as a frozenset, building it on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def stemming(content):
    """Normalise one document into a space-joined string of Porter stems.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stopwords are dropped, and
    the remaining tokens are stemmed with the notebook-level ``port_stem``.

    Previously ``stopwords.words("english")`` was evaluated inside the
    comprehension condition, i.e. the whole list was rebuilt and scanned
    linearly for every token. The cached frozenset gives the same result
    with a constant-time membership test.

    Parameters
    ----------
    content : str
        Raw text of a single document.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces
        (an empty string when no token survives).
    """
    stop_words = _english_stopwords()
    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(port_stem.stem(token) for token in tokens if token not in stop_words)

In [14]:
# Build the stemmed text from the untouched author/text columns rather than from
# clean["Content"] itself. The previous self-referential form re-stemmed already
# stemmed text whenever the cell was run again, and Porter stemming is not
# idempotent. On a fresh run the result is identical.
# Keep this expression in sync with the "Merging Author and Text" cell.
clean["Content"] = (clean["author"] + ' ' + clean["text"]).apply(stemming)

In [15]:
# Check the stemmed, stopword-free text that will be vectorised.
clean["Content"]

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn ever get feel life circl rounda...
2        consortiumnew com truth might get fire octob t...
3        jessica purkiss video civilian kill singl us a...
4        howard portnoy print iranian woman sentenc six...
                               ...                        
20795    jerom hudson rapper unload black celebr met do...
20796    benjamin hoffman green bay packer lost washing...
20797    michael j de la merc rachel abram maci today g...
20798    alex ansari nato russia hold parallel exercis ...
20799    david swanson david swanson author activist jo...
Name: Content, Length: 20800, dtype: object

## Features(X) and Label(Y) separation

In [16]:
# Features: the preprocessed text of each article. Labels: 1 = unreliable, 0 = reliable.
# Both are pulled out of the frame as plain NumPy arrays.
Y = clean["label"].to_numpy()
X = clean["Content"].to_numpy()

In [17]:
# Array of preprocessed article texts (one string per article).
X

array(['darrel lucu hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel case know anthoni weiner sext teenag appar littl t

In [18]:
# Array of 0/1 labels aligned with X.
Y

array([1, 0, 1, ..., 0, 1, 1])

### Convert Text to Numbers

In [19]:
# Turn every document into a TF-IDF weighted term vector.
# fit_transform() learns the vocabulary / IDF weights and builds the matrix in a
# single pass; the previous fit() followed by transform() tokenised the whole
# corpus twice for the same result.
# NOTE: X is rebound from the text array to the TF-IDF matrix, so re-run the
# feature/label separation cell before re-running this one.
# NOTE(review): the vectorizer is fitted on all documents before the train/test
# split, so the IDF statistics also see the test documents. Consider splitting
# first and fitting on the training texts only if an unbiased test score matters.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

## Splitting Train and Test Data

In [20]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the class ratio
# the same in both splits; the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

## Train the Model: Logistic Regression

In [21]:
# Logistic-regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [22]:
# Fit the classifier on the training split only.
model.fit(X_train, Y_train)

## Evaluation

In [23]:
# Accuracy on the training split.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels are
# passed first. The score itself is unchanged because accuracy is symmetric, but
# the original (y_pred, y_true) order would give wrong results if copied to an
# asymmetric metric.
X_train_prediction = model.predict(X_train)
train_accuracy = accuracy_score(Y_train, X_train_prediction)

In [24]:
# Fraction of training samples whose predicted label matches the true label.
train_accuracy

0.9764423076923077

In [25]:
# Accuracy on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels are
# passed first. The score itself is unchanged because accuracy is symmetric.
X_test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, X_test_prediction)

In [26]:
# Fraction of held-out test samples whose predicted label matches the true label.
test_accuracy

0.9454326923076923

## Exporting the Model for Deployment

In [27]:
import pickle

In [28]:
# Persist the fitted classifier and the fitted vectorizer for deployment.
# The `with` blocks make sure each file is flushed and closed even if pickling
# fails; the previous bare open(...) calls never closed their handles.
with open("model.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)
with open("vector.pkl", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)