In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Read the labelled training set from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# Count the missing values in each column.
# NOTE(review): this cell's execution count (7) is later than the dropna cell's (6),
# so the all-zero output shown describes the frame after incomplete rows were removed.
df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [5]:
# Number of rows currently in the frame.
df.shape[0]

20800

In [6]:
# Drop every row that has at least one missing value (df is modified in place).
df.dropna(axis=0, how='any', inplace=True)

In [8]:
# Map the numeric label codes to readable names:
# 0 means reliable ("Real News"), 1 means unreliable ("Fake News").
label_map = {
    0: "Real News",
    1: "Fake News"
}

# Values that are not keys of label_map (for example labels already converted by an
# earlier run of this cell) pass through unchanged instead of becoming NaN, so the
# cell can be re-run safely.
df['label'] = df['label'].map(lambda code: label_map.get(code, code))

In [9]:
# Class balance of the target variable: number of rows per label.
df.loc[:, 'label'].value_counts()

Real News    10361
Fake News     7924
Name: label, dtype: int64

In [10]:
# Number of distinct (non-missing) authors.
df['author'].nunique(dropna=True)

3838

In [11]:
# The last 100 entries of the per-author article counts (value_counts sorts
# descending, so these are the least frequent authors).
df['author'].value_counts().iloc[-100:]

Tim O'neil                             1
Emma-Kate Symons                       1
Joyce Lau                              1
The Hill                               1
Robert Pear and Reed Abelson           1
                                      ..
Tyler Hicks                            1
Nick Wingfield and Katie Benner        1
Frances Robles and Timothy Williams    1
Go Trump (UID 43400051)                1
administrator                          1
Name: author, Length: 100, dtype: int64

In [12]:
# Remove the author column from the frame (in place).
df.drop(columns='author', inplace=True)

In [13]:
# The id column is considered irrelevant to the analysis, so remove it (in place).
df.drop(columns='id', inplace=True)

In [14]:
# The title is dropped on the assumption that the article text already covers it.
df.drop(columns='title', inplace=True)

In [15]:
# Preview the frame now that only the text and label columns remain.
df.head(n=5)

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Fake News
1,Ever get the feeling your life circles the rou...,Real News
2,"Why the Truth Might Get You Fired October 29, ...",Fake News
3,Videos 15 Civilians Killed In Single US Airstr...,Fake News
4,Print \nAn Iranian woman has been sentenced to...,Fake News


In [16]:
#Cleaning up the text

#1) Removing punctuation
#2) Removing the extra white space left behind once the punctuation has been removed
#3) Removing stopwords
#4) Stemming the words, which reduces each word to its root (stem) form

In [17]:
# Take the first article's text as a sample for trying out the cleaning steps.
first_text = df['text'].iat[0]

In [18]:
#REMOVING PUNCTUATION AND EXTRA WHITE SPACES
import string
# string.punctuation contains no whitespace, so this is a one-element list that
# holds the whole ASCII punctuation string.
punctuations = [string.punctuation]

In [19]:
# Add the typographic (curly) apostrophe, which string.punctuation does not contain.
punc = '’'
punctuations.extend([punc])

In [20]:
# Collapse the list of punctuation chunks into one flat string of characters.
punctuations = ''.join(map(str, punctuations))

In [21]:
# Display the combined punctuation string.
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’'

In [22]:
# Keep, in order, every character of the sample article that is not punctuation.
text1 = list(filter(lambda ch: ch not in punctuations, first_text))

In [23]:
# Display the filtered characters (the list holds one element per character).
text1

['H',
 'o',
 'u',
 's',
 'e',
 ' ',
 'D',
 'e',
 'm',
 ' ',
 'A',
 'i',
 'd',
 'e',
 ' ',
 'W',
 'e',
 ' ',
 'D',
 'i',
 'd',
 'n',
 't',
 ' ',
 'E',
 'v',
 'e',
 'n',
 ' ',
 'S',
 'e',
 'e',
 ' ',
 'C',
 'o',
 'm',
 'e',
 'y',
 's',
 ' ',
 'L',
 'e',
 't',
 't',
 'e',
 'r',
 ' ',
 'U',
 'n',
 't',
 'i',
 'l',
 ' ',
 'J',
 'a',
 's',
 'o',
 'n',
 ' ',
 'C',
 'h',
 'a',
 'f',
 'f',
 'e',
 't',
 'z',
 ' ',
 'T',
 'w',
 'e',
 'e',
 't',
 'e',
 'd',
 ' ',
 'I',
 't',
 ' ',
 'B',
 'y',
 ' ',
 'D',
 'a',
 'r',
 'r',
 'e',
 'l',
 'l',
 ' ',
 'L',
 'u',
 'c',
 'u',
 's',
 ' ',
 'o',
 'n',
 ' ',
 'O',
 'c',
 't',
 'o',
 'b',
 'e',
 'r',
 ' ',
 '3',
 '0',
 ' ',
 '2',
 '0',
 '1',
 '6',
 ' ',
 'S',
 'u',
 'b',
 's',
 'c',
 'r',
 'i',
 'b',
 'e',
 ' ',
 'J',
 'a',
 's',
 'o',
 'n',
 ' ',
 'C',
 'h',
 'a',
 'f',
 'f',
 'e',
 't',
 'z',
 ' ',
 'o',
 'n',
 ' ',
 't',
 'h',
 'e',
 ' ',
 's',
 't',
 'u',
 'm',
 'p',
 ' ',
 'i',
 'n',
 ' ',
 'A',
 'm',
 'e',
 'r',
 'i',
 'c',
 'a',
 'n',
 ' ',
 'F',
 'o'

In [24]:
# Stitch the surviving characters back into a single string and display it.
text2 = str.join('', text1)
text2

'House Dem Aide We Didnt Even See Comeys Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30 2016 Subscribe Jason Chaffetz on the stump in American Fork Utah  image courtesy Michael Jolley available under a Creative CommonsBY license \nWith apologies to Keith Olbermann there is no doubt who the Worst Person in The World is this week–FBI Director James Comey But according to a House Democratic aide it looks like we also know who the secondworst person is as well It turns out that when Comey sent his nowinfamous letter announcing that the FBI was looking into emails that may be related to Hillary Clintons email server the ranking Democrats on the relevant committees didnt hear about it from Comey They found out via a tweet from one of the Republican committee chairmen \nAs we now know Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence Judiciary and Oversight committees that his agency was reviewing emails it had recently disc

In [25]:
# Characters deleted outright: ASCII punctuation plus typographic quotes/apostrophes.
# The opening quotes (‘ “) are listed alongside the closing ones so quoted text is
# stripped on both sides.
_PUNCT_TO_DELETE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’“”'
# En and em dashes separate words or clauses, so they are turned into a space;
# deleting them would fuse the neighbouring words into one token.
_PUNCT_TO_SPACE = '–—'
_CLEANING_TABLE = str.maketrans(
    {**dict.fromkeys(_PUNCT_TO_DELETE), **dict.fromkeys(_PUNCT_TO_SPACE, ' ')}
)


def cleaning_text(text):
    """Lower-case ``text``, strip punctuation and normalise whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lower-cased text with ASCII punctuation and curly quotes removed,
        en/em dashes replaced by a space, and every run of whitespace (including
        newlines) collapsed to a single space with no leading/trailing space.

    Notes
    -----
    ASCII hyphens and apostrophes are deleted rather than replaced, so
    ``"second-worst"`` becomes ``"secondworst"`` and ``"didn't"`` becomes ``"didnt"``.
    """
    # One C-level pass handles both the deletions and the dash -> space mapping.
    text = text.lower().translate(_CLEANING_TABLE)
    # split() with no argument drops all whitespace runs; join rebuilds with single spaces.
    return ' '.join(text.split())

In [26]:
# Clean every article's text, element by element.
df['text'] = df['text'].map(cleaning_text)

In [27]:
# Preview the frame after the text has been cleaned.
df.head(n=5)

Unnamed: 0,text,label
0,house dem aide we didnt even see comeys letter...,Fake News
1,ever get the feeling your life circles the rou...,Real News
2,why the truth might get you fired october 29 2...,Fake News
3,videos 15 civilians killed in single us airstr...,Fake News
4,print an iranian woman has been sentenced to s...,Fake News


In [28]:
from nltk.corpus import stopwords

In [29]:
# Display NLTK's English stopword list.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [30]:
#Removing punctuation from the stopwords so they match the punctuation-free article text
# The corpus reader is imported here under its own alias because this cell rebinds the
# name `stopwords` to a plain list: calling stopwords.words(...) on that list would
# raise AttributeError the second time the cell is run. The name `stopwords` still
# ends up bound to the list of normalised words.
from nltk.corpus import stopwords as nltk_stopwords

_strip_punct = str.maketrans('', '', string.punctuation)
stopwords = [
    stripped
    for stripped in (word.translate(_strip_punct) for word in nltk_stopwords.words('english'))
    if stripped  # skip anything that was punctuation only
]
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'youre',
 'youve',
 'youll',
 'youd',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'shes',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'thatll',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'fe

In [31]:
#Stop words are very frequent and are treated here as carrying no useful signal,
#so they are removed before any model is built
_DEFAULT_STOPWORDS = None  # filled on first use, then reused by every later call


def _default_stopwords():
    """Return NLTK's English stopwords with ASCII punctuation removed, as a frozenset."""
    global _DEFAULT_STOPWORDS
    if _DEFAULT_STOPWORDS is None:
        import string
        from nltk.corpus import stopwords as nltk_stopwords
        strip_punct = str.maketrans('', '', string.punctuation)
        _DEFAULT_STOPWORDS = frozenset(
            word.translate(strip_punct) for word in nltk_stopwords.words('english')
        )
    return _DEFAULT_STOPWORDS


def removing_stopwords(text, stop_words=None):
    """Remove stopwords from a whitespace-separated string.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace and matched token by token
        (case-sensitively).
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stopword list with ASCII
        punctuation stripped (e.g. ``"you're"`` -> ``"youre"``) is used; it is
        loaded once and cached rather than being rebuilt for every call.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _default_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership keeps the per-token check constant time.
        stop_words = frozenset(stop_words)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [32]:
# Strip the stopwords out of every article's text.
df['text'] = df['text'].map(removing_stopwords)

In [33]:
#STEMMING (each word is reduced to its stem; this is not lemmatization)
def stemming(text, stemmer=None):
    """Stem every whitespace-separated word of ``text``.

    Parameters
    ----------
    text : str
        Text to process; it is split on whitespace.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. When omitted, a new NLTK ``PorterStemmer`` is created.
        Passing one shared instance avoids constructing a stemmer for every call
        and allows a different stemming algorithm to be used.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    if stemmer is None:
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

In [34]:
# Stem every word of every article's text.
df['text'] = df['text'].map(stemming)

In [35]:
#Checking for articles whose text is an empty string
# df.itertuples() yields (index, <columns in frame order>); with the columns ordered
# (text, label) as in the preview above, unpacking it as (i, lb, text) tests the label
# instead of the article text. Iterating the 'text' column directly does not depend
# on column order.
blanks = [
    idx
    for idx, text in df['text'].items()
    if isinstance(text, str) and text == ''
]

blanks

[]

In [36]:
# Checking for articles whose text consists only of whitespace.
# df.itertuples() yields (index, <columns in frame order>); with the columns ordered
# (text, label), unpacking it as (i, lb, text) tests the label instead of the article
# text. Iterating the 'text' column directly does not depend on column order.
blanks = [
    idx
    for idx, text in df['text'].items()
    if isinstance(text, str) and text.isspace()
]

blanks

[]

In [37]:
# Share of each label, as a percentage of all rows in the frame.
df['label'].value_counts().div(len(df)).mul(100)

Real News    56.663932
Fake News    43.336068
Name: label, dtype: float64

In [None]:
#MODEL DEVELOPMENT