In [1]:
# importing the dependencies
import numpy as np          # for making numpy arrays
import pandas as pd         # for creating the dataframes and storing the data in the data frame
import re                   # reg exp for searching words in text or paragraphs
from nltk.corpus import stopwords           # stopwords - those words which doesn't add much value to a paragraph vertex eg. a, an, were
from nltk.stem.porter import PorterStemmer  # stemming - takes a word and removes prefix and suffix of the word and returns the root word
from sklearn.feature_extraction.text import TfidfVectorizer # which is used to convert text into feature vectors
from sklearn.model_selection import train_test_split        # used to split our data set into train data and test data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads from.
# When the corpus is already present locally the call only reports that it is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bonny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Stop words add little meaning to the text, so they are removed during pre-processing.
# Print the English stop-word list for reference.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# II. Data Pre-processing

# Read the training CSV into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it to run elsewhere.
TRAIN_CSV_PATH = 'D:\\DataFlair\\train.csv'
news_dataset = pd.read_csv(TRAIN_CSV_PATH)

In [5]:
# Dimensions of the DataFrame as (number of rows, number of columns)
news_dataset.shape

(20800, 5)

In [6]:
# Preview the first 5 rows of the DataFrame
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# Check for missing values: count the null entries in each column.
missing_per_column = news_dataset.isna().sum()
missing_per_column

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# With a large dataset the rows containing nulls could simply be dropped,
# but with a small dataset that would discard too much data.
# Here the rows are kept and every missing value is replaced with an empty string.

# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [9]:
# The prediction uses author, title and text together.
# Title and author alone would also work, but the extra text is expected to help accuracy.

# Build the 'content' column as "<author> <title> <text>"
author_col = news_dataset['author']
title_col = news_dataset['title']
text_col = news_dataset['text']
news_dataset['content'] = author_col + ' ' + title_col + ' ' + text_col

In [10]:
# Show the merged 'content' column (author, title and text joined by spaces)
print(news_dataset['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799    David Swanson What Keeps the F-35 Alive   Davi...
Name: content, Length: 20800, dtype: object


In [11]:
# Split the frame into the target ('label') and the features (every other column).
y = news_dataset['label']
x = news_dataset.drop(columns=['label'])

In [12]:
# Inspect the feature frame and the label series
print(x)
print(y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [13]:
# III. Stemming
# The process of reducing a word to its root form.
# NOTE(review): the Porter stemmer strips suffixes only; prefixes are left untouched.
# eg. (after lowercasing) killed --> kill, civilians --> civilian

port_stem = PorterStemmer()

In [14]:
# Build the stop-word set once. stopwords.words('english') returns a list, and
# evaluating and scanning that list for every single token makes the filter
# needlessly slow; a frozenset gives constant-time membership checks.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise one document: keep letters only, lowercase, drop stop words, stem.

    Args:
        content: raw text of a single document.

    Returns:
        A single string of stemmed, non-stop-word tokens joined by spaces.
    """
    # Replace every non-alphabetic character (digits, punctuation, ...) with a space.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', content)
    # Lowercase so differently-cased spellings map to the same token
    # (the stop-word list is lowercase as well), then split into words.
    tokens = letters_only.lower().split()
    # Stem every remaining word, skipping stop words.
    stemmed_tokens = [
        port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(stemmed_tokens)

In [15]:
# Run every document in the 'content' column through stemming() and store the result back.
# NOTE(review): this overwrites the merged raw text, so re-running the cell
# stems text that has already been stemmed.
stemmed_column = news_dataset['content'].apply(stemming)
news_dataset['content'] = stemmed_column

In [16]:
# Show the 'content' column after stemming
print(news_dataset['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2        consortiumnew com truth might get fire truth m...
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799    david swanson keep f aliv david swanson author...
Name: content, Length: 20800, dtype: object


In [17]:
# Pull the model inputs out of the DataFrame as NumPy arrays:
# x holds the stemmed documents, y holds the labels.
y = news_dataset['label'].to_numpy()
x = news_dataset['content'].to_numpy()

In [18]:
# Show the array of stemmed documents
print(x)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel 

In [19]:
# Show the label array
print(y)

[1 0 1 ... 0 1 1]


In [20]:
# x and y are what the model is trained on, so the text in x must first be
# converted into meaningful numbers (textual data -> numeric data).

# TF-IDF weights every word by how often it occurs in a document, scaled down by
# how common the word is across all documents.
# Tf  (Term Frequency) - The number of times a word appears in a document is its Term Frequency. A higher value means a term appears more often than others, and so, the document is a good match when the term is part of the search terms
# Idf (Inverse Document Frequency) - Words that occur many times in a document, but also occur many times in many others, may be irrelevant. IDF is a measure of how significant a term is in the entire corpus.
#       eg. when reviewing an Avengers movie, every review contains the word "avenger", so the word carries little meaning
#       So idf finds the words that repeat across many documents, treats them as not significant and reduces their weight
# The resulting weights form one feature vector per document.
# NOTE(review): the vectorizer is fitted on the whole dataset before the train/test
# split, so the vocabulary and IDF statistics also come from the test rows;
# consider fitting on the training rows only.
vectorizer = TfidfVectorizer()

# Read the documents from the DataFrame rather than from x, so this cell can be
# re-run without feeding the already-vectorized matrix back into the vectorizer.
# fit_transform learns the vocabulary/IDF and returns the document-term matrix,
# which is equivalent to calling fit() followed by transform().
x = vectorizer.fit_transform(news_dataset['content'].values)

In [21]:
# Show the TF-IDF matrix; it is printed as (row, column) entries with their weight
print(x)

  (0, 109752)	0.049158312425168854
  (0, 109697)	0.0190646711515277
  (0, 108742)	0.04416544119908134
  (0, 108738)	0.09477494042884232
  (0, 108695)	0.03758488097939004
  (0, 108658)	0.01130614774071694
  (0, 108007)	0.017092546683505856
  (0, 107190)	0.017105936674103112
  (0, 107099)	0.012543234221230963
  (0, 107013)	0.029126417104928328
  (0, 106934)	0.012863319680563097
  (0, 106734)	0.011771716334271506
  (0, 105884)	0.025727197929110487
  (0, 105848)	0.031296701378124764
  (0, 104837)	0.02153649554212262
  (0, 103422)	0.06544555398259812
  (0, 102736)	0.03314918847150756
  (0, 102485)	0.01639612818098454
  (0, 101717)	0.038071924979380216
  (0, 101077)	0.011082403436475742
  (0, 101067)	0.0432044670628921
  (0, 101014)	0.13602128375819167
  (0, 100866)	0.0713092337063475
  (0, 99577)	0.03944988916619374
  (0, 99009)	0.027120358929731154
  :	:
  (20799, 7470)	0.010635431711878486
  (20799, 7143)	0.02816704434978389
  (20799, 6848)	0.03959171777516513
  (20799, 6810)	0.0253655855

In [22]:
# The numeric features can now be fed to the machine learning model.

# IV. Splitting dataset into training and test data

# test_size=0.2  -> 80% of the rows are used for training and 20% for testing
# The labels belonging to x_train are returned as y_train; likewise y_test for x_test
# stratify=y     -> keep the class proportions of y the same in both splits
# random_state=2 -> fixes the shuffling so the split is reproducible; any integer works
split_options = dict(test_size=0.2, stratify=y, random_state=2)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [23]:
# V. Training The Model: Logistic Regression Model

# Create the classifier; no arguments are passed, so the library defaults apply
model = LogisticRegression()

In [24]:
# Train (fit) the logistic regression model on the training features and labels
model.fit(x_train, y_train)

LogisticRegression()

In [25]:
# VI. Evaluation

# finding its accuracy score
# The model predicts labels, and those predictions are compared with the true labels

# accuracy score on training data
x_train_prediction = model.predict(x_train)
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is symmetric,
# so the value is unchanged, but the documented order stays correct if another
# metric is substituted later.
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [26]:
# Report the accuracy measured on the training data
print(f'Accuracy Score of Training Data: {training_data_accuracy}')

Accuracy Score of Training Data: 0.9798677884615384


In [27]:
# The accuracy on the training data matters less than the accuracy on the test data:
# the model was fitted on the training data, whereas it has never seen the test data.
# So the accuracy score on the test data tells us how well the model really performs.
# eg. exams - questions you prepared for are like training data; questions you have not seen are like test data

x_test_prediction = model.predict(x_test)
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is symmetric,
# so the value is unchanged, but the documented order stays correct if another
# metric is substituted later.
x_test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [28]:
# Report the accuracy measured on the test data
print(x_test_data_accuracy)

0.9543269230769231


In [36]:
# VII. Making a Predictive System

# Classify one held-out sample and compare the prediction with its true label.
# A label of 0 is reported as real news, anything else as fake.
sample_index = 5
x_new = x_test[sample_index]
prediction = model.predict(x_new)
print(prediction)

actual_is_real = y_test[sample_index] == 0
print("The News is Real." if actual_is_real else "The News is Fake.")

predicted_is_real = prediction[0] == 0
print('The Predicted News is Real.' if predicted_is_real else 'The Predicted News is Fake.')

[1]
The News is Fake.
The Predicted News is Fake.
