In [7]:
import numpy as np# to manipulate data to numeric
import re# for text searching words
import pandas as pd
from nltk.corpus import stopwords#natural language tool kit. stopwords are words not adding values to article
from nltk.stem.porter import PorterStemmer# uses root word for a particlular word
from sklearn.feature_extraction.text import TfidfVectorizer#used to convert text to feature vectors
from sklearn.model_selection import train_test_split#used to split data into train and test
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [11]:
import nltk
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
#loading data set to pandas DF
news=pd.read_csv('train.csv')
news.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [14]:
news.shape




(20800, 5)

In [16]:
news.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [17]:
#replacing NULL values with empty string
news=news.fillna('')

In [18]:
news.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [22]:
#merging the author name and news title
news['content']=news['author']+''+news['title']
print(news['content'])

0        Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1        Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom...
2        Consortiumnews.comWhy the Truth Might Get You ...
3        Jessica Purkiss15 Civilians Killed In Single U...
4        Howard PortnoyIranian woman jailed for fiction...
                               ...                        
20795    Jerome HudsonRapper T.I.: Trump a ’Poster Chil...
20796    Benjamin HoffmanN.F.L. Playoffs: Schedule, Mat...
20797    Michael J. de la Merced and Rachel AbramsMacy’...
20798    Alex AnsaryNATO, Russia To Hold Parallel Exerc...
20799               David SwansonWhat Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [23]:
#separating data labels
x=news.drop(columns='label', axis=1)
y=news['label']

Stemming

In [24]:
#stemming is the procedure of reducing to root words
port_stem=PorterStemmer()


In [26]:
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content)#sub is used to substitute. this removes numerals, punctuations and if present it will be replaced by ""
    stemmed_content=stemmed_content.lower()#converted to lower case letter
    stemmed_content=stemmed_content.split()#split and store in a list
    '''Below is a list compression. This says that for each word in a list of words in 
    stemmed_content return the root word if the word is not in stopwords'''
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    '''all the words are then joined but separated by and empty string'''
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content

In [27]:
#separating the data and label
x=news['content'].values
y=news['label']


In [28]:
print(x)

['Darrell LucusHouse Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'
 'Daniel J. FlynnFLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'
 'Consortiumnews.comWhy the Truth Might Get You Fired' ...
 'Michael J. de la Merced and Rachel AbramsMacy’s Is Said to Receive Takeover Approach by Hudson’s Bay - The New York Times'
 'Alex AnsaryNATO, Russia To Hold Parallel Exercises In Balkans'
 'David SwansonWhat Keeps the F-35 Alive']


In [29]:
#use vectorizer to convert the textual data to numerical data
vectorizer=TfidfVectorizer()#Term frequency inverse doc fre. it counts the number of times a word appears
vectorizer.fit(x)#words here are converted to its respective feature
x=vectorizer.transform(x)


In [30]:
#splitting the data set into training and test data
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.2, random_state=2)

In [32]:
#Training the logisticRegression model
model=LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression()

In [33]:
#EVALUATION OF THE MODEL
x_train_prediction=model.predict(x_train)
training_data_accuracy=accuracy_score(x_train_prediction,y_train)

In [34]:
print('the accuracy of the model is:',training_data_accuracy)

the accuracy of the model is: 0.9720552884615384


In [35]:
x_test_prediction=model.predict(x_test)
testing_data_accuracy=accuracy_score(x_test_prediction,y_test)
print('the accuracy of the model is:',testing_data_accuracy)

the accuracy of the model is: 0.9548076923076924


In [42]:
#Making a predictive system
x_new=x_test[885]
prediction=model.predict(x_new)
#print(prediction)
if prediction[0]==1:
    print('the news is fake')
else:
    print('the news is Legit')

the news is Legit


In [43]:
import pickle 
pickle_out=open('model.pkl','wb')
pickle.dump(model,pickle_out)
pickle_out.close()
