In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\John
[nltk_data]     Daison\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
#loading the data
train_df = pd.read_csv('fake_news_data.csv')
print(train_df)

       Unnamed: 0                                              title  \
0               0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1               1                                                NaN   
2               2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3               3  Bobby Jindal, raised Hindu, uses story of Chri...   
4               4  SATAN 2: Russia unvelis an image of its terrif...   
...           ...                                                ...   
72129       72129  Russians steal research on Trump in hack of U....   
72130       72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131       72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132       72132  Trump tussle gives unpopular Mexican leader mu...   
72133       72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  label  
0      No comment is expected from Barack Obama Membe...      1  
1  

In [9]:
train_df.shape

(72134, 4)

In [11]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [13]:
#finding the null values
train_df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [15]:
train_df = train_df.fillna('')

In [17]:
train_df['label'].value_counts()

label
1    37106
0    35028
Name: count, dtype: int64

In [19]:
#seperating data and label
X = train_df['title']
Y = train_df['label']
print(X)
print(Y)

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1                                                         
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: title, Length: 72134, dtype: object
0        1
1        1
2        1
3        0
4        1
        ..
72129    0
72130    1
72131    0
72132    0
72133    1
Name: label, Length: 72134, dtype: int64


In [21]:
port_stem = PorterStemmer()

In [23]:
def stemming(content):
  stemmed_content = re.sub('^[a-zA-Z]','',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [25]:
train_df['title'] = train_df['title'].apply(stemming)
print(train_df['title'])

0        aw enforc high alert follow threat cop white 9...
1                                                         
2        nbelievable! obama’ attorney gener say charlot...
3        obbi jindal, rais hindu, use stori christian c...
4        atan 2: russia unv imag terrifi new ‘supernuke...
                               ...                        
72129    ussian steal research trump hack u.s. democrat...
72130    watch: giuliani demand democrat apolog trump’ ...
72131          igrant refus leav train refuge camp hungari
72132    rump tussl give unpopular mexican leader much-...
72133            oldman sach endors hillari clinton presid
Name: title, Length: 72134, dtype: object


In [27]:
X = train_df['title'].values
Y = train_df['label'].values

In [29]:
print(X)
print(Y)

['aw enforc high alert follow threat cop white 9-11bi #blacklivesmatt #fyf911 terrorist [video]'
 ''
 'nbelievable! obama’ attorney gener say charlott rioter “peaceful” protesters…in home state north carolina [video]'
 ... 'igrant refus leav train refuge camp hungari'
 'rump tussl give unpopular mexican leader much-need shot arm'
 'oldman sach endors hillari clinton presid']
[1 1 1 ... 0 0 1]


In [31]:
#converting text to numerical values
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [33]:
print(X)

  (0, 65)	0.4184957343244852
  (0, 1357)	0.28029198808178785
  (0, 2768)	0.3224007143353423
  (0, 3629)	0.29173462450786397
  (0, 6093)	0.2177372893957125
  (0, 8917)	0.2794827852086318
  (0, 10480)	0.2504470552217772
  (0, 10866)	0.4184957343244852
  (0, 12359)	0.23335552617663718
  (0, 26290)	0.21855391676795904
  (0, 26447)	0.22160638537694238
  (0, 28442)	0.11251572901565599
  (0, 29028)	0.17284112080960268
  (2, 2620)	0.249196527783874
  (2, 4704)	0.2674389193715911
  (2, 5044)	0.32793245616478245
  (2, 11046)	0.23633426073326186
  (2, 12567)	0.2353723116724973
  (2, 13379)	0.23342817475346714
  (2, 17693)	0.3755311330191333
  (2, 18123)	0.19947170421288052
  (2, 18424)	0.15719224489672418
  (2, 19994)	0.35319820314687234
  (2, 21191)	0.3107593001974897
  (2, 22745)	0.31838377492067915
  :	:
  (72130, 26997)	0.140989652532695
  (72130, 28796)	0.2710510754570976
  (72131, 4564)	0.3649224358246169
  (72131, 12848)	0.47276794201645
  (72131, 13144)	0.45607567393300946
  (72131, 15272

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [37]:
#Training the model
model = LogisticRegression()
model.fit(X_train, Y_train)

In [39]:
#Evaluation on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9285008751104719


In [41]:
#Evaluation on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9046925902821099


In [43]:
X_new = X_test[4]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [45]:
print(Y_test[4])

0


In [47]:
import pickle

with open('fake_news_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [49]:
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [51]:
def load_model_and_vectorizer():
    with open('fake_news_model.pkl', 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)
    return loaded_model, loaded_vectorizer

In [53]:
loaded_model, loaded_vectorizer = load_model_and_vectorizer()

In [55]:
def preprocess_and_predict(text):
    stemmed_text = stemming(text)
    text_vectorized = loaded_vectorizer.transform([stemmed_text]).toarray()
    prediction = loaded_model.predict(text_vectorized)
    return 'Fake' if prediction[0] == 1 else 'Real'

In [81]:
new_text = train_df['title'][67]
print(preprocess_and_predict(new_text))

Real
