In [406]:
# Importing all the necessary libraries

import spacy as sp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing.text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [248]:
# Load the spacy's large english model

nlp = spacy.load('en_core_web_sm')

<IPython.core.display.Javascript object>

### There are two different dataframes. It is necessary to work on these two datframes independently before merging them

In [407]:
# Loading the fake dataframe

fake = pd.read_csv('Dataset/Fake.csv')
fake.head()

  fake = pd.read_csv('Dataset/Fake.csv')


Unnamed: 0,title,text,subject,date,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 162,Unnamed: 163,Unnamed: 164,Unnamed: 165,Unnamed: 166,Unnamed: 167,Unnamed: 168,Unnamed: 169,Unnamed: 170,Unnamed: 171
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,31-Dec-17,,,,,,,...,,,,,,,,,,
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,31-Dec-17,,,,,,,...,,,,,,,,,,
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,30-Dec-17,,,,,,,...,,,,,,,,,,
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,29-Dec-17,,,,,,,...,,,,,,,,,,
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,25-Dec-17,,,,,,,...,,,,,,,,,,


In [408]:
# There are quite a lot of 'unnamed' columns

fake.columns

Index(['title', 'text', 'subject', 'date', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       ...
       'Unnamed: 162', 'Unnamed: 163', 'Unnamed: 164', 'Unnamed: 165',
       'Unnamed: 166', 'Unnamed: 167', 'Unnamed: 168', 'Unnamed: 169',
       'Unnamed: 170', 'Unnamed: 171'],
      dtype='object', length=172)

In [409]:
# Narrowing down the columns to what we need

fake = fake[['title', 'text', 'subject', 'date']]
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,31-Dec-17
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,31-Dec-17
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,30-Dec-17
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,29-Dec-17
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,25-Dec-17


In [410]:
# Chcek the value counts of the subject column to get a good insight about the value  distributions

fake.subject.value_counts()

News                                                                                                                                                                                                                                                                                            9050
politics                                                                                                                                                                                                                                                                                        6838
left-news                                                                                                                                                                                                                                                                                       4457
Government News                                                                                                          

In [411]:
# Loading the second dataframe

true = pd.read_csv('Dataset/True.csv')
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [412]:
# Checking the value counts of the subject column

true.subject.value_counts()

politicsNews    11272
worldnews       10145
Name: subject, dtype: int64

In [413]:
# What are the shapes of the two dataframes? Let's check it out

true.shape, fake.shape

((21417, 4), (23502, 4))

In [414]:
# Checking for missing values for  true dataframe

true.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [415]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [416]:
# Checking for missing values for  fake dataframe

fake.isnull().sum()

title       0
text        0
subject    21
date       21
dtype: int64

In [417]:
# The subject colum of the fake dataframe contains a lot of irrelevant values. Let us limit the values to these few

fake = fake.loc[fake['subject'].isin (['News', 'politics', 'left-news', 'Government News', 'US_News', 'Middle-east'])]

In [418]:
# Dropping missing values

fake.dropna(inplace=True)

In [419]:
fake.subject.value_counts()

News               9050
politics           6838
left-news          4457
Government News    1570
US_News             775
Middle-east         770
Name: subject, dtype: int64

In [420]:
fake.shape, true.shape

((23460, 4), (21417, 4))

In [421]:
# Replacing the values of the subject column with 1, indicating fake news

fake['subject'] = '1'

In [422]:
fake.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,31-Dec-17
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,31-Dec-17
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1,30-Dec-17
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1,29-Dec-17
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1,25-Dec-17
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,1,25-Dec-17
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,1,23-Dec-17
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,1,23-Dec-17
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,1,22-Dec-17
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,1,21-Dec-17


In [423]:
# Replacing the values of the subject column with 1, indicating non-fake news

true['subject'] = 0

In [424]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,0,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,0,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,0,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,0,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,0,"December 29, 2017"


In [427]:
# Merging the two dataframes

df = pd.concat([fake.reset_index(drop=True), true.reset_index(drop=True)], axis=0)
df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,31-Dec-17
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,31-Dec-17
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1,30-Dec-17
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1,29-Dec-17
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1,25-Dec-17


In [428]:
df.shape

(44877, 4)

In [429]:
df.subject.value_counts()

1    23460
0    21417
Name: subject, dtype: int64

In [430]:
# Dropping the title and data columns

df = df[['text', 'subject']]
df.head()

Unnamed: 0,text,subject
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1


In [431]:
# Checking for missing values and duplicates
df.isnull().sum()

text       0
subject    0
dtype: int64

In [432]:
# Checking for duplicates

df.duplicated().sum()

6241

In [433]:
# Drop duplicates
df = df.drop_duplicates()

In [434]:
df.duplicated().sum()

0

In [435]:
# Convert the datatype of the subject column
df['subject'] = df['subject'].astype(int)

In [436]:
df.dtypes

text       object
subject     int32
dtype: object

In [437]:
df.text[:15][10]

"A centerpiece of Donald Trump s campaign, and now his presidency, has been his white supremacist ways. That is why so many of the public feuds he gets into involve people of color. One of his favorite targets, is, of course, the players in the National Football League who dare to exercise their First Amendment rights by kneeling during the national anthem in protest of racist police brutality. Well, there is one person who has figured out that racism is bad for business, even if it did get the orange overlord elected: The founder of the pizza chain Papa John s.This is a man who has never been on the right side of history on any number of issues, and plus his pizza sucks. But, when he decided to complain about the players protesting, his sales really dropped. Turns out racism doesn t pay, and we all know that corporations are all about the bottom line. Therefore, Papa John Schnatter will no longer be CEO of the hack pizza chain.BREAKING: Papa John's founder John Schnatter to step down 

In [438]:
# Separating the features and target variables

features = df['text']
target = df['subject']

In [439]:
target.value_counts()

0    21192
1    17444
Name: subject, dtype: int64

In [440]:
# The vectors are in different ranges, positive and negative values. They have to be scaled
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=12, stratify=target)

In [441]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30908,), (30908,), (7728,), (7728,))

In [442]:
# Importing a pipeline

from sklearn.pipeline import Pipeline

clf = Pipeline([('tfidf', TfidfVectorizer()),
                ('naive_bayes', MultinomialNB())
                ])

clf.fit(X_train, y_train)

### Model Evaluation

In [443]:
y_pred = clf.predict(X_test)

In [444]:
# Calculating the evaluation

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      4239
           1       0.96      0.90      0.93      3489

    accuracy                           0.94      7728
   macro avg       0.94      0.93      0.94      7728
weighted avg       0.94      0.94      0.94      7728



### Prediction System

In [448]:
news_feed = ['It has been proven that COVID-19 came with flood and valcanic eruptions',
             'The great wall of China is actually located i China',
             'There has been too many cases of flood and earthquakes in recent times',
             'Christmas day is May 29th',
             ' Nigeria is a country in Europe'
            ]

In [449]:
# Make predictions
predictions = clf.predict(news_feed)

# Print the predictions
for sentence, prediction in zip(news_feed, predictions):
    print(f"Sentence: {sentence}")
    print(f"Predicted Label: {prediction}")
    print()





Sentence: It has been proven that COVID-19 came with flood and valcanic eruptions
Predicted Label: 1

Sentence: The great wall of China is actually located i China
Predicted Label: 0

Sentence: There has been too many cases of flood and earthquakes in recent times
Predicted Label: 0

Sentence: Christmas day is May 29th
Predicted Label: 1

Sentence:  Nigeria is a country in Europe
Predicted Label: 0



### Saving the model

In [447]:
import pickle

filename = 'fake_news_class.pkl'
with open(filename, 'wb') as file:
    pickle.dump(clf, file)
    
print('Model is saved successfully!')

Model is saved successfully!
