In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix
import seaborn as sns

In [2]:
# Label convention: 1 = true (real) news, 0 = fake news.
# NOTE(review): these absolute Windows paths only resolve on the original author's machine.
fake = pd.read_csv(
    r"D:\Visual Studio Code\ML\DataSet\Fake_Real_news_Dataset\Fake.csv"
).assign(Label=0)
true = pd.read_csv(
    r"D:\Visual Studio Code\ML\DataSet\Fake_Real_news_Dataset\True.csv"
).assign(Label=1)

# Stack fake rows on top of true rows and give the result a fresh 0..n-1 index.
data = pd.concat([fake, true], ignore_index=True)
data

Unnamed: 0,title,text,subject,date,Label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [3]:
# Fetch the NLTK stop-word corpus that stopwords.words(...) reads from.
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Show the English stop-word list that the stemming step filters out.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Data pre-processing.
# `data` is already a DataFrame (it comes from pd.concat in the load cell), so this
# re-wrap does not change its contents; the cell's visible result is the (rows, columns) shape.
data = pd.DataFrame(data)
data.shape

(44898, 5)

In [6]:
# Preview the first five rows of the combined, still un-stemmed dataset.
data.head()

Unnamed: 0,title,text,subject,date,Label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [7]:
# Count missing values in each column.
data.isnull().sum()

title      0
text       0
subject    0
date       0
Label      0
dtype: int64

In [8]:
# Split the frame into all non-label columns (x) and the label column (y).
# NOTE(review): both names are reassigned from data['text'] / data['Label'] in a later
# cell before anything reads them, so these two assignments currently have no effect.
x = data.drop('Label', axis=1)
y=data['Label']

# Stemming Process:
Stemming is the process of reducing a word to its root form.

In [9]:
# Single shared Porter stemmer instance; used by stemming() for every token.
port_stem = PorterStemmer()

In [10]:

# Build the stop-word lookup once. stopwords.words('english') returns a new list on
# every call, and the previous version evaluated it (plus a linear list scan) for
# every single token; a frozenset gives constant-time membership tests instead.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. reduce each remaining token with the Porter stemmer (``port_stem``),
      5. join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text (for example an article title or body).

    Returns
    -------
    str
        The cleaned, stemmed text; an empty string if no tokens survive.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [11]:
# Overwrite the title column in place with its cleaned, stemmed form.
# NOTE(review): only data['text'] is used as the model feature in the visible cells — confirm the title pass is needed.
data['title'] = data['title'].apply(stemming)

In [12]:
# Overwrite the article body column in place with its cleaned, stemmed form.
data['text'] = data['text'].apply(stemming)

In [13]:
# Preview the first five rows again, now that title and text have been stemmed.
data.head()

Unnamed: 0,title,text,subject,date,Label
0,donald trump send embarrass new year eve messa...,donald trump wish american happi new year leav...,News,"December 31, 2017",0
1,drunk brag trump staffer start russian collus ...,hous intellig committe chairman devin nune go ...,News,"December 31, 2017",0
2,sheriff david clark becom internet joke threat...,friday reveal former milwauke sheriff david cl...,News,"December 30, 2017",0
3,trump obsess even obama name code websit imag,christma day donald trump announc would back w...,News,"December 29, 2017",0
4,pope franci call donald trump christma speech,pope franci use annual christma day messag reb...,News,"December 25, 2017",0


In [14]:
# Select the model inputs and targets as NumPy arrays:
# x = stemmed article text, y = labels (1 = true news, 0 = fake news).
# The actual train/test split happens in a later cell.
x = data['text'].values
y= data['Label'].values


In [15]:
# Convert the stemmed article text into a sparse TF-IDF feature matrix.
# fit_transform learns the vocabulary/IDF weights and vectorises in a single pass,
# instead of tokenising the whole corpus twice with separate fit() and transform() calls.
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test split
# below, so vocabulary and IDF weights are learned from rows that later land in the
# test set — consider fitting on the training split only.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)

In [16]:
# Inspect the sparse TF-IDF matrix: each printed entry is (row, column) followed by its weight.
print(x)

  (0, 87706)	0.26875959408413413
  (0, 87629)	0.08671473106081941
  (0, 86560)	0.0319484475591679
  (0, 86404)	0.02738199782160357
  (0, 86322)	0.018939929076279394
  (0, 86308)	0.027435211825342014
  (0, 86226)	0.034741716274265345
  (0, 85904)	0.18705126109096712
  (0, 85031)	0.02679012973879564
  (0, 85020)	0.0900274557950121
  (0, 85009)	0.061097164874937555
  (0, 84981)	0.0900274557950121
  (0, 84337)	0.037638234151918894
  (0, 83779)	0.021542604780305785
  (0, 82124)	0.03793570874484206
  (0, 80070)	0.022984844175275566
  (0, 80010)	0.11263483767246102
  (0, 79339)	0.12538799173630044
  (0, 77345)	0.020703604940166268
  (0, 76970)	0.0900274557950121
  (0, 76916)	0.05692362529071868
  (0, 76652)	0.02664916985705388
  (0, 76408)	0.045266663685727536
  (0, 75768)	0.02235804168683324
  (0, 75746)	0.08671473106081941
  :	:
  (44897, 16719)	0.048107765810988634
  (44897, 16663)	0.04303368840542599
  (44897, 15780)	0.05943548547818958
  (44897, 15715)	0.02931897924359416
  (44897, 15417

In [17]:
# Hold out 20% of the rows for testing; stratify=y keeps the fake/true ratio the same
# in both splits, and the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2,stratify=y)

In [18]:
# Fit a logistic-regression classifier (library default hyperparameters) on the training split.
model= LogisticRegression()
model.fit(x_train,y_train)

In [19]:
# Accuracy on the training split; compare with the test accuracy to spot overfitting.
y_train_pred = model.predict(x_train)
# accuracy_score is documented as (y_true, y_pred). The value is the same either way,
# but the documented order keeps this call consistent with the other metric calls.
accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy of the model on train data: ",accuracy)

Accuracy of the model on train data:  0.9907288824544797


In [20]:
# Accuracy on the held-out test split.
y_test_pred = model.predict(x_test)
# accuracy_score is documented as (y_true, y_pred). The value is the same either way,
# but the documented order keeps this call consistent with the other metric calls.
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy of the model on test data : ",accuracy)

Accuracy of the model on test data :  0.984966592427617


In [21]:
# Spot-check one row of the test split: compare the model's prediction with the true label.
n = 320
X_new = x_test[n]
actual_data = y_test[n]
prediction = model.predict(X_new)
print(prediction)

# Label 0 is reported as false news; any other label is reported as real news.
print("Actual Data is : ")
print("News is False" if actual_data == 0 else "News is Real")

print("\nPredicted Data: ")
print("News is False" if prediction[0] == 0 else "News is Real")

[0]
Actual Data is : 
News is False

Predicted Data: 
News is False


In [22]:
# Confusion matrix for the *training* split (true labels passed first, predictions second).
# Per scikit-learn's convention, rows are true labels and columns are predicted labels,
# in label order 0 (fake) then 1 (true).
# NOTE(review): this evaluates y_train, not y_test — confirm the training split is the intended one.
cm = confusion_matrix(y_train , y_train_pred)
print(cm)

[[18590   195]
 [  138 16995]]
