# NLP Model for fake news detection

In [4]:
# Importing pandas library for importing the dataset into the Python environment
import pandas as pd

In [5]:
data = pd.read_csv(r'D:\Data science and AI\NLP\fakenews.csv')
data

Unnamed: 0,text,label
0,Get the latest from TODAY Sign up for our news...,1
1,2d Conan On The Funeral Trump Will Be Invited...,1
2,It’s safe to say that Instagram Stories has fa...,0
3,Much like a certain Amazon goddess with a lass...,0
4,At a time when the perfect outfit is just one ...,0
...,...,...
4981,The storybook romance of WWE stars John Cena a...,0
4982,The actor told friends he’s responsible for en...,0
4983,Sarah Hyland is getting real. The Modern Fami...,0
4984,Production has been suspended on the sixth and...,0


In [6]:
# There are two columns one with text and another with label 0 or 1 i.e, real or fake
data.shape

(4986, 2)

In [7]:
data['label'].value_counts()

0    2972
1    2014
Name: label, dtype: int64

In [8]:
# There are 2972 real and 2014 fake news out of the whole dataset

### {0 : 'real', 1 : 'fake'}

### Let's build an NLP model for doing fake news classification

In [9]:
# Importing the required libraries
import nltk
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

In [10]:
data.head() # To know about the datatset

Unnamed: 0,text,label
0,Get the latest from TODAY Sign up for our news...,1
1,2d Conan On The Funeral Trump Will Be Invited...,1
2,It’s safe to say that Instagram Stories has fa...,0
3,Much like a certain Amazon goddess with a lass...,0
4,At a time when the perfect outfit is just one ...,0


#### Data pre-processing steps for building an NLP model

In [12]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\weclome\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\weclome\AppData\Roaming\nltk_data...


True

In [11]:
# Data preprocessing is essential to build an efficient corpus without special characters and irregular spaces

corpus = [] # Define a corpus to build our dataframe for analysis and model building 

for i in range(0,4986):
    
    # To remove special characters
    document = re.sub(r'\W',' ',data['text'][i])
    
    # To remove single characters
    document = re.sub(r'\s+[a-zA-Z]\s+',' ',document)
    
    # To remove single characters from starting
    document = re.sub(r'\^[a-zA-Z]\s+',' ',document)
    
    # To remove more spacs and replace with single space
    document = re.sub(r'\s+',' ',document,flags=re.I)
    
    document = document.lower()
    
    document = document.split()
    
    document = [stemmer.lemmatize(w) for w in document]
    
    document = ' '.join(document)
    
    corpus.append(document)

In [12]:
print(corpus[0])

get the latest from today sign up for our newsletter no one ever truly get over losing loved one and blake shelton is no exception he wa just 14 when his older brother richie died on nov 13 1990 and a shelton noted in tweet monday it changed my life forever richie wa 24 when he died in car accident in the sheltons home state of oklahoma two year ago shelton sent out message for the 25th anniversary of his loss richie who wa blake half brother they shared mother wa passenger in car that collided with school bus in ada south of oklahoma city richie driver redena mcmanus and 3 year old boy christopher mcmanus all died during or shortly after the collision while the bus driver and passenger were uninjured according to police report the accident ha clearly remained with blake who told 60 minute in 2014 remember picking up the phone to call him week after he wa dead to tell him something wa picking up the phone to call him to tell him something just saw on tv or and it wa like constantly sho

In [19]:
# Divding data into features and target variables where X is corpus and y is target (fake or real)
X = corpus
y = data.label

In [20]:
# Dividing the data into train and test
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Using countvectorizer for the corpus dataframing

count_vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words = stopwords.words('english'))

count_train = count_vectorizer.fit_transform(X_train)

count_test = count_vectorizer.transform(X_test)

In [24]:
print(count_vectorizer.get_feature_names())

['000', '08', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1997', '1998', '1999', '20', '200', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '48', '49', '50', '53', '54', '60', '65', '70', '80', '90', 'abc', 'ability', 'able', 'absolutely', 'abuse', 'academy', 'according', 'account', 'accused', 'across', 'act', 'acting', 'action', 'actor', 'actress', 'actually', 'adam', 'add', 'added', 'adding', 'addition', 'address', 'admitted', 'advertisement', 'affair', 'affleck', 'afp', 'age', 'agent', 'ago', 'agree', 'agreed', 'agreement', 'aguilera', 'ahead', 'air', 'aired', 'al', 'album', 'allegation', 'alleged', 'allegedly', 'allowed', 'almost', 'alone', 'along', 'alongside', 'already', 'also', 'although', 'always', 'ama

In [25]:
pd.DataFrame(count_train.toarray(),columns=count_vectorizer.get_feature_names())

Unnamed: 0,000,08,10,100,11,12,13,14,15,16,...,wrote,yard,yeah,year,yes,yet,york,young,younger,youtube
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,2,0,0,0,0,0,0
1,0,0,2,0,1,0,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0
2,0,0,5,2,7,18,3,11,14,3,...,1,1,0,41,1,1,4,6,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,2,0,0,5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3983,0,0,0,0,0,0,0,0,1,0,...,0,0,0,2,0,0,0,0,0,0
3984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,6,1,1,0,0,0,0
3985,0,0,1,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,1,0,0
3986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [27]:
# Using Tfidfvectorizer for the corpus dataframing

from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidf_vectorizer = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words = stopwords.words('english'))

tfidf_train = tfidf_vectorizer.fit_transform(X_train)

tfidf_test = tfidf_vectorizer.transform(X_test)

In [29]:
print(tfidf_vectorizer.get_feature_names())

['000', '08', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1997', '1998', '1999', '20', '200', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '48', '49', '50', '53', '54', '60', '65', '70', '80', '90', 'abc', 'ability', 'able', 'absolutely', 'abuse', 'academy', 'according', 'account', 'accused', 'across', 'act', 'acting', 'action', 'actor', 'actress', 'actually', 'adam', 'add', 'added', 'adding', 'addition', 'address', 'admitted', 'advertisement', 'affair', 'affleck', 'afp', 'age', 'agent', 'ago', 'agree', 'agreed', 'agreement', 'aguilera', 'ahead', 'air', 'aired', 'al', 'album', 'allegation', 'alleged', 'allegedly', 'allowed', 'almost', 'alone', 'along', 'alongside', 'already', 'also', 'although', 'always', 'ama



In [30]:
pd.DataFrame(tfidf_train.toarray(),columns=count_vectorizer.get_feature_names())

Unnamed: 0,000,08,10,100,11,12,13,14,15,16,...,wrote,yard,yeah,year,yes,yet,york,young,younger,youtube
0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.059688,0.00000,0.0,0.065537,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,0.000000,0.0,0.051924,0.000000,0.029645,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.053386,0.00000,0.0,0.000000,0.000000,0.000000,0.027347,0.000000,0.0,0.000000
2,0.000000,0.0,0.028430,0.015177,0.045448,0.113763,0.019694,0.073232,0.092088,0.020935,...,0.005846,0.01264,0.0,0.131591,0.007575,0.006509,0.023957,0.040386,0.0,0.000000
3,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.0,0.069715,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.012172,0.0,0.000000,0.000000,0.011325,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.0,0.011196,0.000000,0.000000,0.052234,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3983,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.118698,0.000000,...,0.000000,0.00000,0.0,0.115835,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3984,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.0,0.150907,0.059359,0.051010,0.000000,0.000000,0.0,0.000000
3985,0.000000,0.0,0.073182,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.0,0.082616,0.000000,0.000000,0.000000,0.086630,0.0,0.000000
3986,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.0,0.041900,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [31]:
# Importing Naive-Bayes classifier for building the NLP model

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import *
nbclassifier = MultinomialNB()

In [32]:
nbclassifier.fit(count_train,y_train)

MultinomialNB()

In [33]:
y_pred = nbclassifier.predict(count_test)

In [34]:
print(accuracy_score(y_test,y_pred))

0.718436873747495


In [35]:
# By using countvectorizer the model that is built has given an accuracy of 72%.

In [36]:
#Lets analyze the accuracy using confusion matrix
cm = metrics.confusion_matrix(y_pred, y_test)
print(cm)

[[459 144]
 [137 258]]


In [37]:
print((459+258)/(144+137+459+258))

0.718436873747495


In [None]:
# So out of 998 data, there are 459 and 258 predictions where the model predicted correctly whether the news is real or fake.

In [40]:
nbclassifier.fit(tfidf_train,y_train)

MultinomialNB()

In [40]:
y_pred_tfidf = nbclassifier.predict(tfidf_test)

In [41]:
print(accuracy_score(y_test,y_pred_tfidf))

0.7404809619238477


In [42]:
#Lets analyze the accuracy using confusion matrix
cm = metrics.confusion_matrix(y_pred_tfidf, y_test)
print(cm)

[[514 177]
 [ 82 225]]


In [43]:
print((514+225)/(514+225+177+82))

0.7404809619238477


In [45]:
# So out of 998 data, there are 514 and 225 predictions where the model predicted correctly whether the news is real or fake.

In [46]:
# By using tfidfvectorizer the model that is built has given an accuracy of 74%.

### So compared to countevectorizer the tfidfvectorizer is giving the best accuracy for the built NLP model