In [64]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
%matplotlib inline

In [30]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Data Pre-processing

In [32]:
## Collecting the dataset

fakeNews_data=pd.read_csv('FakeNewsNet.csv')
fakeNews_data.shape


(23196, 5)

In [33]:
fakeNews_data.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [34]:
fakeNews_data.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [35]:
# replacing the null values with empty data string

In [36]:
new_dataset=fakeNews_data.fillna('')
new_dataset.shape

(23196, 5)

In [37]:
new_dataset.isnull().sum()

title            0
news_url         0
source_domain    0
tweet_num        0
real             0
dtype: int64

In [38]:
# merging the dataset

In [39]:
new_dataset['content']=new_dataset['title']+new_dataset['source_domain']

In [40]:
new_dataset['content']

0        Kandi Burruss Explodes Over Rape Accusation on...
1        People's Choice Awards 2018: The best red carp...
2        Sophia Bush Sends Sweet Birthday Message to 'O...
3        Colombian singer Maluma sparks rumours of inap...
4        Gossip Girl 10 Years Later: How Upper East Sid...
                               ...                        
23191    Pippa Middleton wedding: In case you missed it...
23192    Zayn Malik & Gigi Hadid’s Shocking Split: Why ...
23193    Jessica Chastain Recalls the Moment Her Mother...
23194    Tristan Thompson Feels "Dumped" After Khloé Ka...
23195    Kelly Clarkson Performs a Medley of Kendrick L...
Name: content, Length: 23196, dtype: object

In [41]:
## separating the data and the label 
X=new_dataset.drop(columns='real',axis=1)
y=new_dataset['real']


In [42]:
y

0        1
1        1
2        1
3        1
4        1
        ..
23191    1
23192    0
23193    1
23194    0
23195    1
Name: real, Length: 23196, dtype: int64

Stemming:

Stemming is a process of reducing its word to its Root word.

example:

Actors acting actress --> act is a root word

In [43]:
port_stem=PorterStemmer()

In [44]:
def stemming(content):
    stemmed_content=re.sub('[^a-zA-z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content

In [45]:
new_dataset['content']=new_dataset['content'].apply(stemming)

In [46]:
new_dataset['content']

0        kandi burruss explod rape accus real housew at...
1        peopl choic award best red carpet lookswww tod...
2        sophia bush send sweet birthday messag one tre...
3        colombian singer maluma spark rumour inappropr...
4        gossip girl year later upper east sider shock ...
                               ...                        
23191    pippa middleton wed case miss pippa marri lace...
23192    zayn malik gigi hadid shock split chanc reunit...
23193    jessica chastain recal moment mother boyfriend...
23194    tristan thompson feel dump khlo kardashian ref...
23195    kelli clarkson perform medley kendrick lamar h...
Name: content, Length: 23196, dtype: object

In [47]:
# separating the data and the label

X=new_dataset['content'].values
y=new_dataset['real'].values

In [51]:
# converting textual data to numerical data

vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)

In [54]:
print(X)

  (0, 20735)	0.22547067103047763
  (0, 19718)	0.3196006397229684
  (0, 16028)	0.2668981643493908
  (0, 15545)	0.22547067103047763
  (0, 15469)	0.29728546630859554
  (0, 10163)	0.3769950736937001
  (0, 9067)	0.2573298311146081
  (0, 6475)	0.3546799002793272
  (0, 3841)	0.04944921274586644
  (0, 2661)	0.3769950736937001
  (0, 1076)	0.3038115146600151
  (0, 102)	0.25603593416539544
  (1, 19660)	0.3305494690976824
  (1, 15647)	0.334944629545033
  (1, 14283)	0.28543182293885067
  (1, 11279)	0.4860091322682343
  (1, 3841)	0.0724946217871283
  (1, 3464)	0.37773997199945114
  (1, 2973)	0.36313707108189985
  (1, 1780)	0.30177156263264854
  (1, 1188)	0.2999922938709716
  (2, 21662)	0.1081039175452419
  (2, 19940)	0.3104001209402146
  (2, 18921)	0.23158939779943197
  (2, 18310)	0.14838887949237048
  :	:
  (23194, 19478)	0.27142137549565076
  (23194, 15705)	0.3148848636055635
  (23194, 12675)	0.28057974365061616
  (23194, 10962)	0.3042075697429591
  (23194, 10630)	0.26473048593742227
  (23194, 103

In [55]:
## Splitting dataset to training and test data

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)


## Training the model

In [61]:
model=LogisticRegression()
model

In [62]:
model.fit(X_train,y_train)

In [63]:
## Evaluation

news_prediction=model.predict(X_test)
news_accuracy=accuracy_score(news_prediction,y_test)
print("Accuracy ",news_accuracy)


Accuracy  0.846551724137931


In [73]:
# Prediction Value

X_news=X_test[10]
prediction=model.predict(X_news)
if(prediction[0]==0):
    print("The news is real")

else:
    print("The news is fake")


The news is real


In [74]:
print(y_test[10])

0


In [71]:
# Pickle file

pickle.dump(model,open('news_test.pkl','wb'))
pickled_model=pickle.load(open('news_test.pkl','rb'))
pickled_model.predict(input_data)


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.