Importing the dependencies

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mithranes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Data Pre-processing

In [3]:
# Loading the dataset to pandas DF
fake_df = pd.read_csv('Dataset/fake.csv')
true_df = pd.read_csv('Dataset/True.csv')

In [4]:
# Add the label to each dataframe
fake_df['label'] = 'FAKE'
true_df['label'] = 'REAL'


In [5]:
# Merge the dataframe
news_df = pd.concat([fake_df, true_df], axis=0)

In [6]:
# adding numerical labels
news_df['label_num'] = news_df['label'].map({'FAKE': 0, 'REAL': 1})

In [7]:
# Dropping the Label column
news_df = news_df.drop(columns=['label'])

In [8]:
news_df.head()

Unnamed: 0,title,text,subject,date,label_num
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [9]:
news_df.isnull().sum()

title        0
text         0
subject      0
date         0
label_num    0
dtype: int64

In [10]:
# count label_num values
news_df['label_num'].value_counts()

label_num
0    23481
1    21417
Name: count, dtype: int64

In [12]:
# Drop label_num column in X and separate y
X = news_df.drop(columns=['label_num'])
y = news_df['label_num']

In [14]:
print(X)


                                                   title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
21412  'Fully committed' NATO backs new U.S. approach...   
21413  LexisNexis withdrew two products from Chinese ...   
21414  Minsk cultural hub becomes haven from authorities   
21415  Vatican upbeat on possibility of Pope Francis ...   
21416  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text    subject  \
0      Donald Trump just couldn t wish all Americans ...       News   
1      House Intelligence Committee Chairman Devin Nu...       News   
2      On Friday, it was revealed that former Milwauk...       New

In [15]:
print(y)

0        0
1        0
2        0
3        0
4        0
        ..
21412    1
21413    1
21414    1
21415    1
21416    1
Name: label_num, Length: 44898, dtype: int64


Stemming:

Process of reducing a word to its root word

In [16]:
port_stem = PorterStemmer()

In [17]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


In [18]:
news_df['text'] = news_df['text'].apply(stemming)

In [19]:
news_df.head()

Unnamed: 0,title,text,subject,date,label_num
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump wish american happi new year leav...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,hous intellig committe chairman devin nune go ...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,friday reveal former milwauke sheriff david cl...,News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,christma day donald trump announc would back w...,News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,pope franci use annual christma day messag reb...,News,"December 25, 2017",0


In [20]:
# Seperating the data and label
X = news_df['title'].values
y = news_df['label_num'].values

In [21]:
print(y)

[0 0 0 ... 1 1 1]


In [22]:
y.shape

(44898,)

In [23]:
# Converting the textual data to numerical data
vectorizer = TfidfVectorizer()

In [24]:
X = vectorizer.fit_transform(X)

In [25]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 546512 stored elements and shape (44898, 20896)>
  Coords	Values
  (0, 5932)	0.2431392868466219
  (0, 19217)	0.10609065521058747
  (0, 16582)	0.3581240913256473
  (0, 13168)	0.22514499249236025
  (0, 6432)	0.3615843840752321
  (0, 12603)	0.21027646621750126
  (0, 20766)	0.28072894903403384
  (0, 6797)	0.4172599952831021
  (0, 11846)	0.31627667524755754
  (0, 18721)	0.22247450918405956
  (0, 10061)	0.18181143680883677
  (0, 5825)	0.374472403520307
  (1, 19217)	0.10675356894294173
  (1, 6124)	0.42175002803412853
  (1, 2622)	0.42574313397950775
  (1, 17650)	0.3990276327726366
  (1, 17709)	0.4041487602907992
  (1, 16074)	0.26124615957705505
  (1, 3911)	0.37527720482516563
  (1, 9994)	0.312780377780624
  (2, 16787)	0.2776331121343952
  (2, 4934)	0.2997479403473814
  (2, 3691)	0.3297005371015939
  (2, 2022)	0.30798602949314685
  (2, 1075)	0.2180356626195661
  :	:
  (44895, 2022)	0.35617176719079113
  (44895, 7868)	0.18659307273352

Train Test Split

In [26]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

Training the Model: Logistic Regression

In [28]:
# Training the Model: Logistic Regression
model = LogisticRegression()

In [29]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Evaluation

In [30]:
# Accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [31]:
print(training_data_accuracy)

0.969207639623587


In [32]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print(test_data_accuracy)

0.9514476614699332


Predictive System

In [38]:
X_new = vectorizer.transform([" As U.S. budget fight looms, Republicans flip their fiscal script"])
prediction = model.predict(X_new)
if([prediction[0]==0]):
    print("The news is Fake")
else:
    print("The news is Real")

print(prediction)

The news is Fake
[1]
