In [1]:
import re
import string
import missingno  as msno
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [2]:
# Load the fake-news and real-news CSV files (paths are relative to the
# notebook's working directory) into two separate DataFrames.
fake_df, true_df = (
    pd.read_csv(csv_path) for csv_path in ("News/Fake.csv", "News/True.csv")
)

Inspect some basic details of the two datasets

In [3]:
# Show the column labels of both frames, fake first, one per line.
print(fake_df.columns, true_df.columns, sep="\n")

Index(['title', 'text', 'subject', 'date'], dtype='object')
Index(['title', 'text', 'subject', 'date'], dtype='object')


In [4]:
# Show (rows, columns) for both frames, fake first, one per line.
print(fake_df.shape, true_df.shape, sep="\n")

(23481, 4)
(21417, 4)


First, we add a new column to each dataset to represent the label:
" 1 " for the real news dataset
" 0 " for the fake news dataset

In [5]:
# Tag every row with its class: 0 for the fake-news frame, 1 for the real-news frame.
fake_df['label'], true_df['label'] = 0, 1

In [6]:
# Preview the first rows of the real-news frame, including the new 'label' column.
true_df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [7]:
# Preview the first rows of the fake-news frame, including the new 'label' column.
fake_df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


Drop the columns [title, subject, date], which are not needed for model training

In [8]:
# Keep only the article body and the label; the remaining metadata columns
# are not used as model input further down.
# Reassigning instead of using inplace=True, together with errors="ignore",
# makes this cell safe to re-run: a second execution no longer raises KeyError
# for columns that were already removed. `columns=` alone is enough, so the
# redundant axis=1 is gone.
unused_columns = ['title', 'subject', 'date']
true_df = true_df.drop(columns=unused_columns, errors="ignore")
fake_df = fake_df.drop(columns=unused_columns, errors="ignore")

In [9]:
# Confirm that only the 'text' and 'label' columns remain.
true_df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [10]:
# Count missing values per column in each frame (real first, then fake).
print(true_df.isnull().sum(), fake_df.isnull().sum(), sep="\n")

text     0
label    0
dtype: int64
text     0
label    0
dtype: int64


In [11]:
# Stack the fake rows on top of the real rows (row-wise is concat's default axis).
# Each frame keeps its own row labels, so index values repeat across the two halves.
df_mixed = pd.concat((fake_df, true_df))
# A negative n returns every row except the last 5.
df_mixed.head(n=-5)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21407,"SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",1
21408,GENEVA (Reuters) - North Korea and the United ...,1
21409,GENEVA (Reuters) - North Korea and the United ...,1
21410,COPENHAGEN (Reuters) - Danish police said on T...,1


In [12]:
# Shuffle all rows so fake and real articles are interleaved.
# random_state pins the permutation so that Restart & Run All reproduces
# the same row order (and therefore the same downstream results).
df = df_mixed.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,text,label
12859,MOSCOW (Reuters) - Russia s military said on T...,1
4080,WASHINGTON (Reuters) - The Trump administratio...,1
20400,GENEVA (Reuters) - The United Nations human ri...,1
14639,NAIROBI (Reuters) - Kenya s Supreme Court will...,1
704,WASHINGTON (Reuters) - U.S. Vice President Mik...,1


In [13]:
# Replace the shuffled row labels with a fresh 0..n-1 index.
# drop=True discards the old labels directly, so no temporary 'index' column
# has to be created and then removed.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,text,label
0,MOSCOW (Reuters) - Russia s military said on T...,1
1,WASHINGTON (Reuters) - The Trump administratio...,1
2,GENEVA (Reuters) - The United Nations human ri...,1
3,NAIROBI (Reuters) - Kenya s Supreme Court will...,1
4,WASHINGTON (Reuters) - U.S. Vice President Mik...,1


In [14]:
def text_processing(article):
    """Normalize one article's raw text before vectorization.

    Steps, in order:
      1. lowercase the text;
      2. remove square-bracketed spans such as ``[video]``;
      3. remove URLs (``http://...``, ``https://...``, ``www....``);
      4. remove HTML-style tags (``<...>``);
      5. replace every remaining non-word character (punctuation, newlines,
         other symbols) with a single space;
      6. remove underscores (the only punctuation that counts as a word character);
      7. remove every token that contains a digit.

    URL and tag removal must run before step 5: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been turned into spaces those patterns can no longer
    match, which is what happened with the previous ordering.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces are not collapsed.
    """
    article = article.lower()
    article = re.sub(r'\[.*?\]', '', article)
    # Strip URLs and tags while their delimiters are still intact.
    article = re.sub(r'https?://\S+|www\.\S+', '', article)
    article = re.sub(r'<.*?>+', '', article)
    # Newlines are non-word characters too, so no separate '\n' pass is needed.
    article = re.sub(r'\W', ' ', article)
    # After the \W pass only '_' can still match this punctuation class.
    article = re.sub('[%s]' % re.escape(string.punctuation), '', article)
    article = re.sub(r'\w*\d\w*', '', article)
    return article

In [15]:
# Run the cleaning function over every article body, then store the result
# back in the 'text' column.
cleaned_text = df['text'].apply(text_processing)
df['text'] = cleaned_text

Define the input text (x) and the target label (y)

In [16]:
# Model input is the cleaned article text; the target is the 0/1 label.
x, y = df['text'], df['label']

Split the data into training and test sets

In [17]:
# Hold out 30% of the articles for evaluation.
# random_state fixes the split so the reported metrics are reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=.3, random_state=42
)

Convert the text to TF-IDF vectors

In [18]:
# TF-IDF vectorizer: skip English stop words and ignore any term that appears
# in more than 70% of the documents.
vec = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

In [19]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split.
xv_train = vec.fit_transform(X_train)
xv_test = vec.transform(X_test)

In [20]:
# Train a Passive-Aggressive linear classifier on the TF-IDF features.
# The estimator shuffles the training data by default, so random_state is
# fixed to make the fitted model and its metrics reproducible.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(xv_train, y_train)
# Predict on the held-out test set and report accuracy plus per-class metrics.
y_pred = pac.predict(xv_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(classification_report(y_test, y_pred))

Accuracy: 99.33%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7025
           1       0.99      0.99      0.99      6445

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470

