# Detecting Fake News with Python and Machine Learning

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier


In [13]:
#Reading data from csv file into DataFrame
df = pd.read_csv('C:\\Users\\Kirti Sharma\\Documents\\news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [14]:
#Finding NaN values in dataframe if any
df.columns[df.isna().any()]

Index([], dtype='object')

In [15]:
g = df.groupby('label')
g.describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FAKE,3164.0,7912.297092,1514.701524,5280.0,6627.75,7904.0,9231.25,10557.0
REAL,3171.0,2654.344056,1531.720152,2.0,1317.5,2676.0,3992.5,5276.0


In [16]:
#Converting the text in label column into numbers

df['label'] = df.label.apply(lambda x: 1 if x == 'FAKE' else 0)
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",0


In [17]:
#Split the dataset into training and test dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'],df.label, test_size=0.2)

In [18]:
type(X_train)

pandas.core.series.Series

In [19]:
#Initialize a TfidfVectorizer
tfidf_vec = TfidfVectorizer(stop_words = 'english', max_df=0.7)

#Fit and transform train set, transform test set
tfidf_train = tfidf_vec.fit_transform(X_train)
tfidf_test = tfidf_vec.transform(X_test)

In [20]:
#Initialise the PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(max_iter=50)
clf.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(max_iter=50)

In [21]:
#Predicting the test set and calculating score
y_predicted = clf.predict(tfidf_test)
clf.score(tfidf_test,y_test)

0.9423835832675612