# Fake News Classifier
 
Data : https://www.kaggle.com/jruvika/fake-news-detection/home

In [1]:
# Hide deprecated warnings of sklearn package
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


Import the data.

In [None]:
import pandas as pd
df = pd.read_csv("Data/data.csv")
df.head()

In [2]:
# Drop the data with null or undefined values
df = df.dropna()
# Get the number of each label in the data
fake = df[df.Label ==  0]
real = df[df.Label ==  1]

print('Number of Fake Articles -> ', fake.shape)
print('Number of Real Articles -> ', real.shape)

# Max Count of words in Document - Split by 10,000
max = 0
total = 0
count = 0;
for i in range(df.shape[0]):
    length = len(df.iloc[i,2])
    total += length
    count += 1
    if(length > max):
        max = length
    
print("Maximum Length ", max)
print("Avg Length", total / count)

Number of Fake Articles ->  (2120, 4)
Number of Real Articles ->  (1868, 4)
Maximum Length  32767
Avg Length 2941.288365095286


### Train/Test Split

Using Stratified sampling, split the data into 70-30.

In [3]:
from sklearn.model_selection import train_test_split
y = df.pop('Label')
x = df

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

train_count = y_train.value_counts()
test_count = y_test.value_counts()

print('Number of Fake articles in Training set -> ', train_count[0])
print('Number of Real articles in Training set -> ', train_count[1])
print('Number of Fake articles in Testing set -> ', test_count[0])
print('Number of Real articles in Testing set -> ', test_count[1])

Number of Fake articles in Training set ->  1696
Number of Real articles in Training set ->  1494
Number of Fake articles in Testing set ->  424
Number of Real articles in Testing set ->  374


### Data Preprocessing

* Tokenization
* Normalization
    * Lowercase all the words
    * Negation Handling
    * Remove Stopwords
    * Remove punctuations and Empty Strings from the array
* Stemming

Source - https://medium.com/@annabiancajones/sentiment-analysis-of-reviews-text-pre-processing-6359343784fb

#### Setup:

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


ps = PorterStemmer()

# Load the Apostrophes connecting words
appos_file = open('appos.txt','r')
appos = eval(appos_file.read())
appos_file.close()

# Function returns the negation handled word if it is presend in the appos dictionary
# Else returns the word itself
def negationHandling(word):
    if word in appos:
        return appos[word]
    else:
        return word
    
# Check if a word is a Stopword
# Stopword is a word that is commonly present in most of the documents and does not affect the model
def isNotStopWord(word):
    return word not in stopwords.words('english')

# Function to preprocess a single article
# Document refers to the text of the Article.
def processDocument(document):
    sentences = nltk.sent_tokenize(document)
    tokens = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        
        #Converting to LowerCase
        words = map(str.lower, words)
        
        # Negation Handling map is'nt to is not : 
        words = map(lambda x: negationHandling(x), words)
        
        # Remove stop words
        words = filter(lambda x: isNotStopWord(x), words)
        
        # Removing punctuations except '<.>/<?>/<!>'
        punctuations = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'
        words = map(lambda x: x.translate(str.maketrans('', '', punctuations)), words)
        
        # Remove empty strings
        words = filter(lambda x: len(x) > 0, words)
        
        # stemming
        words = map(lambda x: ps.stem(x), words)
        
        # Adding the preprocessed words to the document
        tokens = tokens + list(words)
        
    return tokens    

#### Process the data:

In [5]:
# Processing the body i.e. test of the Article
train_Body = x_train.loc[:,'Body']
train_docBodyWordArray = []

for i in range(x_train.shape[0]):
    train_docBodyWordArray.append(train_Body.iloc[i])
        
train_BodywordArray = list(map(lambda x: processDocument(x), train_docBodyWordArray))
print("Preprocessing Completed for Body of training data")

# Process the Headlines of the training data.
train_headLine = x_train.loc[:,'Headline']

train_docHeadLineWordArray = []
for i in range(x_train.shape[0]):
    train_docHeadLineWordArray.append(train_headLine.iloc[i])
        
train_HeadLineArray = list(map(lambda x: processDocument(x), train_docHeadLineWordArray))
print("Preprocessing Completed for HeadLine of training data")

Preprocessing Completed for Body of training data
Preprocessing Completed for HeadLine of training data


### Doc2Vec Model Training

In [7]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_DocBodyData = [TaggedDocument(words=train_BodywordArray[i], tags=[str(i)]) for i, _d in enumerate(train_BodywordArray)]

max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(
    vector_size = vec_size,
    alpha = alpha, 
    min_alpha = 0.025,
    min_count = 5,
    window = 10,
    dm = 1)

model.build_vocab(tagged_DocBodyData)
print('Training Doc2Vec Model')

for epoch in range(max_epochs):
    if ((epoch + 1) % 10 == 0):
        print('Training iteration {0}'.format(epoch + 1))
    model.train(tagged_DocBodyData,total_examples = model.corpus_count, epochs = model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")



Training iteration 9
Training iteration 19
Training iteration 29
Training iteration 39
Training iteration 49
Training iteration 59
Training iteration 69
Training iteration 79
Training iteration 89
Training iteration 99
Model Saved


### Preprocess the testing data

In [9]:
# Testing Data Body of the article
test_headLine = x_test.loc[:,'Headline']

test_docHeadLineWordArray = []
for i in range(x_test.shape[0]):
    test_docHeadLineWordArray.append(test_headLine.iloc[i])
        
test_HeadLineArray = list(map(lambda x: processDocument(x), test_docHeadLineWordArray))
print("Preprocessing Completed for HeadLine of testing data")

# Testing Data Headline of the article
test_body = x_test.loc[:,'Body']

test_docBodyWordArray = []
for i in range(x_test.shape[0]):
    test_docBodyWordArray.append(test_body.iloc[i])
        
test_BodyArray = list(map(lambda x: processDocument(x), test_docBodyWordArray))
print("Preprocessing Completed for Body of testing data")

Preprocessing Completed for HeadLine of testing data
Preprocessing Completed for Body of testing data


### Get word vectors using the trained doc2vec model

In [10]:
import numpy
model = Doc2Vec.load("d2v.model")

# Training set Body Word Vector  
train_bodyVector = []
for i in range(x_train.shape[0]):
    train_bodyVector.append(model.docvecs[i])

# Training data set Headline Word Vectors
train_headLineVector = []
for i in range(x_train.shape[0]):
    train_headLineVector.append(model.infer_vector(train_HeadLineArray[i]))

# Testing set Headline Word Vectors
test_headLineVector = []
for i in range(x_test.shape[0]):
    test_headLineVector.append(model.infer_vector(test_HeadLineArray[i]))    

# Testing set Body Word Vector
test_bodyVector = []
for i in range(x_test.shape[0]):
    test_bodyVector.append(model.infer_vector(test_BodyArray[i]))
    
# Create Numpy Array for training data to train sklearn models
np_trainHeadline = numpy.array([numpy.array(xi) for xi in train_headLineVector]) 
np_trainBody = numpy.array([numpy.array(xi) for xi in train_bodyVector])

inp_x_train = []
for i in range(x_train.shape[0]):
    inp_x_train.append(numpy.concatenate((np_trainHeadline[i], np_trainBody[i])))

inp_x_train = numpy.array(inp_x_train)

# Create Numpy Array for testing data to train sklearn models
np_testHeadline = numpy.array([numpy.array(xi) for xi in test_headLineVector])
np_testBody = numpy.array([numpy.array(xi) for xi in test_bodyVector])

inp_x_test = []
for i in range(x_test.shape[0]):
    inp_x_test.append(numpy.concatenate((np_testHeadline[i],np_testBody[i])))

inp_x_test = numpy.array(inp_x_test)

print('Shape of the numpy training data', inp_x_train.shape)
print('Shape of the numpy training data', inp_x_test.shape)

Shape of the numpy training data (3190, 600)
Shape of the numpy training data (798, 600)


### SVM

Using GridSearchCV to find the optimal parameters for this training the model

In [12]:
from sklearn import svm

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

C = [0.1, 0.5, 1, 5, 10, 50]
param_grid = [
    {'C': C, 'kernel': ['linear']},
    {'C': C, 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
    {'degree': [2,3,4], 'kernel': ['poly']},
    {'coef0': [0.0], 'kernel': ['sigmoid']} 
]

table = {}

score_metric = 'accuracy'
clf = GridSearchCV(svm.SVC(), param_grid, cv = 5, scoring = score_metric)
clf.fit(inp_x_train, y_train)
print("Best parameters set found :", clf.best_params_)
means = clf.cv_results_['mean_test_score']

for mean, params in zip(means, clf.cv_results_['params']):
    if params == clf.best_params_:
        print("%s -> %0.3f" % (score_metric, mean))
    key = str(params)
    if key not in table:
        table[key] = []
    table[key].append("%0.3f" % (mean))
print()



Best parameters set found : {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
accuracy -> 0.917



#### Train the SVC with above parameter

In [14]:
from sklearn.metrics import accuracy_score

svm_model = svm.SVC(C = 10, gamma = 0.0001, kernel = 'rbf')
svm_model.fit(inp_x_train, y_train)
y_pred = svm_model.predict(inp_x_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy -> ', accuracy)

Accuracy ->  0.7192982456140351


### Gaussian Naive Bayes

In [15]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(inp_x_train, y_train)

nb_y_pred = gnb.predict(inp_x_test)

accuracy = accuracy_score(y_test, nb_y_pred)
print('Accuracy -> ', accuracy)

Accuracy ->  0.5325814536340853


### Decision Tree 

In [17]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(inp_x_train, y_train)

dt_y_pred = dt_clf.predict(inp_x_test)

accuracy = accuracy_score(y_test, dt_y_pred)
print('Accuracy -> ', accuracy)

Accuracy ->  0.4548872180451128
