## Fake News Classifier Using Bidirectional LSTM

**Competition Link :** https://www.kaggle.com/c/fake-news/overview

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width: 100% !important; }</style>"))
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pylab import rcParams
rcParams['figure.figsize'] = 22, 7
import warnings
warnings.filterwarnings('ignore')

In [18]:
import pandas as pd 
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\92304\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('./data/train.csv')
df.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [4]:
df.dropna(inplace=True)

In [5]:
X = df.drop('label', axis=1)
y = df['label']
X.shape, y.shape

((18285, 4), (18285,))

In [6]:
vocab_size = 5000

### One Hot Representation

In [7]:
messages = X.copy()
messages.reset_index(inplace=True)

In [8]:
## Data Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus[0:3]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire']

In [10]:
onehot_repr = [one_hot(word, vocab_size) for word in corpus]
onehot_repr

[[657, 4107, 755, 4150, 1394, 945, 2450, 1488, 4586, 8],
 [930, 1903, 3175, 3889, 2528, 1641, 1159],
 [1193, 4994, 3932, 2694],
 [2543, 2909, 2536, 4408, 3468, 2024],
 [4439, 2528, 3268, 993, 3517, 2398, 2528, 2541, 4216, 1448],
 [3616,
  383,
  981,
  1850,
  1422,
  4718,
  2785,
  3456,
  2014,
  2223,
  443,
  3337,
  3796,
  3598,
  1159],
 [4526, 961, 4120, 1260, 4399, 2206, 3068, 4409, 4471, 3258, 3132],
 [3784, 1428, 4037, 2013, 3566, 1938, 4718, 4731, 4471, 3258, 3132],
 [2259, 4972, 1236, 3090, 31, 1835, 3999, 1559, 4718, 4975],
 [1931, 2009, 3312, 2834, 1967, 4600, 1090, 2815],
 [1285, 1701, 1850, 310, 1585, 3141, 7, 1351, 1145, 1309, 137],
 [4408, 2882, 1394, 1835, 4718, 3566],
 [529, 2172, 4858, 2156, 1264, 3206, 3447, 945, 1979],
 [2012, 3189, 4870, 4473, 4965, 967, 3880, 4471, 3258, 3132],
 [4450, 3068, 2598, 3863, 2776, 4471, 3258, 3132],
 [1925, 521, 1051, 3860, 3277, 170, 3674, 1935, 2725, 3028],
 [529, 1021, 1903],
 [2042, 3563, 2445, 4028, 4718, 2534, 668, 1159],
 [

### Embedding Representaion 

In [11]:
sent_len = 20
embadded_docs = pad_sequences(onehot_repr, padding='pre', maxlen= sent_len)
print(embadded_docs)
print(embadded_docs[0])

[[   0    0    0 ... 1488 4586    8]
 [   0    0    0 ... 2528 1641 1159]
 [   0    0    0 ... 4994 3932 2694]
 ...
 [   0    0    0 ... 4471 3258 3132]
 [   0    0    0 ...  385 1921 4204]
 [   0    0    0 ... 1083 3710  555]]
[   0    0    0    0    0    0    0    0    0    0  657 4107  755 4150
 1394  945 2450 1488 4586    8]


In [20]:
## Creating The Model 
embedding_vector_features = 40 
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length= sent_len))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer= 'adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
dropout_4 (Dropout)          (None, 20, 40)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               112800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
len(embadded_docs), y.shape

(18285, (18285,))

In [22]:
## Coverting the input features to array
X_final = np.array(embadded_docs)
y_final = np.array(y)
X_final.shape, y_final.shape

((18285, 20), (18285,))

In [23]:
X_train, X_test , y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)
X_train.shape,X_test.shape, y_train.shape, y_test.shape

((12250, 20), (6035, 20), (12250,), (6035,))

In [24]:
%%time 
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size= 64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Wall time: 1min 3s


<keras.callbacks.History at 0x1eae8a47820>

## Performance Metrics and Accuracy

In [25]:
print(classification_report(y_test, model.predict(X_test)>0.5))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3419
           1       0.88      0.91      0.89      2616

    accuracy                           0.91      6035
   macro avg       0.90      0.91      0.90      6035
weighted avg       0.91      0.91      0.91      6035

