### Fake news Classifier using BiDirectional LSTM



In [1]:
import pandas as pd

df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [2]:
df = df.dropna()
df.shape

(18285, 5)

In [3]:
## Get Independent Features
X = df.drop('label',axis=1)

## Get Dependent Feature
y = df['label']

print(X.shape,y.shape) 

(18285, 4) (18285,)


### Text Preprocessing or Data Preprocessing

In [4]:
messages = X.copy()
messages.reset_index(inplace=True)
messages.head()

Unnamed: 0,index,id,title,author,text
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [5]:
messages['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [8]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
ps = PorterStemmer()
wn = WordNetLemmatizer()
corpus = []

for i in range(0,len(messages)):
    ## Substitute all character except a-zA-Z
    review = re.sub('[^a-zA-Z]',' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [wn.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review) ### It will make a complete sentence , If not written words will be splitted in each sentence
    corpus.append(review)
    
corpus

['house dem aide even see comey letter jason chaffetz tweeted',
 'flynn hillary clinton big woman campus breitbart',
 'truth might get fired',
 'civilian killed single u airstrike identified',
 'iranian woman jailed fictional unpublished story woman stoned death adultery',
 'jackie mason hollywood would love trump bombed north korea lack trans bathroom exclusive video breitbart',
 'beno hamon win french socialist party presidential nomination new york time',
 'back channel plan ukraine russia courtesy trump associate new york time',
 'obama organizing action partner soros linked indivisible disrupt trump agenda',
 'bbc comedy sketch real housewife isi cause outrage',
 'russian researcher discover secret nazi military base treasure hunter arctic photo',
 'u official see link trump russia',
 'yes paid government troll social medium blog forum website',
 'major league soccer argentine find home success new york time',
 'well fargo chief abruptly step new york time',
 'anonymous donor pay 

### Learn an Embedding

In [10]:
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional

### One Hot Representation

In [11]:
# integer encode the documents
vocab_size = 5000
encoded_docs_one_hot = [one_hot(words, vocab_size) for words in corpus]
print(encoded_docs_one_hot)

### Each word will be converted to Integer (Index in Vocab Size)

[[998, 4079, 840, 3612, 4397, 1081, 4098, 3462, 2613, 2939], [884, 4785, 2288, 1870, 742, 3423, 3000], [2658, 2870, 3060, 2414], [4426, 1551, 588, 1311, 2789, 2291], [558, 742, 3295, 3060, 3148, 2239, 742, 4967, 1663, 99], [2410, 2075, 1649, 2842, 428, 27, 1579, 212, 421, 3010, 4290, 588, 4816, 4802, 3000], [430, 1958, 1145, 3484, 4362, 3180, 3837, 1308, 162, 918, 3782], [2203, 4542, 4596, 570, 1201, 3066, 27, 2877, 162, 918, 3782], [3634, 1226, 1630, 3232, 3945, 97, 2432, 1670, 27, 2153], [238, 4973, 2959, 4771, 2857, 4267, 1935, 4747], [3070, 4262, 116, 1579, 2925, 3380, 978, 4125, 1993, 1882, 3419], [1311, 3928, 4397, 2555, 27, 1201], [3845, 995, 3347, 1844, 2055, 2395, 1221, 1127, 1935], [4525, 4140, 2721, 3475, 1308, 2994, 2123, 162, 918, 3782], [3253, 1441, 1677, 2343, 3495, 162, 918, 3782], [2539, 691, 557, 1540, 1241, 777, 1681, 4498, 2043, 4487], [4898, 4665, 4785], [2595, 1110, 3982, 2175, 27, 969, 215, 3000], [2607, 1035, 2288, 4063, 183, 4695, 1368, 2106, 2239], [676, 2779,

### Using Padding Sequences


In [12]:
max([len(sen.split(' ')) for sen in corpus ])

47

In [13]:
### Find Max no of words in Whole lists of sentence 
# sent_length=8
sent_length = max([len(sen.split(' ')) for sen in corpus ])
embedded_docs=pad_sequences(encoded_docs_one_hot,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 3462 2613 2939]
 [   0    0    0 ...  742 3423 3000]
 [   0    0    0 ... 2870 3060 2414]
 ...
 [   0    0    0 ...  162  918 3782]
 [   0    0    0 ... 1645 1238 1572]
 [   0    0    0 ... 4849  225 4008]]


In [14]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,  998, 4079,  840, 3612, 4397, 1081, 4098,
       3462, 2613, 2939], dtype=int32)

### Creating Model

In [15]:
### Now Each Word will be converted to 40 Dimensions -- 47 OF Sen1 -- Each integer of 47 will be converted to 40 features
embedding_vector_features=40 
model=Sequential()
model.add(Embedding(vocab_size,embedding_vector_features,input_length=sent_length))
model.add(Bidirectional(LSTM(100)))  ### LSTM model is replaced with these line , You only need to change these for Bidirectional 
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 47, 40)            200000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               112800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
import numpy as np

X_final = np.array(embedded_docs)
y_final = np.array(y)

X_final.shape,y_final.shape

((18285, 47), (18285,))

### Train Test split

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((12250, 47), (12250,), (6035, 47), (6035,))

### Model Training 

In [18]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f00f7ed4240>

### Performance Metrics

In [19]:
y_pred=model.predict_classes(X_test)

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))



0.912013256006628
[[3092  327]
 [ 204 2412]]
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      3419
           1       0.88      0.92      0.90      2616

    accuracy                           0.91      6035
   macro avg       0.91      0.91      0.91      6035
weighted avg       0.91      0.91      0.91      6035



### Adding Dropout

In [20]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(vocab_size,embedding_vector_features,input_length=sent_length))
model1.add(Dropout(0.3))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 47, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 47, 40)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               112800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
### Finally Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f01008a3320>

In [24]:
y_pred=model1.predict_classes(X_test)

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))



0.9161557580778791
[[3108  311]
 [ 195 2421]]
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      3419
           1       0.89      0.93      0.91      2616

    accuracy                           0.92      6035
   macro avg       0.91      0.92      0.92      6035
weighted avg       0.92      0.92      0.92      6035



### Accuracy increase after we use Dropout layer