# Implementing LSTM

Applied to a fake-news classifier: an LSTM is trained to classify news headlines.

# 1)- Import Key Modules

In [1]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance; used later to reduce each headline word to its stem.
ps = PorterStemmer()
# Make sure the NLTK stop-word corpus is available locally. Kept as the last
# expression so the cell displays the call's return value.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hassan.sherwani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# 2- Loading and preparing data

- Dataset: https://www.kaggle.com/c/fake-news/data#

In [4]:
# Read the raw training set and show its (rows, columns) dimensions.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.shape

(20800, 5)

In [5]:
# Peek at the first three rows to see which columns are available.
df.head(3)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1


In [6]:
# Number of missing values in each column.
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Discard every row that has a missing value in any column, then re-check size.
df = df.dropna(axis='index', how='any')
df.shape

(18285, 5)

In [8]:
# Persist the cleaned frame; the row index is not written to the file.
df.to_csv(path_or_buf='ready_data.csv', index=False)

In [9]:
# Reload the cleaned data. The file was written without an index column, so the
# frame comes back with a fresh 0..n-1 index (dropna had left gaps in the old one).
df = pd.read_csv(filepath_or_buffer='ready_data.csv')
df.shape

(18285, 5)

In [10]:
# Independent feature: the headline text, kept as a one-column DataFrame.
X = df.loc[:, ['title']]

In [11]:
# Dependent variable: the label column, as a Series.
y = df.loc[:, 'label']

In [12]:
# Features and labels must have the same number of rows.
for frame in (X, y):
    print(frame.shape)

(18285, 1)
(18285,)


### 2.1.Corpus building

In [13]:
# Work on an independent copy so that X itself is left untouched.
titles = X.copy(deep=True)

In [14]:
# Build the cleaned corpus: for every headline keep letters only, lowercase it,
# drop English stop words and stem what is left.
ps = PorterStemmer()

# Load the stop-word list once, outside the loop, as a set: calling
# stopwords.words('english') per word re-reads the list and scans it linearly.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# frame having a gap-free 0..n-1 index.
for title in titles['title']:
    words = non_letters.sub(' ', title).lower().split()
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [15]:
# Show the first five cleaned (stop-word-free, stemmed) headlines.
corpus[:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

### 2.2.One_hot encoding

In [16]:
# Vocabulary size handed to one_hot as the bound for the word indices.
voc_size = 5000
# Encode each cleaned headline as a list of integer word indices.
onehot_repr = [one_hot(headline, voc_size) for headline in corpus]
onehot_repr[:5]

[[4353, 2107, 3696, 190, 4014, 56, 2774, 1741, 2048, 1582],
 [1561, 1146, 1382, 2049, 4158, 3757, 3695],
 [3497, 1692, 755, 4558],
 [3965, 2120, 2303, 3894, 2523, 3275],
 [3484, 4158, 1283, 2527, 4078, 2157, 4158, 3397, 4874, 3230]]

[hous dem aid even see comey letter jason chaffetz tweet] is encoded as

[4353, 2107, 3696, 190, 4014, 56, 2774, 1741, 2048, 1582]

### 2.3.Embedding Representation

In [17]:
# Bring every index sequence to one fixed length by padding at the front.
sent_length = 20
embedded_docs = pad_sequences(
    sequences=onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs[:5])

[[   0    0    0    0    0    0    0    0    0    0 4353 2107 3696  190
  4014   56 2774 1741 2048 1582]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 1561
  1146 1382 2049 4158 3757 3695]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0 3497 1692  755 4558]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
  3965 2120 2303 3894 2523 3275]
 [   0    0    0    0    0    0    0    0    0    0 3484 4158 1283 2527
  4078 2157 4158 3397 4874 3230]]


Notice that the sentence length is 20 and our first sentence has only 10 words, so the 10 leading positions are padded with zeros (`padding='pre'`).

# 3- Model

In [18]:
# Hyper-parameters shared by the model definition and the training cells.
embedding_vector_features=40 # dimension of embedding layer
voc_size=5000  # vocabulary size; same value as used for the one_hot encoding
sent_length=20  # padded sequence length; same value as used for pad_sequences
output_layer= 1  # number of units in the final Dense layer
epochs=10  # training epochs passed to fit()
batch_size=64  # mini-batch size passed to fit()

### 3.1. Defining basic model architecture

In [19]:
# Baseline network: embedding -> single LSTM layer -> sigmoid output.
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    LSTM(100),  # one LSTM layer with 100 units
    Dense(output_layer, activation='sigmoid'),
])

In [20]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


##### Summary params calculation

https://datascience.stackexchange.com/questions/10615/number-of-parameters-in-an-lstm-model

**There are three steps:**

- For the embedding layer: embedding dimension (40) × vocab_size (5000) = 200,000

- For the LSTM layer the formula is: $\text{params} = g \times [h(h + i) + h]$

where g = number of weight sets (gates) in the recurrent unit: a feed-forward or simple RNN layer has 1, a GRU has 3 and an LSTM has 4, so g = 4 here<br>
h = hidden size (number of units in the LSTM layer), i.e. 100<br>
i = input size, i.e. embedding_vector_features, the dimension of each vector the embedding layer feeds into the LSTM layer (40)<br>
bias = one bias per hidden unit in each gate, i.e. h = 100 (the last term inside the brackets)<br>
This gives 4 × [100 × (100 + 40) + 100] = 56,400.

- For the Dense (output) layer: LSTM outputs acting as inputs (100) × neurons in the output layer (1) + bias (1, as there is only one output neuron) = 101

In [22]:
# Number of padded sequences next to the shape of the label vector.
embedded_docs.shape[0], y.shape

(18285, (18285,))

In [23]:
# Materialise features and labels as NumPy arrays for splitting and training.
X_final, y_final = np.array(embedded_docs), np.array(y)

In [24]:
# Final check of the array dimensions before splitting.
for arr in (X_final, y_final):
    print(arr.shape)

(18285, 20)
(18285,)


### 3.2.split train_test

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.20,
    random_state=42,
)

### 3.3.Model Training

In [26]:
# Train the baseline model; the held-out test split doubles as validation data.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Train on 14628 samples, validate on 3657 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x253819058c8>

### 3.4. evaluate model

In [27]:
# Sequential.predict_classes() is deprecated and was removed in TensorFlow 2.6.
# For a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5, which yields the same (n, 1) array of 0/1 labels.
y_pred = (model.predict(X_test) > 0.5).astype("int32")
y_pred[:5]

array([[1],
       [0],
       [0],
       [1],
       [1]])

In [28]:
# Confusion matrix of the baseline model on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1918,  164],
       [ 170, 1405]], dtype=int64)

In [29]:
# Fraction of test headlines the baseline model classifies correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9086683073557561

### 3.5.adding dropout

Adding dropout layers slightly improves the results.

In [30]:
# Same architecture as the baseline model, with dropout (rate 0.3) applied to
# the embedding output and to the LSTM output. Dropout is already imported in
# the imports cell and the hyper-parameters come from the configuration cell,
# so both models are built from the same settings.
model_dropout = Sequential()
model_dropout.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model_dropout.add(Dropout(0.3))
model_dropout.add(LSTM(100))
model_dropout.add(Dropout(0.3))
model_dropout.add(Dense(output_layer, activation='sigmoid'))
model_dropout.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" line.
model_dropout.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [31]:
# Train the dropout variant with the same data, epochs and batch size as the baseline.
model_dropout.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Train on 14628 samples, validate on 3657 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2538c4b66c8>

In [32]:
# Sequential.predict_classes() is deprecated and was removed in TensorFlow 2.6.
# For a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5, which yields the same (n, 1) array of 0/1 labels.
y_pred = (model_dropout.predict(X_test) > 0.5).astype("int32")
y_pred[:5]

array([[1],
       [0],
       [0],
       [1],
       [1]])

In [33]:
# Fraction of test headlines the dropout model classifies correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9141372709871479

We got slightly better test accuracy by adding dropout (0.914 vs. 0.909). The training accuracy is slightly lower with dropout, i.e. the model overfits the training data a little less.