# Implementing LSTM

Using Fake news classifier

# 1)- Import Key Modules

In [1]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hassan.sherwani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# 2- Loading and preparing data

- Dataset: https://www.kaggle.com/c/fake-news/data#

In [4]:
df=pd.read_csv('train.csv')
df.shape

(20800, 5)

In [5]:
df.head(3)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1


In [6]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
###Drop Nan Values
df=df.dropna()
df.shape

(18285, 5)

In [8]:
df.to_csv('ready_data.csv', index=0)

In [9]:
df=pd.read_csv('ready_data.csv')
df.shape

(18285, 5)

In [10]:
## Get the Independent Features

X=df[['title']]

In [11]:
## Get the Dependent features
y=df['label']

In [12]:
print(X.shape)
print(y.shape)

(18285, 1)
(18285,)


### 2.1.Corpus building

In [13]:
messages=X.copy()

In [14]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [15]:
corpus[:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

### 2.2.One_hot encoding

In [16]:
### Vocabulary size
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr[:5]

[[1300, 366, 4202, 2417, 1428, 976, 3785, 4799, 4508, 449],
 [2578, 4459, 1049, 4563, 409, 3146, 1056],
 [2291, 2421, 3470, 476],
 [4852, 4171, 2826, 4964, 3213, 4456],
 [3067, 409, 3319, 4246, 3143, 323, 409, 1357, 4245, 262]]

[hous dem aid even see comey letter jason chaffetz tweet] is encoded as

[4355, 4269, 4829, 4248, 2507, 151, 842, 857, 2473, 1444]

### 2.3.Embedding Representation

In [17]:
#padding
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs[:5])

[[   0    0    0    0    0    0    0    0    0    0 1300  366 4202 2417
  1428  976 3785 4799 4508  449]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 2578
  4459 1049 4563  409 3146 1056]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0 2291 2421 3470  476]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
  4852 4171 2826 4964 3213 4456]
 [   0    0    0    0    0    0    0    0    0    0 3067  409 3319 4246
  3143  323  409 1357 4245  262]]


Notice that sentence lenth is 20 and our 1st sentence has 10 words. So all other words are padded as zeros.

# 3- Model

In [18]:
embedding_vector_features=40 # dimension of embedding layer
voc_size=5000
sent_length=20
output_layer= 1 
epochs=10
batch_size=64

### 3.1. Defining basic model architecture

In [19]:
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100)) #what's this
model.add(Dense(1,activation='sigmoid'))

In [20]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [21]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
len(embedded_docs),y.shape

(18285, (18285,))

In [23]:
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [24]:
print(X_final.shape)
print(y_final.shape)

(18285, 20)
(18285,)


### 3.2.split train_test

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.20, random_state=42)

### 3.3.Model Training

In [26]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=epochs,batch_size=batch_size)

Train on 14628 samples, validate on 3657 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x18c5baf0488>

### 3.4. evaluate model

In [27]:
y_pred=model.predict_classes(X_test)
y_pred[:5]

array([[1],
       [0],
       [0],
       [0],
       [1]])

In [28]:
confusion_matrix(y_test,y_pred)

array([[1905,  177],
       [ 171, 1404]], dtype=int64)

In [29]:
accuracy_score(y_test,y_pred)

0.9048400328137818

### 3.5.adding dropout

results are sligtly improved

In [30]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model_dropout=Sequential()
model_dropout.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model_dropout.add(Dropout(0.3))
model_dropout.add(LSTM(100))
model_dropout.add(Dropout(0.3))
model_dropout.add(Dense(1,activation='sigmoid'))
model_dropout.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [31]:
model_dropout.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=epochs,batch_size=batch_size)

Train on 14628 samples, validate on 3657 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x18c637325c8>

In [32]:
y_pred=model_dropout.predict_classes(X_test)
y_pred[:5]

array([[1],
       [0],
       [0],
       [0],
       [1]])

In [33]:
accuracy_score(y_test,y_pred)

0.9119496855345912