## **BiLSTM with random initialization**

**Load the required dependencies and Keras**


In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py


In [None]:
!pip install sentencepiece


Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 21.1MB/s eta 0:00:01[K     |▌                               | 20kB 17.1MB/s eta 0:00:01[K     |▉                               | 30kB 14.0MB/s eta 0:00:01[K     |█                               | 40kB 13.0MB/s eta 0:00:01[K     |█▍                              | 51kB 8.5MB/s eta 0:00:01[K     |█▋                              | 61kB 8.7MB/s eta 0:00:01[K     |██                              | 71kB 9.0MB/s eta 0:00:01[K     |██▏                             | 81kB 10.1MB/s eta 0:00:01[K     |██▌                             | 92kB 9.1MB/s eta 0:00:01[K     |██▊                             | 102kB 8.1MB/s eta 0:00:01[K     |███                             | 112kB 8.1MB/s eta 0:00:01[K     |███▎              

In [None]:
pip install --upgrade keras

Requirement already up-to-date: keras in /usr/local/lib/python3.7/dist-packages (2.4.3)


In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
import keras
import tensorflow as tf
from keras.layers import LSTM
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [None]:
# Expose Google Drive inside the Colab VM under /content/drive.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


**Load the csv files**

In [None]:
# Read the labelled dataset from the mounted Drive, then split it twice with the
# same seed: 20% is held out as `test`, and 20% of the remaining 80% becomes
# `val` (roughly a 64/16/20 train/val/test partition).
cod_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/fake-news/train")
cod_train2, test = train_test_split(cod_train, test_size=0.2, random_state=0)
train, val = train_test_split(cod_train2, test_size=0.2, random_state=0)

**Tokenizer**

In [None]:
max_features = 100000  # vocabulary cap passed to the tokenizer (max number of words)
maxlen = 250           # every sequence is padded/truncated to this many tokens
embedding_size = 200   # dimensionality of the embedding vectors used by the model

# Create the tokenizer with the maximum number of words to keep, based on word
# frequency (only the most common num_words-1 words are kept).
# oov_token is the *string* substituted for out-of-vocabulary words; passing the
# boolean True registered the object `True` as a vocabulary entry.
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')

# Cast the text column to str so the tokenizer only ever sees strings.
# assign() returns new DataFrames instead of writing into the slices produced by
# train_test_split, which is what raised SettingWithCopyWarning.
# NOTE: astype(str) turns missing values into the literal string 'nan'.
train = train.assign(text=train['text'].astype(str))
test = test.assign(text=test['text'].astype(str))
val = val.assign(text=val['text'].astype(str))

# Fit the vocabulary on the training article text only, so no information from
# the validation/test splits leaks into the word index.
tokenizer.fit_on_texts(list(train['text']))

# Transform each text into a sequence of integer word ids.
train_X = tokenizer.texts_to_sequences(train['text'])
test_X = tokenizer.texts_to_sequences(test['text'])
val_X = tokenizer.texts_to_sequences(val['text'])

# Turn the lists of sequences into 2D NumPy arrays of shape
# (num_samples, maxlen); shorter sequences are zero-padded and longer ones are
# truncated (Keras defaults: both at the front).
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)

# Target labels for each split.
train_y = train['label']
test_y = test['label']
val_y = val['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


**The BiLSTM construction**

- Activation Function: ReLU is used as the activation function of the hidden dense layers. It is a non-linear activation function, which helps the model capture complex relationships in the data. The output layer uses a sigmoid activation.

- Optimizer: Adam optimizer, an adaptive learning rate optimizer.

- Loss function: The network is trained with binary cross-entropy loss on a sigmoid output, i.e. it outputs the probability of the positive class (label 1).

In [None]:
# Number of timesteps fed to the network (second axis of the padded training matrix).
sequence_length = train_X.shape[1]

# Embedding -> BiLSTM -> global max pooling -> two dense+dropout stages -> sigmoid.
# No pretrained weight matrix is passed to Embedding, so the embeddings start
# from the layer's default initialisation and are learned during training.
model = Sequential([
    Embedding(max_features, embedding_size, input_length=sequence_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(40, activation="relu"),
    Dropout(0.5),
    Dense(20, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        keras.metrics.Precision(),
        keras.metrics.Recall(),
        keras.metrics.TruePositives(),
    ],
)

**Save the best model, early stopping and fit the model**


In [None]:

# Checkpoint the full model whenever validation accuracy improves.
# The model is compiled with metrics=['accuracy'], so Keras logs the validation
# metric under the key 'val_accuracy'. Monitoring 'val_acc' never matches a
# logged key, so the checkpoint was skipped every epoch (with only a warning)
# and best_model.hdf5 was never written.
saveBestModel = keras.callbacks.ModelCheckpoint(
    '/content/drive/My Drive/TFMColab/best_model.hdf5',
    monitor='val_accuracy',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch',  # replaces the deprecated `period=1`
)
# Stop training once validation loss has not improved for 3 consecutive epochs.
# NOTE: restore_best_weights is not set, so after fit() the in-memory `model`
# holds the weights of the last epoch run, not those of the best epoch.
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
batch_size = 100
epochs = 25
model.fit(train_X, train_y, batch_size=batch_size, epochs=epochs, validation_data=(val_X, val_y), callbacks=[saveBestModel, earlyStopping])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


<tensorflow.python.keras.callbacks.History at 0x7f8ac072e990>

**Load the metrics and show them**

In [None]:
# evaluate() returns the loss followed by the metrics in the order they were
# passed to compile(): accuracy, precision, recall, true positives.
loss, accuracy, precision, recall, true_positives = model.evaluate(
    x=test_X, y=test_y, batch_size=batch_size
)




In [None]:
# F1 is the harmonic mean of precision and recall: 2 * P * R / (P + R).
mult_pr = precision * recall
sum_pr = precision + recall
# When the model finds no true positives both precision and recall are 0 and
# the ratio is undefined; report F1 = 0 instead of raising ZeroDivisionError.
div = mult_pr / sum_pr if sum_pr > 0 else 0.0
f1_score = 2 * div

In [None]:
# Report every test-set metric, one "<name>: <value>" line each.
for label, value in (
    ('Loss:', loss),
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('f1 score:', f1_score),
    ('True positives:', true_positives),
):
    print(label, value)

Loss: 0.22517189383506775
Accuracy: 0.9516826868057251
Precision: 0.9496943950653076
Recall: 0.9555345177650452
f1 score: 0.9526055055135503
True positives: 2020.0


In [None]:
# Sequential.predict_classes() is deprecated and has been removed from newer
# TensorFlow/Keras releases. For a single sigmoid output unit the equivalent is
# thresholding the predicted probability at 0.5, which yields the same
# (num_samples, 1) int32 array of 0/1 class labels.
pred_y = (model.predict(test_X, batch_size=batch_size) > 0.5).astype("int32")
# Rows are true labels, columns are predicted labels.
confusion_matrix(test_y, pred_y)




array([[1939,  107],
       [  94, 2020]])

**Finally, extract False Positives and False Negatives to csv files**

In [None]:
def getFP_FN_TP_lists(test_X, test_y, pred_y, texts=None):
    """Collect the texts and index labels of false positives, false negatives and true positives.

    Parameters
    ----------
    test_X : unused
        Kept only so existing calls keep working.
    test_y : pandas.Series
        True labels (0/1). Its index labels identify the samples and are used
        to look the texts up.
    pred_y : array-like
        Predicted labels (0/1), positionally aligned with ``test_y``. Both a
        flat ``(n,)`` array and a column ``(n, 1)`` array are accepted.
    texts : pandas.Series, optional
        Texts indexed by the same labels as ``test_y``. Defaults to the
        notebook-level ``test['text']`` column, which is what this function
        always read before the parameter existed.

    Returns
    -------
    tuple of six lists
        ``(FP_text, FP_index, FN_text, FN_index, TP_text, TP_index)``.

    Raises
    ------
    ValueError
        If ``pred_y`` and ``test_y`` do not contain the same number of samples.
    """
    if texts is None:
        # Backward-compatible default: read from the notebook-global test frame.
        texts = test['text']

    # Flatten so that (n, 1) model outputs compare as plain scalars.
    predictions = np.asarray(pred_y).ravel()
    if len(predictions) != len(test_y):
        raise ValueError(
            "pred_y has %d entries but test_y has %d" % (len(predictions), len(test_y))
        )

    FP_text, FP_index = [], []
    FN_text, FN_index = [], []
    TP_text, TP_index = [], []

    # Walk labels and predictions positionally; the index label is only used to
    # fetch the matching text and to record which sample it was.
    for idx, actual, predicted in zip(test_y.index, test_y.to_numpy(), predictions):
        if predicted == 1 and actual == 0:
            FP_text.append(texts.loc[idx])
            FP_index.append(idx)
        elif predicted == 0 and actual == 1:
            FN_text.append(texts.loc[idx])
            FN_index.append(idx)
        elif predicted == 1 and actual == 1:
            TP_text.append(texts.loc[idx])
            TP_index.append(idx)

    return FP_text, FP_index, FN_text, FN_index, TP_text, TP_index

def getFP_FN_TP(test_X, test_y, pred_y):
    """Package the lists from getFP_FN_TP_lists into three DataFrames.

    Returns ``(df_FP, df_FN, df_TP)``; each frame has a ``<kind>_text`` column
    and a ``<kind>_index`` column, where ``<kind>`` is FP, FN or TP.
    """
    fp_txt, fp_idx, fn_txt, fn_idx, tp_txt, tp_idx = getFP_FN_TP_lists(test_X, test_y, pred_y)

    frame_fp = pd.DataFrame({'FP_text': fp_txt, 'FP_index': fp_idx})
    frame_fn = pd.DataFrame({'FN_text': fn_txt, 'FN_index': fn_idx})
    frame_tp = pd.DataFrame({'TP_text': tp_txt, 'TP_index': tp_idx})

    return frame_fp, frame_fn, frame_tp

# Build the three error-analysis frames and write each to its own CSV in the
# current working directory (row index included).
df_FP, df_FN, df_TP = getFP_FN_TP(test_X, test_y, pred_y)
for frame, csv_name in (
    (df_FP, 'FP_BiLSTMrandom.csv'),
    (df_FN, 'FN_BiLSTMrandom.csv'),
    (df_TP, 'TP_BiLSTMrandom.csv'),
):
    frame.to_csv(csv_name, index=True)