In [1]:
import pandas as pd
import keras
import keras_metrics
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Load the pre-split feature and label tables from the working directory.
# The files are read in the order listed: x_train, y_train, x_test, y_test.
x_train, y_train, x_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('x_train.csv', 'y_train.csv', 'x_test.csv', 'y_test.csv')
)

In [2]:
# Fit a tokenizer on the training messages only, then use that same fitted
# tokenizer to turn the messages of both splits into lists of numbers.
train_texts = x_train['cleanned_text']
test_texts = x_test['cleanned_text']

# Instantiate the tokenizer and fit it on the training text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Transform the text messages in the training and test sets into number sequences
x_train_seq = tokenizer.texts_to_sequences(train_texts)
x_test_seq = tokenizer.texts_to_sequences(test_texts)

In [3]:
# Bring every sequence to the same length (50): sequences shorter than 50 are
# padded with 0s and longer ones are truncated down to 50.
x_train_seq_padded, x_test_seq_padded = (
    pad_sequences(sequences, maxlen=50)
    for sequences in (x_train_seq, x_test_seq)
)

In [4]:
# Build a small recurrent classifier: embedding -> LSTM -> dense -> sigmoid output.
# The embedding table has one more row than the number of words the tokenizer
# indexed (presumably so that id 0, used for padding, has its own row).
vocab_size = len(tokenizer.index_word) + 1

model = Sequential([
    Embedding(vocab_size, 32),                   # 32-dimensional vector per token id
    LSTM(32, dropout=0, recurrent_dropout=0),    # 32 units, dropout explicitly disabled
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),              # single sigmoid unit for the binary label
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          263904    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 273,313
Trainable params: 273,313
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Compile the model for binary classification: Adam optimizer, binary
# cross-entropy loss, and accuracy / precision / recall reported by fit().
# Precision and recall use Keras' built-in metric classes; the separate
# keras_metrics package is deprecated in favour of these (Keras >= 2.3).
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])

In [6]:
import warnings

# Silence every warning. The filter is process-wide, so it also stays in
# effect for any cell executed after this one.
warnings.filterwarnings(action="ignore")

# Train for 5 epochs in mini-batches of 32. The test split is supplied as the
# validation data, so the per-epoch validation metrics are measured on it.
history = model.fit(
    x_train_seq_padded,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(x_test_seq_padded, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
# TFIDF: Accuracy: 0.9695 / Precision: 1.0 / Recall: 0.7888
# The RNN model outperformed the Word_2_Vec and Doc_2_Vec models on all three metrics. It also beat the TFIDF model on
# accuracy and recall, while its precision is almost as good. The improvement in recall, though, is significant.