# Impact of RNN Architecture - Code Part I

In [34]:
import numpy as np
import statistics as st
import math
import tensorflow as tf
from keras.layers import GRU, SimpleRNN, Embedding, Dense, LSTM, Dropout
from keras.models import Sequential
from keras.metrics import Precision, Recall
from sklearn.utils import class_weight
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
import keras

### Data pre-processing

In [3]:
# Loading the dataset

full_dataset = pd.read_csv("C:/Users/natal/CSCI5922_NN/Lab3/spam_detection_rnn/data/SPAM text message 20170820 - Data.csv")

In [4]:
full_text = []
labels = []

for x, label in enumerate(full_dataset['Category']):
    full_text.append(full_dataset['Message'][x])
    if label == 'ham':
        labels.append(0)
    else:
        labels.append(1)

full_text = np.asarray(full_text)
labels = np.asarray(labels)

In [5]:
print("Size of separate messages: ", len(full_text))
print("Size of labels: ", len(labels))

Size of separate messages:  5572
Size of labels:  5572


### Dataset split - for basic full test split (all sequence lengths)

In [6]:
# features - number of words
max_features = 10000

# Splitting the data
train = int(5572 * .7)
#print(train)
test = int(5572 - train)
#print(test)

# Checking the shapes are correct
print(len(full_text) == (train + test))
print("Train size: ", train, "\nTest size: ", test)

# Tokenizning 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(full_text)
sequences = tokenizer.texts_to_sequences(full_text)

# after, stop putting more words
max_len = 500

idx_word = tokenizer.word_index
print("Tokens size: ", len(idx_word))

padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Checking the shape
print("Data size after tokenizing: ", padded_sequences.shape)

np.random.seed(42)
indices = np.arange(padded_sequences.shape[0])
np.random.shuffle(indices)
padded_sequences_idx = padded_sequences[indices]
labels = labels[indices]

x_train = padded_sequences_idx[:train]
y_train = labels[:train]
x_test = padded_sequences_idx[train:]
y_test = labels[train:]

# Checking the shapes after splitting
print("Train x shape after split: ", x_train.shape, "\nTrain y shape after split: ", y_train.shape)
print("Test x shape after split: ", x_test.shape, "\nTest y shape after split: ", y_test.shape)

True
Train size:  3900 
Test size:  1672
Tokens size:  9004
Data size after tokenizing:  (5572, 500)
Train x shape after split:  (3900, 500) 
Train y shape after split:  (3900,)
Test x shape after split:  (1672, 500) 
Test y shape after split:  (1672,)


### Analysis of the messages' length - necessary for further splitting the test set 

In [7]:
# Checking the length of messages to perform the division of the test set based on the number of words
# Creating the boundaries for: Short, medium and long inputs

texts_list = full_text.tolist()
print("\n------------------------------------------------------\n")
print(f"Sentence example from the messages:\n\n {texts_list[5]}")
print("\n------------------------------------------------------\n")

def tokenizing():
    splitted = []
    for x in texts_list:
        splitted.append(x.split(' '))
    return splitted

splitted_messages = tokenizing()
print(f"Checking the number of messages: {len(splitted_messages)}")
print("\n------------------------------------------------------\n")
print(f"Sentence example after tokenizing: \n\n {splitted_messages[5]}")
print("\n------------------------------------------------------\n")

def counting_length():
    lengths = []
    for x in splitted_messages:
        lengths.append(len(x))
    return lengths

print(f"Length of the first 20 messages: \n\n{counting_length()[:20]}")
print("\n------------------------------------------------------\n")

sorted_counts = sorted(counting_length(), reverse=True)
print(f"Sorted list; descending:\n\n{sorted_counts[:20]}")

div = math.floor(len(texts_list)/3)
long_lenghts = sorted_counts[:div]
print("\n------------------------------------------------------\n")
print(f"The mean of the first set in the desceding list (large): {st.mean(long_lenghts)}")

medium_lenghts = sorted_counts[div:int(2*div)]
print("\n------------------------------------------------------\n")
print(f"The mean of the first set in the desceding list (medium): {st.mean(medium_lenghts)}")
print(f"Lengths of 20 messages before the frist division: {sorted_counts[div-20:int(2*div)][:20]}")

short_lenghts = sorted_counts[2*div:]
print("\n------------------------------------------------------\n")
print(f"The mean of the first set in the desceding list (short): {st.mean(short_lenghts)}")
print(f"Lengths of 20 messages before the second division: {sorted_counts[(2*div-20):][:20]}")
print("\n------------------------------------------------------\n")

short=9
medium=18
long=170
print(f"Chosen boundaries for lengths:\n\nShort: {short}\nMedium: {medium}\nLong: {long}")


------------------------------------------------------

Sentence example from the messages:

 FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv

------------------------------------------------------

Checking the number of messages: 5572

------------------------------------------------------

Sentence example after tokenizing: 

 ['FreeMsg', 'Hey', 'there', 'darling', "it's", 'been', '3', "week's", 'now', 'and', 'no', 'word', 'back!', "I'd", 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still?', 'Tb', 'ok!', 'XxX', 'std', 'chgs', 'to', 'send,', '£1.50', 'to', 'rcv']

------------------------------------------------------

Length of the first 20 messages: 

[20, 6, 28, 11, 13, 32, 16, 26, 26, 29, 21, 26, 26, 37, 8, 19, 4, 19, 13, 24]

------------------------------------------------------

Sorted list; descending:

[171, 162, 125, 125, 121, 119, 99, 96, 96, 95, 89, 88, 80, 79, 79, 79

### Test data preparation for test-size experiments based on the sequence length

In [8]:
# Creating a dataframe with messages and word counts

#Checking the lengths of texts and labels
len(texts_list)
len(labels)

#Making sure the numpy arrays are converted to lists
type(texts_list)
type(labels)

labels_list = labels.tolist()

type(labels_list)

#Creating a dictionary and then making a dataframe 
dict_full_dataset = {'messages': texts_list, 'labels': labels_list}
df_full_dataset = pd.DataFrame(dict_full_dataset)
df_full_dataset.head()

#Creating an additional column with word counts
df_full_dataset['word_count'] = df_full_dataset['messages'].str.split().str.len()
df_full_dataset

#Setting df options to display example messages
pd.set_option('display.max_colwidth', None)

#Creating three new dataframes based on the conditions of the length of seqeunces
#Boundaries for nummber of words were created in the previous cells with the analysis of the dataset
df_short_messages = df_full_dataset[df_full_dataset['word_count']<short]
print("\n------------------------------------------------------\n")
print(f"Length of short messages dataset: {len(df_short_messages)}\n")
print(f"Example of a message: {df_short_messages['messages'].sample(1,ignore_index=True)}")

df_medium_messages = df_full_dataset[(df_full_dataset['word_count']<=medium) & (df_full_dataset['word_count']>=short)]
print("\n------------------------------------------------------\n")
print(f"Length of medium messages dataset: {len(df_medium_messages)}\n")
print(f"Example of a message: {df_medium_messages['messages'].sample(1,ignore_index=True)}")

df_long_messages = df_full_dataset[df_full_dataset['word_count']>medium]
print("\n------------------------------------------------------\n")
print(f"Length of long messages dataset: {len(df_long_messages)}\n")
print(f"Example of a long message: {df_long_messages['messages'].sample(1,ignore_index=True)}")

#Reset df options
pd.reset_option('display.max_colwidth')


------------------------------------------------------

Length of short messages dataset: 1861

Example of a message: 0    I didnt get anything da
Name: messages, dtype: object

------------------------------------------------------

Length of medium messages dataset: 1843

Example of a message: 0    I call you later, don't have network. If urgnt, sms me.
Name: messages, dtype: object

------------------------------------------------------

Length of long messages dataset: 1868

Example of a long message: 0    I don't want you to leave. But i'm barely doing what i can to stay sane. fighting with you constantly isn't helping.
Name: messages, dtype: object


### Splitting the dataset accroding to sequence size for experiments

In [9]:
#Extracting lists from df of short messages and checking length are compatible
short_list=df_short_messages['messages'].to_list()
short_list_labels=df_short_messages['labels'].to_list()
print(f"Checking the length of messages == length of labels: {len(short_list)==len(short_list_labels)}")

#Extracting lists from df of medium messages and checking length are compatible
m_list=df_medium_messages['messages'].to_list()
m_list_labels=df_medium_messages['labels'].to_list()
print(f"Checking the length of messages == length of labels: {len(m_list)==len(m_list_labels)}")

#Extracting lists from df of long messages and checking length are compatible
l_list=df_long_messages['messages'].to_list()
l_list_labels=df_long_messages['labels'].to_list()
print(f"Checking the length of messages == length of labels: {len(l_list)==len(l_list_labels)}")

Checking the length of messages == length of labels: True
Checking the length of messages == length of labels: True
Checking the length of messages == length of labels: True


In [10]:
#Creating numpy arrays
full_text_short = np.asarray(short_list)
labels_short = np.asarray(short_list_labels)

full_text_medium = np.asarray(m_list)
labels_medium = np.asarray(m_list_labels)

full_text_long = np.asarray(l_list)
labels_long = np.asarray(l_list_labels)
print(type(labels_long))

<class 'numpy.ndarray'>


In [11]:
#Since the full test data has ~1600 entries, I will not further split the created sets into train/test
#The current sets are ~1800 so it's comparable to the original full test set

#Steps as above to vectorize the sequences of words

def vectorize_data(x,y):
    # Tokenizning
    tokenizer = Tokenizer()
    fitted = tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)

    # after, stop putting more words
    max_len = 500

    idx_word = tokenizer.word_index
    print("Tokens size: ", len(idx_word))

    padded = pad_sequences(sequences, maxlen=max_len)

    # Checking the shape
    print("Data size after tokenizing: ", padded.shape)

    np.random.seed(42)
    indices = np.arange(padded.shape[0])
    np.random.shuffle(indices)
    padded_sequences_idx = padded[indices]
    labels = y[indices]

    x_data = padded_sequences_idx[:]
    y_data = labels[:]

    return x_data, y_data

In [12]:
x_test_short, y_test_short = vectorize_data(full_text_short, labels_short)
x_test_medium, y_test_medium = vectorize_data(full_text_medium, labels_medium)
x_test_long, y_test_long = vectorize_data(full_text_long, labels_long)

Tokens size:  2357
Data size after tokenizing:  (1861, 500)
Tokens size:  3846
Data size after tokenizing:  (1843, 500)
Tokens size:  6659
Data size after tokenizing:  (1868, 500)


## Models

### Test Model

In [25]:
# Vanilla test RNN model

model = Sequential()
model.add(Embedding(max_features, 64))
model.add(SimpleRNN(64,input_shape=x_train.shape,return_sequences=False, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer ='rmsprop',metrics=['acc'])

model_rnn = model.fit(x_train, y_train, epochs = 15, batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### Analysis of first model_rnn

After running the very first model, it was observed that the model is probably overfitting the data. The accuracy after the epoch 15 is 100% and the model probably learns the noise. The model is not learning objectively and is fit too much to the training data. Before I actually start runnig the 3 models (vanilla RNN, LSTM and GRU), I will treat the model overfitting:
* reducing the network's capacity --> decreasing the number of units in the model's layers
* applying regularization technique --> penalizing very large weights
* changing optimizer to Adam
* increasing the batch size --> the model will learn lesser noise; it will help take a more reasonable 'step' for minima
* adding dropout layers --> by dropping some layer of the network, I will let the model generalize better on unseen examples; it won't be trained for details and noises of the training data

I will call this model a test model, apply the mentioned changes and start building the 3 RNN models for the experiments.

## Experiments

### Fine-grained analysis with respect to input length

Each model (vanilla RNN, LSTM and GRU) is ran on the full test set (all sequence lengths) and on experimental test sets by dividing the test set into short, medium and long inputs roughly equl-sized based on the number of words observed in the test set examples.

In [16]:
# Checking numbers of test sets

print("Lengths of test sets:")
test_sets=[y_test,y_test_short,y_test_medium,y_test_long]
for x in test_sets:
    print(len(x))

Lengths of test sets:
1672
1861
1843
1868


## Note: 
After running the three models on short and medium sets, I observed a very small precision and recall. After that, I ran the model again with the option to show tp, fp, tn and fn. After the division of the sets for input length difference, the data was significantly imbalanced. There are too little positive examples. The model below is another test model saved with the precision and recall computed for imbalanced data. The actual three models will have class weights passed in the model. The weights will force the algorithm to treat every entry of class 1 (positive/spam) as x entries of class 0. Then, the loss will become a weighted average. There were several experiemnts conducted on the class weigths. The ratio of 1:50 (pos:neg) was too high and caused the scarcity of the negative examples. The final weight was calucated automatically using the class_weight module in Keras library. After several manual experiments to set up the ratio to 1:25, 1:50 and 1:75, the 'balanced' weight computation included in Keras resulted in much better results.

In [54]:
count_non0 = np.count_nonzero(y_train)
count_0 = len(y_train) - count_non0
print(f"TRAIN\nPositive: {count_non0}\nNegative: {count_0}")

count_non0 = np.count_nonzero(y_test)
count_0 = len(y_test) - count_non0
print(f"TEST\nPositive: {count_non0}\nNegative: {count_0}")

count_non0 = np.count_nonzero(y_test_short)
count_0 = len(y_test_short) - count_non0
print(f"SHORT\nPositive: {count_non0}\nNegative: {count_0}")

count_non0 = np.count_nonzero(y_test_medium)
count_0 = len(y_test_medium) - count_non0
print(f"MEDIUM\nPositive: {count_non0}\nNegative: {count_0}")

count_non0 = np.count_nonzero(y_test_long)
count_0 = len(y_test_long) - count_non0
print(f"LONG\nPositive: {count_non0}\nNegative: {count_0}")

TRAIN
Positive: 522
Negative: 3378
TEST
Positive: 225
Negative: 1447
SHORT
Positive: 251
Negative: 1610
MEDIUM
Positive: 267
Negative: 1576
LONG
Positive: 229
Negative: 1639


### Test vanilla RNN model

In [40]:
# Resource: https://stackoverflow.com/questions/61835742/same-value-for-keras-2-3-0-metrics-accuracy-precision-and-recall

from keras import backend as K

def check_units(y_true, y_pred):
    if y_pred.shape[1] != 1:
      y_pred = y_pred[:,1:2]
      y_true = y_true[:,1:2]
    return y_true, y_pred

def precision(y_true, y_pred):
    y_true, y_pred = check_units(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    y_true, y_pred = check_units(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [41]:
# Test vanilla RNN model 

model = Sequential()
model.add(Embedding(max_features, 15))
model.add(SimpleRNN(15,input_shape=x_train.shape, activation='tanh',recurrent_regularizer='l2'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics=['acc', keras.metrics.TruePositives(name='tp'), keras.metrics.FalsePositives(name='fp'), keras.metrics.TrueNegatives(name='tn'), keras.metrics.FalseNegatives(name='fn'), keras.metrics.BinaryAccuracy(name='accuracy'),precision,recall,keras.metrics.AUC(name='auc')])

model_vanilla_rnn = model.fit(x_train, y_train, epochs = 12, batch_size=256)
print("\n---------------------------------------------------\n")
print("Model evaluation on the full test set:")
model.evaluate(x_test,y_test, verbose=1, return_dict=True)
print("\n---------------------------------------------------\n")
print("Model evaluation on the SHORT test set:")
model.evaluate(x_test_short,y_test_short, verbose=1, return_dict=True)
print("\n---------------------------------------------------\n")
print("Model evaluation on the MEDIUM test set:")
model.evaluate(x_test_medium,y_test_medium, verbose=1, return_dict=True)
print("\n---------------------------------------------------\n")
print("Model evaluation on the LONG test set:")
model.evaluate(x_test_long,y_test_long, verbose=1, return_dict=True)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

---------------------------------------------------

Model evaluation on the full test set:

---------------------------------------------------

Model evaluation on the SHORT test set:

---------------------------------------------------

Model evaluation on the MEDIUM test set:

---------------------------------------------------

Model evaluation on the LONG test set:


{'loss': 0.5959974527359009,
 'acc': 0.8501070737838745,
 'tp': 9.0,
 'fp': 60.0,
 'tn': 1579.0,
 'fn': 220.0,
 'accuracy': 0.8501070737838745,
 'precision': 0.09604518860578537,
 'recall': 0.03403954952955246,
 'auc': 0.5583977699279785}

## Experimental Models

In [86]:
num_units=15
activation_rnn='tanh'
regularizer = 'l2'
activation_output = 'sigmoid'
loss = 'binary_crossentropy'
optimizer='adam'
metrics = [keras.metrics.TruePositives(name='tp'),
           keras.metrics.FalsePositives(name='fp'),
           keras.metrics.TrueNegatives(name='tn'),
           keras.metrics.FalseNegatives(name='fn'),
           keras.metrics.BinaryAccuracy(name='accuracy'),
           precision,
           recall,
           keras.metrics.AUC(name='auc')]
epochs = 12
batch_size= 256
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
class_weights = dict(enumerate(class_weights))



In [87]:
# Vanilla RNN model 1

model = Sequential()
model.add(Embedding(max_features, num_units))
model.add(SimpleRNN(num_units,input_shape=x_train.shape, activation=activation_rnn,recurrent_regularizer=regularizer))
model.add(Dense(1, activation=activation_output))
model.compile(loss = loss, optimizer=optimizer,metrics=metrics)

model_vanilla_rnn1 = model.fit(x_train, y_train, epochs = epochs, batch_size=batch_size,class_weight=class_weights)
print("\n---------------------------------------------------\n")
print("Model evaluation on the full test set:")
model.evaluate(x_test,y_test, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the SHORT test set:")
model.evaluate(x_test_short,y_test_short, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the MEDIUM test set:")
model.evaluate(x_test_medium,y_test_medium, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the LONG test set:")
model.evaluate(x_test_long,y_test_long, verbose=1)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

---------------------------------------------------

Model evaluation on the full test set:

---------------------------------------------------

Model evaluation on the SHORT test set:

---------------------------------------------------

Model evaluation on the MEDIUM test set:

---------------------------------------------------

Model evaluation on the LONG test set:


[0.594924807548523,
 18.0,
 107.0,
 1532.0,
 211.0,
 0.8297644257545471,
 0.12711864709854126,
 0.06905496120452881,
 0.5441983342170715]

### LSTM model

In [81]:
# LSTM model

model = Sequential()
model.add(Embedding(max_features, num_units))
model.add(LSTM(num_units,input_shape=x_train.shape, activation=activation_rnn,recurrent_regularizer=regularizer))
model.add(Dense(1, activation=activation_output))
model.compile(loss = loss, optimizer=optimizer,metrics=metrics)

model_lstm = model.fit(x_train, y_train, epochs = epochs, batch_size=batch_size,class_weight=class_weights)
print("\n---------------------------------------------------\n")
print("Model evaluation on the full test set:")
model.evaluate(x_test,y_test, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the SHORT test set:")
model.evaluate(x_test_short,y_test_short, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the MEDIUM test set:")
model.evaluate(x_test_medium,y_test_medium, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the LONG test set:")
model.evaluate(x_test_long,y_test_long, verbose=1)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

---------------------------------------------------

Model evaluation on the full test set:

---------------------------------------------------

Model evaluation on the SHORT test set:

---------------------------------------------------

Model evaluation on the MEDIUM test set:

---------------------------------------------------

Model evaluation on the LONG test set:


[0.5288983583450317,
 28.0,
 156.0,
 1483.0,
 201.0,
 0.8088865280151367,
 0.13446328043937683,
 0.11905495077371597,
 0.5520247220993042]

### GRU model

In [83]:
# GRU model

model = Sequential()
model.add(Embedding(max_features, num_units))
model.add(GRU(num_units,input_shape=x_train.shape, activation=activation_rnn,recurrent_regularizer=regularizer))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = loss, optimizer=optimizer,metrics=metrics)

model_gru = model.fit(x_train, y_train, epochs = epochs, batch_size=batch_size,class_weight=class_weights)
print("\n---------------------------------------------------\n")
print("Model evaluation on the full test set:")
model.evaluate(x_test,y_test, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the SHORT test set:")
model.evaluate(x_test_short,y_test_short, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the MEDIUM test set:")
model.evaluate(x_test_medium,y_test_medium, verbose=1)
print("\n---------------------------------------------------\n")
print("Model evaluation on the LONG test set:")
model.evaluate(x_test_long,y_test_long, verbose=1)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

---------------------------------------------------

Model evaluation on the full test set:

---------------------------------------------------

Model evaluation on the SHORT test set:

---------------------------------------------------

Model evaluation on the MEDIUM test set:

---------------------------------------------------

Model evaluation on the LONG test set:


[0.6000329256057739,
 34.0,
 169.0,
 1470.0,
 195.0,
 0.8051391839981079,
 0.1429782211780548,
 0.1388491541147232,
 0.5571322441101074]

# Impact of Pretrained Word Embedding - Code Part II