In [632]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

### Loading embeddings

In [633]:
import io

VOCAB_SIZE = 400001
DIMENSION = 300

emb_dict = {}
word_to_index = {}
embedding_array = np.zeros((VOCAB_SIZE, DIMENSION), dtype='float32')

## Loading glove embeddings 300d
file_path = 'Embeddings/glove.6B/' 
file_name = 'glove.6B.300d.txt'
file = io.open(file_path + file_name, 'r', encoding='utf-8', newline='\n', errors='ignore')


index = 0

## Loading array to pass in embedding layer and word to index layer to keep track of the words
for line in file:
    word, vector = line.split(' ', 1)
    vector = list(map(float, vector.split(' ')))
    
    word_to_index[word.lower()] = index
    embedding_array[index, :] = vector
    
    index+=1


## Importing, Splitting and creating TF Datasets

In [634]:
dataPath = 'Data/data.csv'

all_data = pd.read_csv(dataPath)
all_data['Category'] = pd.get_dummies(all_data['Category'])


In [635]:
## Length of the longest sentence(in words), this is needed to define the input size of the data.
def find_max_length(all_sentences):
    max_len = 0
    mi = 0
    index = 0
    for sentence in all_sentences:
        max_len = max(len(sentence.split(' ')), max_len)
        if(max_len == len(sentence.split(' '))):
            mi = index
        index+=1
    return max_len

## Converts sentences to words indices using word_to_index dictionary, takes care of padding 
def sentences_to_indices(m, max_len, sentences, word_to_index):
    input_array = np.zeros((m, max_len))
    sentence_index = 0
    
    for sentence in sentences:
        words = sentence.lower().split(' ')
        words = np.array(list(map(lambda i : word_to_index[i] if i in word_to_index else word_to_index['unk'], words)))
        words = np.pad(words, (0, max_len-words.shape[0]), constant_values = (0.))
        
        input_array[sentence_index, :] = words
        sentence_index+=1
    return input_array
            
## Returns embedding matrix defined earlier
def get_embedding_matrix():
    return embedding_array


In [636]:
sentences = all_data['Message']
MAX_LEN = find_max_length(sentences)

m = all_data.shape[0]

input_array = sentences_to_indices(m, MAX_LEN, sentences, word_to_index)
labels = all_data['Category'].to_numpy().reshape(m, 1)


In [637]:
X_train, X_test, Y_train, Y_test = train_test_split(input_array, labels, test_size=0.1)

In [638]:
## Secondary function : Returns max_len 
def get_max_len():
    return MAX_LEN

## Model

In [639]:
class AccuracyHistory(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.acc = []
        self.loss_ = []

    def on_epoch_end(self, batch, logs={}):
        self.acc.append(logs.get('accuracy'))
        self.loss_.append(logs.get('loss'))

def NN_Model():
    
    def get_embedding_layer():
        emb = keras.layers.Embedding(VOCAB_SIZE, 300, trainable=False, name='embedLayer')
        emb.build((None,))
        emb.set_weights([get_embedding_matrix()])
        return emb
    
    input_shape = get_max_len()

    input_layer = keras.layers.Input(shape = (input_shape), name='inputLayer')
    embed_layer = get_embedding_layer()
    
    embedded_layer = embed_layer(input_layer)
    
    lstm1 = keras.layers.LSTM(128, return_sequences=True, name='lstm1')(embedded_layer)
    dropout1 = keras.layers.Dropout(rate=0.2, name='dropout1')(lstm1)
        
    lstm2 = keras.layers.LSTM(128, name='lstm2')(lstm1)
    dropout2 = keras.layers.Dropout(rate=0.2, name='dropout2')(lstm2)
        
    dense1 = keras.layers.Dense(5, name='dense1', activation='relu')(dropout2)
    dense2 = keras.layers.Dense(1, name='dense2', activation='relu')(dense1)
        
    sigmoid = keras.layers.Activation(activation='sigmoid', name='sigmoidLayer')(dense2)
        
    model = keras.Model(inputs = [input_layer], outputs=[sigmoid], name='model')
        
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(
            learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07), metrics=['accuracy'])
    
    return model
    

In [640]:
model = NN_Model()
history = AccuracyHistory()  

In [641]:
model.fit(X_train, Y_train, callbacks=[history], epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1a74997f10>

In [642]:
history.loss_

[0.4113319516181946, 0.3950270414352417, 0.39417096972465515]

## Testing

In [620]:
test_result = model.evaluate(X_test, Y_test, batch_size=128)

