In [61]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

## Loading GloVe embeddings

In [178]:
import io

# Number of rows in the embedding table: one row per GloVe token plus one
# reserved row (index 0) that stays all-zero and is used for padding.
VOCAB_SIZE = 400001
DIMENSION = 50

emb_dict = {}
word_to_index = {}
embedding_array = np.zeros((VOCAB_SIZE, DIMENSION), dtype='float32')

file_path = 'Embeddings/glove.6B/'
file_name = 'glove.6B.50d.txt'

# Start at 1: sentences are padded with 0, so index 0 must not belong to a
# real word, otherwise every padded position would be embedded as that word.
index = 1

# The context manager guarantees the file handle is closed, even if a line
# fails to parse.
with io.open(file_path + file_name, 'r', encoding='utf-8', newline='\n', errors='ignore') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue

        # Each line is "<token> <v1> <v2> ... <vN>".
        word, vector = line.split(' ', 1)
        vector = [float(value) for value in vector.split(' ')]

        word_to_index[word.lower()] = index
        embedding_array[index, :] = vector

        index += 1


## Importing, Splitting and creating TF Datasets

In [342]:
dataPath = 'Data/data.csv'

all_data = pd.read_csv(dataPath)

# get_dummies returns a frame with one indicator column per distinct category.
# Assigning that whole frame to a single column fails on current pandas, so
# pick the first indicator column explicitly and store it as numeric 0/1 labels.
# NOTE(review): the first column flags the first category in sorted order -
# confirm that this is the class you want encoded as 1.
all_data['Category'] = pd.get_dummies(all_data['Category']).iloc[:, 0].astype('uint8')


In [343]:
def find_max_length(all_sentences):
    """Return the largest number of space-separated tokens in any sentence.

    Args:
        all_sentences: iterable of strings.

    Returns:
        int: the maximum token count, or 0 when ``all_sentences`` is empty.
    """
    # Tokenisation is a plain split on single spaces, so consecutive spaces
    # count as empty tokens.
    return max((len(sentence.split(' ')) for sentence in all_sentences), default=0)

def sentences_to_indices(m, max_len, sentences, word_to_index):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Each sentence is lower-cased and split on single spaces. Every token is
    replaced by ``word_to_index[token]``; tokens missing from the vocabulary
    fall back to ``word_to_index['unk']``.

    Args:
        m: number of rows to allocate (must be >= the number of sentences).
        max_len: number of columns; shorter sentences are right-padded with 0
            and longer ones are truncated to their first ``max_len`` tokens.
        sentences: iterable of strings.
        word_to_index: mapping from token to integer index.

    Returns:
        numpy.ndarray of shape ``(m, max_len)`` with the default float dtype
        of ``np.zeros``.

    Raises:
        KeyError: if a token is unknown and ``'unk'`` is not in
            ``word_to_index``.
    """
    input_array = np.zeros((m, max_len))

    for row, sentence in enumerate(sentences):
        # Truncate instead of letting an over-long sentence crash the padding.
        tokens = sentence.lower().split(' ')[:max_len]
        for col, token in enumerate(tokens):
            if token in word_to_index:
                input_array[row, col] = word_to_index[token]
            else:
                input_array[row, col] = word_to_index['unk']
        # Remaining columns keep the 0 written by np.zeros (padding).

    return input_array
            

def get_embedding_matrix():
    """Return the notebook-level ``embedding_array`` (looked up at call time)."""
    matrix = embedding_array
    return matrix

def get_max_len():
    """Return the notebook-level ``max_len`` (looked up at call time)."""
    current_max_len = max_len
    return current_max_len

In [344]:
# Turn every message into a fixed-length row of vocabulary indices and
# collect the matching labels as a column vector.
sentences = all_data['Message']
m = len(all_data)
max_len = find_max_length(sentences)

input_array = sentences_to_indices(m, max_len, sentences, word_to_index)
labels = all_data['Category'].to_numpy().reshape(-1, 1)





In [345]:
# Fixed seed so the 90/10 split (and every result derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

X_train, X_test, Y_train, Y_test = train_test_split(
    input_array, labels, test_size=0.1, random_state=SPLIT_SEED
)

In [346]:
# Sanity check: rows = held-out samples, columns = padded sequence length.
X_test.shape

(558, 171)

## Model

In [593]:
class NN_Model():
    """Two-layer LSTM binary classifier on top of frozen pre-trained embeddings.

    Architecture (Keras functional API):
        token indices -> frozen Embedding -> LSTM(128, return_sequences)
        -> LSTM(128) -> Dropout(0.5) -> Dense(5, relu) -> Dense(1, linear)
        -> sigmoid

    The model is built and compiled in ``__init__`` with binary cross-entropy
    and Adam; retrieve it with ``get_model()``.
    """

    def __init__(self):
        # Padded sequence length, read from the notebook-level helper.
        self.max_len = get_max_len()

        # `shape` has to be a tuple; `(max_len)` without a trailing comma is
        # just an int. Use the instance attribute rather than the global.
        self.input_layer = keras.layers.Input(shape=(self.max_len,), name='inputLayer')

        self.embed_layer = self.get_embedding_layer()
        self.embedded_layer = self.embed_layer(self.input_layer)

        # The first LSTM returns the full sequence so the second one can consume it.
        self.lstm1 = keras.layers.LSTM(128, return_sequences=True, name='lstm1')(self.embedded_layer)

        self.lstm2 = keras.layers.LSTM(128, name='lstm2')(self.lstm1)
        self.dropout2 = keras.layers.Dropout(rate=0.5, name='dropout2')(self.lstm2)

        self.dense1 = keras.layers.Dense(5, name='dense1', activation='relu')(self.dropout2)
        # Linear logit: a ReLU here would clamp the logit to >= 0, which forces
        # the sigmoid output to be >= 0.5 for every input.
        self.dense2 = keras.layers.Dense(1, name='dense2')(self.dense1)

        self.sigmoid = keras.layers.Activation(activation='sigmoid', name='sigmoidLayer')(self.dense2)

        self.model = keras.Model(inputs=[self.input_layer], outputs=[self.sigmoid], name='whole_model')

        self.model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(
                learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07),
            metrics=['accuracy'],
        )

    def get_embedding_layer(self):
        """Build a non-trainable Embedding layer loaded with the pre-trained matrix.

        The layer dimensions are taken from the matrix itself so they cannot
        drift from the loaded embeddings.
        """
        matrix = get_embedding_matrix()
        vocab_size, dimension = matrix.shape

        emb = keras.layers.Embedding(vocab_size, dimension, trainable=False, name='embedLayer')
        # The layer must be built before its weights can be assigned.
        emb.build((None,))
        emb.set_weights([matrix])
        return emb

    def get_model(self):
        """Return the compiled ``keras.Model``."""
        return self.model

    def run(self, inputs):
        """Call the model directly on ``inputs`` and return its output."""
        return self.model(inputs)
    
    
class AccuracyHistory(tf.keras.callbacks.Callback):
    """Keras callback that records training accuracy and loss after every epoch.

    Attributes:
        acc: list of the ``'accuracy'`` value from each epoch's logs.
        loss_: list of the ``'loss'`` value from each epoch's logs.
    """

    def __init__(self):
        super().__init__()
        # Created up front so the attributes exist even before training starts.
        self.acc = []
        self.loss_ = []

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of each training run."""
        self.acc = []
        self.loss_ = []

    def on_epoch_end(self, batch, logs=None):
        """Append this epoch's accuracy and loss.

        ``batch`` receives the epoch index (the name is kept for backward
        compatibility); it is not used. Missing metrics are stored as None.
        """
        # Avoid a shared mutable default argument.
        logs = logs or {}
        self.acc.append(logs.get('accuracy'))
        self.loss_.append(logs.get('loss'))

     
        

In [594]:
# Build the compiled Keras model and the callback that tracks its metrics.
model = NN_Model().get_model()
history = AccuracyHistory()

In [None]:
# Train on the training split; `history` collects per-epoch accuracy and loss.
model.fit(
    x=X_train,
    y=Y_train,
    epochs=100,
    callbacks=[history],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100

In [None]:
# Per-epoch training loss collected by the AccuracyHistory callback.
history.loss_