### LSTM with 128 hidden units

Author: Jeanne Elizabeth Daniel

November 2019

We employ the humble long short-term memory network with 128 hidden units to model the input sequence of words. The LSTM was introduced by Hochreiter and Schmidhuber (1997) to address the shortcomings of the original recurrent neural network (RNN). The LSTM's architecture is similar to that of the RNN, but with more parameters, such as gating units and an internal state unit that explicitly address the long-term dependency problem of the RNN. 

In [1]:
import sys
import os
#sys.path.append(os.path.join("..")) # path to source relative to current directory

In [3]:
import numpy as np
import gensim

In [4]:
import preprocess_data
import pandas as pd

In [22]:
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices("GPU")
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# Iterating (rather than indexing [0]) keeps this cell runnable on CPU-only machines,
# where the device list is empty, and applies the same setting to every visible GPU.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, TimeDistributed, Input, Flatten, AdditiveAttention

In [49]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [5]:
# Load the complete helpdesk dataset (semicolon-delimited).
data = pd.read_csv('dataset_7B', delimiter = ';', engine = 'python')
# Training questions only. The explicit .copy() makes data_text an independent frame,
# so adding columns to it later does not trigger pandas' SettingWithCopyWarning.
data_text = data.loc[data['set'] == 'Train', ['helpdesk_question']].copy()
# Number of distinct replies observed in the training split.
number_of_classes = data.loc[data['set'] == 'Train', 'helpdesk_reply'].nunique()
# Keep only the columns used in the rest of the notebook.
data = data[['helpdesk_question', 'helpdesk_reply', 'set', 'low_resource']] 

In [6]:
# Map every training reply to an integer class ID. IDs follow the order of
# value_counts(), i.e. the most frequent reply gets ID 0.
# Building the dict straight from the value_counts index avoids depending on the
# column names produced by reset_index(), which changed in pandas 2.0.
train_reply_counts = data.loc[data['set'] == 'Train', 'helpdesk_reply'].value_counts()
responses = {reply: class_id for class_id, reply in enumerate(train_reply_counts.index)}

In [7]:
# Number of distinct reply classes found in the training split.
len(responses)

89

In [8]:
# Expose the row labels of the training questions as an explicit 'index' column.
data_text['index'] = data_text.index
# Alias for the same DataFrame object (no copy is made).
documents = data_text

In [9]:
# Build the vocabulary from the training questions only.
# NOTE(review): the positional arguments 1, 0.25, 95000 look like gensim-style
# filter_extremes settings (no_below, no_above, keep_n) — confirm against
# preprocess_data.create_dictionary.
dictionary = preprocess_data.create_dictionary(data_text, 1, 0.25, 95000) #our entire vocabulary

In [10]:
# Partition the dataset by the 'set' column. Every partition keeps only the
# question and reply columns and is re-indexed from 0.
df_train = data.loc[data['set'] == 'Train',
                    ['helpdesk_question', 'helpdesk_reply']].reset_index(drop = True)

df_valid = data.loc[data['set'] == 'Valid',
                    ['helpdesk_question', 'helpdesk_reply']].reset_index(drop = True)

df_test = data.loc[data['set'] == 'Test',
                   ['helpdesk_question', 'helpdesk_reply']].reset_index(drop = True)

# Low-resource subset of the test partition (rows whose 'low_resource' value is the string 'True').
df_LR = data.loc[(data['set'] == 'Test') & (data['low_resource'] == 'True'),
                 ['helpdesk_question', 'helpdesk_reply']].reset_index(drop = True)

In [11]:
# Sanity check: (rows, columns) of the training partition.
df_train.shape

(96412, 2)

In [12]:
# Alias: the following cells refer to the vocabulary as unique_words.
unique_words = dictionary

In [13]:
# Vocabulary size plus one; the same expression is later passed to the model as max_features.
# NOTE(review): the extra slot presumably reserves ID 0 for padding — confirm in preprocess_data.
len(unique_words) + 1

57545

In [14]:
# NOTE(review): max_length is not referenced by any later cell in this section;
# presumably it mirrors the 30-token sequence length used by the model — confirm.
max_length = 30
# Passed to preprocess_data.preprocess_question for every question below.
min_token_length = 0

In [15]:
# Lookup tables derived from the vocabulary; word_to_id is used below to turn
# questions into word-ID sequences.
# NOTE(review): id_to_word is presumably the inverse mapping — confirm in preprocess_data.
word_to_id, id_to_word = preprocess_data.create_lookup_tables(unique_words)

#### Transforming the input sentence into a sequence of word IDs

In [16]:
def encode_questions(questions):

    """ Converts a Series of helpdesk questions into a stacked array of word IDs.

    Each question is preprocessed with preprocess_data.preprocess_question and then
    mapped to word IDs with preprocess_data.transform_sequence_to_word_ids. Reads the
    notebook-level names unique_words, min_token_length and word_to_id.

    Args:
        questions: pandas Series holding one question per row

    Returns:
        a float numpy array with one entry per question (the per-question ID
        arrays stacked along a new first axis)

    """

    processed_questions = questions.apply(preprocess_data.preprocess_question,
                                          args = [unique_words, min_token_length])
    return np.stack([np.array(preprocess_data.transform_sequence_to_word_ids(question, word_to_id),
                              dtype = float)
                     for question in processed_questions])


# One helper call per partition replaces four copies of the same loop.
train_x_word_ids = encode_questions(df_train['helpdesk_question'])
print(train_x_word_ids.shape)

val_x_word_ids = encode_questions(data['helpdesk_question'].loc[data['set'] == 'Valid'])

test_x_word_ids = encode_questions(data['helpdesk_question'].loc[data['set'] == 'Test'])

# Low-resource subset of the test partition.
LR_x_word_ids = encode_questions(data['helpdesk_question'].loc[(data['set'] == 'Test') &
                                                               (data['low_resource'] == 'True')])

(96412, 30, 1)


In [17]:
def get_dummies(reply, all_responses):

    """ Encodes a single reply as a one-hot label vector.

    Args:
        reply: the reply to encode; must be a key of all_responses
        all_responses: dict mapping every template response to its integer ID

    Return:
        an integer numpy array of length len(all_responses) that is 1 at the
        position given by the reply's ID and 0 everywhere else

    """

    one_hot = np.zeros(len(all_responses), dtype = int)
    reply_id = all_responses[reply]
    one_hot[reply_id] = 1
    return one_hot

In [18]:
# One-hot label matrix per partition: one row per example, one column per reply class.
train_y = np.array([get_dummies(reply, responses) for reply in df_train['helpdesk_reply']])
valid_y = np.array([get_dummies(reply, responses) for reply in df_valid['helpdesk_reply']])
test_y  = np.array([get_dummies(reply, responses) for reply in df_test['helpdesk_reply']])
LR_y    = np.array([get_dummies(reply, responses) for reply in df_LR['helpdesk_reply']])

In [19]:
# Drop the trailing singleton axis so every row is a flat sequence of word IDs,
# e.g. (96412, 30, 1) -> (96412, 30) for the training set.
# reshape(len(x), -1) keeps the first axis and flattens the rest, so re-running this
# cell is a no-op; reshape(x.shape[:-1]) would raise on a second run because it
# strips one more axis each time.
train_x_word_ids = train_x_word_ids.reshape(len(train_x_word_ids), -1)
val_x_word_ids   = val_x_word_ids.reshape(len(val_x_word_ids), -1)
test_x_word_ids  = test_x_word_ids.reshape(len(test_x_word_ids), -1)
LR_x_word_ids    = LR_x_word_ids.reshape(len(LR_x_word_ids), -1)

#### Vanilla LSTMs using the final hidden state as sentence embedding. 

The network consists of an embedding layer, followed by a dropout layer, followed by an LSTM network. The final hidden LSTM state is fed to a dense classification layer.
We train with a dropout rate of 0.25 and batch size of 32. During training we use early stopping and Adadelta as our optimization algorithm. This network has an embedding of size 300 and 128 hidden units in the LSTM network. 

In [40]:
def vanilla_lstm(max_features, output_dim=100, input_length = 30, lstm_units = 128, num_classes = 89):
    
    """ Constructs an LSTM classifier with an embedding and dropout layer preceding the LSTM network. 
    
    Args:
        max_features: size of vocabulary
        output_dim: dimension of embedding vector
        input_length: length of input sequence
        lstm_units: number of hidden units in LSTM
        num_classes: number of output classes, i.e. the width of the softmax layer
                     (defaults to 89, the value previously hard-coded)
    
    Returns:
        An uncompiled Sequential model: Embedding -> Dropout -> LSTM -> Dense(softmax)
    
    """
    
    model = Sequential()
    # mask_zero=True makes the embedding emit a mask for inputs equal to 0.
    model.add(Embedding(max_features, output_dim=output_dim, input_length = input_length, mask_zero=True ))
    model.add(Dropout(rate = 0.25))
    # return_sequences=False: only the final hidden state is handed to the classifier.
    # No input_shape is given here: the LSTM is not the first layer, so its input
    # shape is inferred from the embedding output.
    model.add(LSTM(lstm_units, activation = 'tanh', return_sequences = False,
               dropout = 0.25, recurrent_dropout = 0.5))
    model.add(Dense(num_classes, activation='softmax'))
    return model

In [47]:
# Embedding vocabulary size: one more than the number of dictionary entries.
# NOTE(review): the extra slot presumably reserves ID 0 for padding — confirm in preprocess_data.
max_features = len(unique_words) + 1

# 300-dimensional embeddings, 30-token inputs, 128 LSTM hidden units.
model = vanilla_lstm(max_features, output_dim=300, input_length=30, lstm_units = 128)

In [48]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 300)           17263500  
_________________________________________________________________
dropout_3 (Dropout)          (None, 30, 300)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_3 (Dense)              (None, 89)                11481     
Total params: 17,494,629
Trainable params: 17,494,629
Non-trainable params: 0
_________________________________________________________________


### Training

In [50]:
# Stop training once validation accuracy has not improved for 20 epochs and
# restore the weights from the best epoch.
es = EarlyStopping(monitor='val_accuracy', verbose=1, restore_best_weights=True, patience=20)
# One-hot targets, so categorical cross-entropy; optimised with Adadelta.
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adadelta(learning_rate=1.0, rho=0.95),
              metrics=['accuracy'])

In [52]:
# Train for up to 500 epochs; early stopping (es) ends the run sooner.
# validation_data is given as an (inputs, targets) tuple, the form documented for
# Model.fit; a list can be interpreted as inputs only in some TensorFlow releases.
model.fit(train_x_word_ids, train_y,
          batch_size=32,
          epochs=500,
          callbacks=[es],
          validation_data=(val_x_word_ids, valid_y))

Train on 96412 samples, validate on 31955 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 00039: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f5cdb162b38>

### Test score 

In [53]:
# Predicted class = index of the largest softmax probability.
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over predict()
# is the documented replacement for a softmax output layer.
preds = np.argmax(model.predict(test_x_word_ids, verbose = 0), axis = -1)
# Fraction of correct (True) and incorrect (False) top-1 predictions on the test partition.
pd.Series(np.argmax(test_y, axis = 1) == preds).value_counts()/test_y.shape[0]

True     0.604536
False    0.395464
dtype: float64

### LR test score

In [54]:
# Predicted class = index of the largest softmax probability.
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over predict()
# is the documented replacement for a softmax output layer.
preds = np.argmax(model.predict(LR_x_word_ids, verbose = 0), axis = -1)
# Fraction of correct (True) and incorrect (False) top-1 predictions on the low-resource subset.
pd.Series(np.argmax(LR_y, axis = 1) == preds).value_counts()/LR_y.shape[0]

True     0.522987
False    0.477013
dtype: float64

In [55]:
def classifier_score_top_5(word_ids, y_true, model):

    """ Prints the top-5 classification accuracy of a model.

    An example counts as correct when its true class is among the five classes
    to which the model assigns the highest scores.

    Args:
        word_ids: array of model inputs, one example per row
        y_true: array of label vectors, one per example; the position of the
                largest entry is taken as the true class
        model: object whose predict(word_ids) returns one row of class scores
               per example

    Returns:
        None (the accuracy is printed, not returned)

    """

    class_scores = model.predict(word_ids)
    n_examples = word_ids.shape[0]
    # argsort is ascending, so the last five positions hold the best-scoring classes
    hits = sum(1 for row in range(n_examples)
               if y_true[row].argmax() in np.argsort(class_scores[row])[-5:])

    print("Overall Accuracy:", hits/n_examples)

In [56]:
# Top-5 accuracy on the full test partition.
classifier_score_top_5(test_x_word_ids, test_y, model)

Overall Accuracy: 0.8960692458039897


In [57]:
# Top-5 accuracy on the low-resource subset of the test partition.
classifier_score_top_5(LR_x_word_ids, LR_y, model)

Overall Accuracy: 0.8093449519230769
