### Bi-directional LSTM with max pooling 

Author: Jeanne Elizabeth Daniel

November 2019

We make use of a bi-directional LSTM network, which extends the modelling capabilities of the vanilla LSTM. This approach is similar to that of InferSent (Conneau et al., 2017), where the authors combine bi-directional LSTM models with pooling layers to produce high-quality sentence embeddings. On top of this InferSent-style encoder, we attach a dense classification layer after the pooling layer.

In [1]:
import sys
import os
# sys.path.append(os.path.join(".."))  # path to source relative to current directory

In [3]:
import numpy as np
import gensim

In [4]:
import preprocess_data
import pandas as pd

In [22]:
import tensorflow as tf
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (rather than indexing element 0) keeps this cell
# from raising IndexError on a CPU-only machine and applies the same setting to
# every GPU on a multi-GPU host.
physical_devices = tf.config.experimental.list_physical_devices("GPU")
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, TimeDistributed, Input, Flatten, AdditiveAttention

In [55]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [5]:
# Read the semicolon-separated dataset file.
data = pd.read_csv('dataset_7B', delimiter = ';', engine = 'python')

# Boolean mask selecting the rows of the training split.
is_train = data['set'] == 'Train'

# Training questions only (kept as a one-column DataFrame).
data_text = data.loc[is_train][['helpdesk_question']]

# Number of distinct replies that occur in the training split.
number_of_classes = len(data.loc[is_train]['helpdesk_reply'].value_counts())

# Keep just the four columns that the remaining cells read.
data = data[['helpdesk_question', 'helpdesk_reply', 'set', 'low_resource']]

In [6]:
# Map every reply seen in the training split to an integer class ID.
# value_counts() orders replies by descending frequency, so ID 0 is the most
# frequent reply, ID 1 the next one, and so on.
#
# The mapping is built directly from the value_counts index instead of going
# through reset_index(): the column names reset_index() produces for a
# value_counts result changed in pandas 2.0, which makes a lookup of the
# 'index' column fail with a KeyError there.
reply_counts = data.loc[data['set'] == 'Train']['helpdesk_reply'].value_counts()
responses = {reply: class_id for class_id, reply in enumerate(reply_counts.index)}

In [7]:
# Number of distinct replies (classes) found in the training split.
len(responses)

89

In [8]:
# Expose each row's index label as an explicit 'index' column.
# NOTE(review): presumably required by preprocess_data.create_dictionary — confirm.
data_text['index'] = data_text.index
# `documents` is a second name for the same DataFrame object; no copy is made.
documents = data_text

In [9]:
# Build the vocabulary from the training questions only (data_text holds the Train split).
# NOTE(review): the positional arguments 1, 0.25 and 95000 look like frequency cut-offs
# and a vocabulary-size cap — confirm against preprocess_data.create_dictionary.
dictionary = preprocess_data.create_dictionary(data_text, 1, 0.25, 95000) #our entire vocabulary

In [10]:
# Every split keeps the same two columns and gets a fresh 0..n-1 index.
qa_columns = ['helpdesk_question', 'helpdesk_reply']

df_train = data.loc[data['set'] == 'Train', qa_columns].reset_index(drop=True)

df_valid = data.loc[data['set'] == 'Valid', qa_columns].reset_index(drop=True)

df_test = data.loc[data['set'] == 'Test', qa_columns].reset_index(drop=True)

# Low-resource subset of the test split (the flag is compared against the string 'True').
df_LR = data.loc[(data['set'] == 'Test') & (data['low_resource'] == 'True'),
                 qa_columns].reset_index(drop=True)

In [11]:
# (rows, columns) of the training split.
df_train.shape

(96412, 2)

In [12]:
# Second name for the vocabulary object built above; no copy is made.
unique_words = dictionary

In [13]:
# Same expression is used further down as the embedding layer's vocabulary size.
# NOTE(review): the +1 presumably reserves ID 0 for padding — confirm against create_lookup_tables.
len(unique_words) + 1

57545

In [14]:
# Sequence length. Not referenced again in the cells shown here; the model cell
# passes the literal input_length=30 instead.
max_length = 30
# Passed to preprocess_data.preprocess_question when the questions are tokenised.
min_token_length = 0

In [15]:
# Forward and reverse lookup tables between vocabulary words and integer IDs.
word_to_id, id_to_word = preprocess_data.create_lookup_tables(unique_words)

#### Transforming the input sentence into a sequence of word IDs

In [16]:
def encode_questions(questions):

    """ Converts the questions of one data split into a stacked array of word IDs

    Replaces four copy-pasted loops that differed only in the rows they read.

    Args:
        questions: pandas Series holding the raw helpdesk questions of one split

    Returns:
        a float array with one entry per question, stacked along the first axis
        (np.stack raises ValueError when the Series is empty)

    """

    # Pre-process every question using the notebook-level vocabulary and minimum token length.
    tokenised = questions.apply(preprocess_data.preprocess_question,
                                args = [unique_words, min_token_length])
    # Look up the word IDs of each question and stack them into one array.
    return np.stack([np.array(preprocess_data.transform_sequence_to_word_ids(question, word_to_id),
                              dtype = float)
                     for question in tokenised])


train_x_word_ids = encode_questions(df_train['helpdesk_question'])
print(train_x_word_ids.shape)

val_x_word_ids = encode_questions(data['helpdesk_question'].loc[data['set'] == 'Valid'])

test_x_word_ids = encode_questions(data['helpdesk_question'].loc[data['set'] == 'Test'])

# Low-resource subset of the test split.
LR_x_word_ids = encode_questions(data['helpdesk_question'].loc[(data['set'] == 'Test') &
                                                               (data['low_resource'] == 'True')])

(96412, 30, 1)


In [17]:
def get_dummies(reply, all_responses):

    """ Encodes a single reply as a one-hot label vector

    Args:
        reply: the reply to encode; must be a key of all_responses
        all_responses: dict mapping every template response to its integer ID

    Return:
        an integer vector of length len(all_responses) that is 1 at the ID of
        the reply and 0 everywhere else

    """

    one_hot = np.zeros(len(all_responses), dtype = int)
    hot_position = all_responses[reply]
    one_hot[hot_position] = 1
    return one_hot

In [18]:
# One one-hot row per example, built from the reply column of each split.
train_y = np.array([get_dummies(reply, responses) for reply in df_train['helpdesk_reply']])
valid_y = np.array([get_dummies(reply, responses) for reply in df_valid['helpdesk_reply']])
test_y  = np.array([get_dummies(reply, responses) for reply in df_test['helpdesk_reply']])
LR_y    = np.array([get_dummies(reply, responses) for reply in df_LR['helpdesk_reply']])

In [19]:
# Drop the trailing axis so each example becomes a flat row of word IDs.
# The recorded output of the encoding cell shows that axis has size 1 for the training set.
train_x_word_ids = np.squeeze(train_x_word_ids, axis = -1)
val_x_word_ids   = np.squeeze(val_x_word_ids, axis = -1)
test_x_word_ids  = np.squeeze(test_x_word_ids, axis = -1)
LR_x_word_ids    = np.squeeze(LR_x_word_ids, axis = -1)

#### Transform vectors where the input sentence yields a sequence of length 0

In [20]:
# Indices of training rows whose word IDs sum to zero.
train_zero_vectors = np.where(train_x_word_ids.sum(axis = 1) == 0.0)[0]
# Add 1 to the first position of each of those rows in a single indexed update.
train_x_word_ids[train_zero_vectors, 0] += 1

In [21]:
# Indices of validation rows whose word IDs sum to zero.
val_zero_vectors = np.where(val_x_word_ids.sum(axis = 1) == 0.0)[0]
# Add 1 to the first position of each of those rows in a single indexed update.
# NOTE(review): test_x_word_ids and LR_x_word_ids are not patched this way in the
# cells shown here — confirm that is intended.
val_x_word_ids[val_zero_vectors, 0] += 1

#### Bi-directional LSTM with max pooling

The network consists of an embedding layer, followed by a dropout layer. This is followed by a bi-directional LSTM layer that outputs a variable-length sequence of embedding vectors. To construct a single sentence embedding from the sequence we use max pooling. The sentence embedding is then fed to a classification layer. We train with a dropout rate of 0.5 and a batch size of 32. During training we use early stopping and Adadelta as our optimization algorithm. This network has an embedding of size 300 and 512 hidden units in the biLSTM network.

In [39]:
def bilstm_max_pooling_network(max_features, input_length=30, embed_dim=100, lstm_units=512,
                               num_classes=89):

    """ Constructs a bi-directional LSTM network with max pooling

    Args:
        max_features: size of vocabulary
        input_length: length of input sequence
        embed_dim: dimension of the embedding vector
        lstm_units: number of hidden units in each direction of the biLSTM
        num_classes: width of the softmax output layer; defaults to 89, the
            value that used to be hard-coded


    Returns:
        An uncompiled Keras Model mapping a sequence of word IDs to class probabilities

    """

    inputs = Input(shape=(input_length, ))
    # mask_zero=True makes ID 0 act as padding for the layers that consume the mask.
    x = Embedding(max_features, output_dim=embed_dim, input_length=input_length, mask_zero=True)(inputs)
    x = (Dropout(rate = 0.5))(x)
    # return_sequences=True keeps one output vector per time step for the pooling layer.
    x = Bidirectional(LSTM(lstm_units, activation = 'tanh', return_sequences=True,
                           dropout=0.25, recurrent_dropout=0.5))(x)
    # Max over the time axis collapses the sequence into one fixed-size sentence vector.
    x = GlobalMaxPooling1D()(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    return Model(inputs=inputs, outputs=outputs)

In [53]:
# Vocabulary size handed to the embedding layer: one more than the number of dictionary entries.
max_features = len(unique_words) + 1
# 300-dimensional embeddings and 512 LSTM units per direction, on sequences of 30 word IDs.
model = bilstm_max_pooling_network(max_features, embed_dim=300, input_length=30, lstm_units = 512)

In [54]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 300)           17263500  
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 300)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 30, 1024)          3330048   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 89)                91225     
Total params: 20,684,773
Trainable params: 20,684,773
Non-trainable params: 0
_______________________________________________

### Training

In [56]:
# Adadelta optimiser with categorical cross-entropy on the one-hot labels.
model.compile(optimizer=tf.keras.optimizers.Adadelta(learning_rate=0.5, rho=0.95),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Stop once validation accuracy has not improved for 10 epochs and roll back to the best weights.
es = EarlyStopping(monitor='val_accuracy',
                   patience=10,
                   restore_best_weights=True,
                   verbose=1)

In [58]:
# Train until early stopping fires; epochs=500 is only an upper bound.
# validation_data is passed as a tuple: Keras documents it as (x_val, y_val), and
# newer tf.keras releases no longer unpack a plain list into inputs and targets.
model.fit(train_x_word_ids, train_y,
          batch_size=32,
          epochs=500,
          callbacks=[es],
          validation_data=(val_x_word_ids, valid_y))

Train on 96412 samples, validate on 31955 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 00054: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fd3f8e424a8>

### Test score 

In [59]:
def classifier_score_top_1(word_ids, y_true, model):

    """ Prints the top-1 classification accuracy of a model.

    Args:
        word_ids: input matrix with one row per example, passed to model.predict
        y_true: true labels, one row per example; the argmax of a row is its class
        model: trained model exposing a predict method

    Returns:
        None

    """

    probs = model.predict(word_ids)
    n_examples = word_ids.shape[0]

    # The last entry of argsort is the index of the highest predicted score.
    hits = sum(1 for row in range(n_examples)
               if np.argsort(probs[row])[-1] == y_true[row].argmax())

    print("Overall Accuracy:", hits/n_examples)

In [60]:
# Top-1 accuracy on the full test split.
classifier_score_top_1(test_x_word_ids, test_y, model)

Overall Accuracy: 0.6057766884869543


### LR test score

In [61]:
# Top-1 accuracy on the low-resource subset of the test split.
classifier_score_top_1(LR_x_word_ids, LR_y, model)

Overall Accuracy: 0.521484375


### Top-5 accuracy

In [63]:
def classifier_score_top_5(word_ids, y_true, model, k=5):

    """ Prints the top-k classification accuracy for model (top-5 by default).

    An example counts as correct when its true class is among the k classes
    with the highest predicted scores.

    Args:
        word_ids: input matrix with one row per example, passed to model.predict
        y_true: true labels, one row per example; the argmax of a row is its class
        model: trained model exposing a predict method
        k: number of highest-scoring classes to accept; defaults to 5

    Returns:
        None

    Raises:
        ValueError: if k is smaller than 1

    """

    # A slice of [-0:] would select every class, so reject k < 1 explicitly.
    if k < 1:
        raise ValueError("k must be at least 1")

    score = 0
    probs = model.predict(word_ids)
    n_examples = word_ids.shape[0]
    for i in range(n_examples):
        # argsort is ascending, so the last k entries are the k best-scoring classes.
        if y_true[i].argmax() in np.argsort(probs[i])[-k:]:
            score += 1

    print("Overall Accuracy:", score/n_examples)

In [64]:
# Top-5 accuracy on the full test split.
classifier_score_top_5(test_x_word_ids, test_y, model)

Overall Accuracy: 0.8976824993019576


In [65]:
# Top-5 accuracy on the low-resource subset of the test split.
classifier_score_top_5(LR_x_word_ids, LR_y, model)

Overall Accuracy: 0.7982271634615384
