## Natural Language Inference Task: LSTM & Sum Embeddings

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import re
np.random.seed(1234)
random.seed(1234)

### Tensorflow LSTM 

If there are no processed data in the directory: 

In [2]:
df_train = pd.read_csv('snli/snli_1.0_train.txt', delimiter = "\t")
df_dev = pd.read_csv('snli/snli_1.0_dev.txt', delimiter = "\t")
df_test = pd.read_csv('snli/snli_1.0_test.txt', delimiter = "\t")


In [3]:
df_train = df_train[["gold_label", "sentence1", "sentence2"]].dropna(axis=0).rename(columns={"sentence1": "premise", "sentence2": "hypothesis"})
df_dev = df_dev[["gold_label", "sentence1", "sentence2"]].dropna(axis=0).rename(columns={"sentence1": "premise", "sentence2": "hypothesis"})
df_test = df_test[["gold_label", "sentence1", "sentence2"]].dropna(axis=0).rename(columns={"sentence1": "premise", "sentence2": "hypothesis"})

df_train = df_train[df_train["gold_label"] != "-"]
df_dev = df_dev[df_dev["gold_label"] != "-"]
df_test = df_test[df_test["gold_label"] != "-"]


In [4]:
df_train.shape

(549361, 3)

In [5]:
df_dev.shape

(9842, 3)

In [6]:
df_test.shape

(9824, 3)

In [7]:
# reduce the training size
training_data_length = 45948
df_train = df_train.iloc[:training_data_length]

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
df_train.head(2)

Unnamed: 0,gold_label,premise,hypothesis
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."


In [11]:
tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True, split=' ', char_level=False, oov_token=None,
                      document_count=0)

tokenizer.fit_on_texts(df_train["premise"].values + df_train["hypothesis"].values)


In [12]:
df_all = (df_train["premise"]+df_train["hypothesis"]+df_dev["premise"]+df_dev["hypothesis"]+df_test["premise"]+df_test["hypothesis"])

max_length = int(df_all.str.split().str.len().max())


In [13]:
# apply transformation to sequences and padding

train_prem_encoded = pad_sequences(tokenizer.texts_to_sequences(df_train["premise"]),
                                   maxlen=max_length,
                                   padding="post")
train_hyp_encoded = pad_sequences(tokenizer.texts_to_sequences(df_train["hypothesis"]),
                                  maxlen=max_length, 
                                  padding="post")

dev_prem_encoded = pad_sequences(tokenizer.texts_to_sequences(df_dev["premise"]),
                                 maxlen=max_length,
                                 padding="post")
dev_hyp_encoded = pad_sequences(tokenizer.texts_to_sequences(df_dev["hypothesis"]),
                                maxlen=max_length,
                                padding="post")

test_prem_encoded = pad_sequences(tokenizer.texts_to_sequences(df_test["premise"]),
                                  maxlen=max_length,
                                  padding="post")
test_hyp_encoded = pad_sequences(tokenizer.texts_to_sequences(df_test["hypothesis"]),
                                 maxlen=max_length,
                                 padding="post")

In [14]:
from sklearn import preprocessing

labels = list(df_train["gold_label"].value_counts().index)
le = preprocessing.LabelEncoder()
le.fit(labels)

df_train["gold_label"] = le.transform(df_train["gold_label"])
df_dev["gold_label"] = le.transform(df_dev["gold_label"])
df_test["gold_label"] = le.transform(df_test["gold_label"])

train_labels = df_train["gold_label"].values
dev_labels = df_dev["gold_label"].values
test_labels = df_test["gold_label"].values

In [15]:
train_labels

array([2, 0, 1, ..., 0, 2, 1])

In [16]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, concatenate, Input, Dense, Embedding, Bidirectional, Dropout, Attention, TimeDistributed, SpatialDropout1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import concatenate


### Create Word Embeddings using Glove

The glove.840B.300d.txt file was retrieved from the kaggle data page: https://www.kaggle.com/takuok/glove840b300dtxt.
The below portion of the code, which processes Glove and creates word embedding matrix, was adapted from KERAS SNLI baseline example (link: https://github.com/Smerity/keras_snli/blob/master/snli_rnn.py).


In [17]:
# hyperparameters

embedding_hidden_size = 300 # needs to be 300 since the dimension of Glove vector embeddings is (300, )
n_labels = len(df_train["gold_label"].value_counts().index)
vocab_size = len(tokenizer.word_counts)
activation_function = "relu" # maybe try tanh like the one in the paper 
dropout_lstm = 0.5
dropout_rate = 0.2
optimizer = 'Adadelta'
batch_size = 1
num_epoch = 1
l2_penalty_rate = 0.001
l2_penalty = l2(l2_penalty_rate)

##### If there is no processed glove embedding matrix

In [152]:
# open glove.840B.300d.txt and preprocess it to make it usable
# this will take some time 

embeddings_index = {}
f = open('glove.840B.300d.txt')
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()


In [153]:
# create embedding matrix 
GLOVE_STORE = 'precomputed_glove_weights_shortened'

embedding_matrix = np.zeros((vocab_size+1, embedding_hidden_size)) #(39832, 300)
for word, id_ in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word) #embedding_vector.shape - (300,)
    if embedding_vector is not None: # words not found in embedding index will be all-zeros.
        embedding_matrix[id_] = embedding_vector
    else:
        print('Missing from GloVe: {}'.format(word))
        

Missing from GloVe: man's
Missing from GloVe: woman's
Missing from GloVe: children's
Missing from GloVe: girl's
Missing from GloVe: farmer's
Missing from GloVe: women's
Missing from GloVe: child's
Missing from GloVe: giraffe's
Missing from GloVe: there's
Missing from GloVe: streeta
Missing from GloVe: people's
Missing from GloVe: boy's
Missing from GloVe: who's
Missing from GloVe: person's
Missing from GloVe: mother's
Missing from GloVe: watera
Missing from GloVe: someone's
Missing from GloVe: he's
Missing from GloVe: dad's
Missing from GloVe: they're
Missing from GloVe: bikea
Missing from GloVe: horse's
Missing from GloVe: john's
Missing from GloVe: american's
Missing from GloVe: says'
Missing from GloVe: isn't
Missing from GloVe: joule'
Missing from GloVe: rachofsky
Missing from GloVe: beacha
Missing from GloVe: amazementa
Missing from GloVe: realtors'
Missing from GloVe: rottwieler
Missing from GloVe: groom's
Missing from GloVe: waterthe
Missing from GloVe: buildinga
Missing from Gl

Missing from GloVe: logtwo
Missing from GloVe: arcadea
Missing from GloVe: personthe
Missing from GloVe: camerasthe
Missing from GloVe: beachtwo
Missing from GloVe: gentlemen's'
Missing from GloVe: muchthe
Missing from GloVe: freesbies
Missing from GloVe: boardan
Missing from GloVe: sidethree
Missing from GloVe: forestthe
Missing from GloVe: villagethe
Missing from GloVe: couchdog
Missing from GloVe: outfita
Missing from GloVe: scootera
Missing from GloVe: geara
Missing from GloVe: staircasepeople
Missing from GloVe: kimino
Missing from GloVe: hospitala
Missing from GloVe: carsa
Missing from GloVe: i'll
Missing from GloVe: rallya
Missing from GloVe: chairsthe
Missing from GloVe: alcoholsome
Missing from GloVe: runner's
Missing from GloVe: seat4
Missing from GloVe: jumpthe
Missing from GloVe: seidwalk
Missing from GloVe: sidewalkthe
Missing from GloVe: paintinga
Missing from GloVe: somethingsome
Missing from GloVe: midairthe
Missing from GloVe: telephonea
Missing from GloVe: tree's
Miss

Missing from GloVe: mom's
Missing from GloVe: 1980's
Missing from GloVe: sinkboy
Missing from GloVe: stagethey
Missing from GloVe: watchingthere
Missing from GloVe: watchingtoday's
Missing from GloVe: watchingthe
Missing from GloVe: scrimmagethere
Missing from GloVe: taekondo
Missing from GloVe: princecess
Missing from GloVe: 40's
Missing from GloVe: structurethe
Missing from GloVe: structurethere
Missing from GloVe: structurechildren
Missing from GloVe: seatsome
Missing from GloVe: seatwomen
Missing from GloVe: seatsullen
Missing from GloVe: themtwo
Missing from GloVe: otuside
Missing from GloVe: cat's
Missing from GloVe: spelunkersthe
Missing from GloVe: spelunkersthree
Missing from GloVe: spelunkersa
Missing from GloVe: fireworkspeople
Missing from GloVe: fireworkssad
Missing from GloVe: fireworksdogs
Missing from GloVe: sidewalkpeople
Missing from GloVe: arcadethe
Missing from GloVe: quarterback's
Missing from GloVe: year's
Missing from GloVe: cementi
Missing from GloVe: cjild
Miss

In [154]:
np.save(GLOVE_STORE, embedding_matrix)

##### If there is processed glove embedding matrix in the directory

In [18]:
GLOVE_STORE = 'precomputed_glove_weights_shortened'


In [19]:
embedding_matrix = np.load(GLOVE_STORE + '.npy')

In [20]:
embedding_matrix.shape

(13013, 300)

In [21]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, concatenate, BatchNormalization, Input, Dense, Embedding, Bidirectional, Dropout, Attention, TimeDistributed, SpatialDropout1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import concatenate



### LSTM (concatenation only)

In [159]:
premise = Input(shape=(max_length,), dtype='int32')
hypothesis = Input(shape=(max_length,), dtype='int32')

premise2 = Embedding(vocab_size+1, 
                     embedding_hidden_size, 
                     weights=[embedding_matrix], 
                     input_length=max_length, 
                     trainable=False)(premise)
hypothesis2 = Embedding(vocab_size+1, 
                     embedding_hidden_size, 
                     weights=[embedding_matrix], 
                     input_length=max_length, 
                     trainable=False)(hypothesis)

premise3 = Dense(units=embedding_hidden_size, 
                 activation=activation_function)(premise2)
hypothesis3 = Dense(units=embedding_hidden_size, 
                    activation=activation_function)(hypothesis2)

premise4 = LSTM(units=embedding_hidden_size, 
                dropout=dropout_lstm)(premise3)
hypothesis4 = LSTM(units=embedding_hidden_size, 
                   dropout=dropout_lstm)(hypothesis3)

premise5 = BatchNormalization()(premise4)
hypothesis5 = BatchNormalization()(hypothesis4)

joint = concatenate([premise5, hypothesis5], axis=1)
joint = Dropout(dropout_rate)(joint)

#####
#for i in range(3):
    #print(i)
    #joint = joined_dense_layer(joint)
    #joint = dropout_layer(joint)
    #joint = batch_normalization(joint)

####

pred = Dense(units=n_labels, 
             activation="softmax")(joint)

In [160]:
model = Model(inputs=[premise, hypothesis], outputs=pred) 
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_26 (InputLayer)           [(None, 152)]        0                                            
__________________________________________________________________________________________________
input_27 (InputLayer)           [(None, 152)]        0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 152, 300)     3903900     input_26[0][0]                   
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 152, 300)     3903900     input_27[0][0]                   
____________________________________________________________________________________________

In [161]:
import os

checkpoint_path = "training_2/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [162]:
train_prem_encoded.shape

(45948, 152)

In [163]:
model.fit(x=[train_prem_encoded, train_hyp_encoded], 
          y=train_labels, 
          batch_size=batch_size,
          epochs=2, 
          verbose=1, 
          callbacks=[cp_callback])

Epoch 1/2

Epoch 00001: saving model to training_2/cp.ckpt
Epoch 2/2

Epoch 00002: saving model to training_2/cp.ckpt


<tensorflow.python.keras.callbacks.History at 0x7feb950c83d0>

In [None]:
# manually save weights
path = 'checkpoints/lstm_training1'
model.save_weights(path)

In [None]:
# if loading from checkpoints
model = create_model()
model.load_weights(path)

In [164]:
train_prem_encoded.shape

(45948, 152)

In [165]:
loss, accuracy = model.evaluate([dev_prem_encoded, dev_hyp_encoded], dev_labels, batch_size=1)



In [166]:
# epochs = 2
print(loss)
print(accuracy)

1.098552942276001
0.33306238055229187


### Sum Embeddings (Concatenation only)

In [167]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, concatenate, BatchNormalization, Input, Dense, Embedding, Bidirectional, Dropout, Attention, TimeDistributed, SpatialDropout1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import concatenate

from tensorflow.keras.layers import Lambda


In [168]:
premise = Input(shape=(max_length,), dtype='int32')
hypothesis = Input(shape=(max_length,), dtype='int32')

premise2 = Embedding(vocab_size+1, 
                     embedding_hidden_size, 
                     weights=[embedding_matrix], 
                     input_length=max_length, 
                     trainable=False)(premise)
hypothesis2 = Embedding(vocab_size+1, 
                     embedding_hidden_size, 
                     weights=[embedding_matrix], 
                     input_length=max_length, 
                     trainable=False)(hypothesis)

premise3 = Dense(units=embedding_hidden_size, 
                 activation=activation_function)(premise2)
hypothesis3 = Dense(units=embedding_hidden_size, 
                    activation=activation_function)(hypothesis2)

premise4 = Lambda(lambda x: tf.keras.backend.sum(x, axis=1, keepdims=False), 
                  output_shape=(embedding_hidden_size,))(premise3)
hypothesis4 = Lambda(lambda x: tf.keras.backend.sum(x, axis=1, keepdims=False), 
                     output_shape=(embedding_hidden_size,))(hypothesis3)

premise5 = BatchNormalization()(premise4)
hypothesis5 = BatchNormalization()(hypothesis4)

joint = concatenate([premise5, hypothesis5], axis=1)
joint = Dropout(dropout_rate)(joint)

#####
#for i in range(3):
    #print(i)
    #joint = joined_dense_layer(joint)
    #joint = dropout_layer(joint)
    #joint = batch_normalization(joint)

####

pred = Dense(units=n_labels, 
             activation="softmax")(joint)

In [169]:
model_sum_embeddings = Model(inputs=[premise, hypothesis], 
                             outputs=pred) 
model_sum_embeddings.compile(optimizer=optimizer, 
                             loss='sparse_categorical_crossentropy', 
                             metrics=['accuracy'])

model_sum_embeddings.summary()


Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           [(None, 152)]        0                                            
__________________________________________________________________________________________________
input_29 (InputLayer)           [(None, 152)]        0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 152, 300)     3903900     input_28[0][0]                   
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 152, 300)     3903900     input_29[0][0]                   
____________________________________________________________________________________________

In [170]:
import os
checkpoint_path = "training_3/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)



In [171]:
model_sum_embeddings.fit(x=[train_prem_encoded, train_hyp_encoded], 
                         y=train_labels, 
                         batch_size=batch_size,
                         epochs=2, 
                         verbose=1, 
                         callbacks=[cp_callback])

Epoch 1/2

Epoch 00001: saving model to training_3/cp.ckpt
Epoch 2/2

Epoch 00002: saving model to training_3/cp.ckpt


<tensorflow.python.keras.callbacks.History at 0x7febae2db4f0>

In [172]:
loss, accuracy = model_sum_embeddings.evaluate([dev_prem_encoded, dev_hyp_encoded], dev_labels, batch_size=1)



In [173]:
# sum_embeddings, epochs = 2
print(loss)
print(accuracy)

31.668176651000977
0.36567771434783936


In [200]:
#from keras.utils import plot_model plot_model(model, to_file='model.png')

tf.keras.utils.plot_model(
    model_sum_embeddings, to_file='model.png'
)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


### Conneau et al. 3 matching methods

In [22]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, concatenate, BatchNormalization, Input, Dense, Embedding, Dropout, Attention, TimeDistributed, SpatialDropout1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import concatenate

from tensorflow.keras.layers import Lambda


In [23]:
def absolute_difference(tensors):
    x, y = tensors
    return tf.abs(x - y)

In [25]:
premise_a = Input(shape=(max_length,), dtype='int32')
hypothesis_a = Input(shape=(max_length,), dtype='int32')

premise2_a = Embedding(vocab_size+1, 
                     embedding_hidden_size, 
                     weights=[embedding_matrix], 
                     input_length=max_length, 
                     trainable=False)(premise_a)
hypothesis2_a = Embedding(vocab_size+1, 
                        embedding_hidden_size, 
                        weights=[embedding_matrix], 
                        input_length=max_length, 
                        trainable=False)(hypothesis_a)

premise3_a = Dense(units=embedding_hidden_size, 
                   activation=activation_function)(premise2_a)
hypothesis3_a = Dense(units=embedding_hidden_size, 
                    activation=activation_function)(hypothesis2_a)

premise4_a = LSTM(units=embedding_hidden_size, 
                dropout=dropout_lstm)(premise3_a)
hypothesis4_a = LSTM(units=embedding_hidden_size, 
                   dropout=dropout_lstm)(hypothesis3_a)

premise5_a = BatchNormalization()(premise4_a)
hypothesis5_a = BatchNormalization()(hypothesis4_a)

joint_concatenate = concatenate([premise5_a, hypothesis5_a], axis=1)
joint_concatenate2 = Dropout(dropout_rate)(joint_concatenate)

joint_difference = Lambda(absolute_difference)([premise5_a, hypothesis5_a])
joint_difference2 = Dropout(dropout_rate)(joint_difference)

joint_multiplied = tf.keras.layers.Multiply()([premise5_a, hypothesis5_a])
joint_multiplied2 = Dropout(dropout_rate)(joint_multiplied)

all_concatenate = concatenate([joint_concatenate2, joint_difference2, joint_multiplied2], axis=1)
all_concatenate2 = Dropout(dropout_rate)(all_concatenate)

#####
#for i in range(3):
    #print(i)
    #joint = joined_dense_layer(joint)
    #joint = dropout_layer(joint)
    #joint = batch_normalization(joint)

####

pred = Dense(units=n_labels, 
             activation="softmax")(all_concatenate2)

In [27]:
model_conneau = Model(inputs=[premise_a, hypothesis_a], outputs=pred) 
model_conneau.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model_conneau.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 152)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 152)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 152, 300)     3903900     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 152, 300)     3903900     input_2[0][0]                    
______________________________________________________________________________________________

In [28]:
import os
checkpoint_path = "training_4/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)


In [29]:
model_conneau.fit(x=[train_prem_encoded, train_hyp_encoded], 
                  y=train_labels, 
                  batch_size=1,
                  epochs=2, 
                  verbose=1, 
                  callbacks=[cp_callback])


Epoch 1/2

Epoch 00001: saving model to training_4/cp.ckpt
Epoch 2/2

Epoch 00002: saving model to training_4/cp.ckpt


<tensorflow.python.keras.callbacks.History at 0x7fdb1b562fa0>

In [30]:
loss, accuracy = model_conneau.evaluate([dev_prem_encoded, dev_hyp_encoded], dev_labels, batch_size=1)



In [31]:
print(loss)
print(accuracy)

1.0986130237579346
0.3382442593574524


### On the test set 

In [175]:
loss, accuracy = model_sum_embeddings.evaluate([test_prem_encoded, test_hyp_encoded], test_labels, batch_size=1)




In [176]:
print(loss)
print(accuracy)


31.9547119140625
0.368281751871109
