In [None]:
import pandas as pd
import numpy as np
from keras import preprocessing
from keras.preprocessing.text import Tokenizer
from imports import load_data, model

In [None]:
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
# NOTE(review): only NumPy is seeded here; the Keras backend and Python's `random`
# module are not — confirm whether they also need seeding for full reproducibility.
seed = 23
np.random.seed(seed)

First, load in the data.

In [None]:
# Load the SNLI data through the project helper; the bare `df` on the last line
# makes the loaded object the cell's displayed output.
# NOTE(review): presumably a pandas DataFrame (later cells index it by column name) — confirm.
df = load_data.get_snli()
df

Now we split the data into training, validation (dev), and testing sets.

In [None]:
# Two-stage split: first separate `test` from the full data, then split the
# remaining `train` portion again to obtain `dev`.
# NOTE(review): 0.7 is presumably the fraction kept for training — confirm against load_data.split_df.
train, test = load_data.split_df(df)
train, dev = load_data.split_df(train, 0.7)

In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and both models below.
max_words = 20000 # number of distinct words kept as features (vocabulary size)
maxlen = 20 # maximum length of a sentence, in tokens
embedding_dim = 100 # size of each word-embedding vector (passed to the model builders and embedding-matrix helper)

Now we must tokenize the sentences

In [None]:
# Build a single tokenizer from all three splits, passing `max_words`
# (presumably the vocabulary cap — confirm in load_data.get_tokenizer).
# NOTE(review): dev and test are passed in alongside train, so the vocabulary
# presumably reflects held-out text as well — confirm this is intended.
tokenizer = load_data.get_tokenizer([train, dev, test], max_words)

In [None]:
# Keep the tokenizer's word index for later use (it is passed to the
# embedding-matrix helper further down) and report how many entries it holds.
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

In [None]:
# Tokenize each split with the shared tokenizer and `maxlen`, in train/dev/test
# order; each result is unpacked into its two inputs (x1, x2).
(train_x1, train_x2), (dev_x1, dev_x2), (test_x1, test_x2) = tuple(
    load_data.tokenize_df(split, tokenizer, maxlen)
    for split in (train, dev, test)
)

In [None]:
# Sanity check: display the shape of the first tokenized training input.
train_x1.shape

In [None]:
# Sanity check: display the shape of the first tokenized test input.
test_x1.shape

Now let's set up the network. Since we are making a siamese network, let's first set up the base layer that is shared between the two inputs.

In [None]:
from keras.layers import Dense
from keras.models import Model
from keras import layers
import keras

`model.build_concate_input` builds a siamese network where the shared base layer learns word embeddings and uses an LSTM layer.

In [None]:
# Get the concatenated output tensor and the two input tensors from the project's builder.
concatenated, in1, in2 = model.build_concate_input(
    max_words,
    embedding_dim,
    maxlen,
)

# Single sigmoid unit on top of the concatenated representation.
predictions = Dense(units=1, activation='sigmoid')(concatenated)

# Two-input, one-output model whose embeddings are learned during training.
model_learn_embed = Model(
    inputs=[in1, in2],
    outputs=predictions,
)

Since we are predicting one of 2 classes `(0,1)`, use binary cross-entropy as the loss function.

In [None]:
# Configure training: binary cross-entropy loss, RMSprop optimizer, and
# accuracy reported alongside the loss.
model_learn_embed.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Print the model summary as a check before training.
model_learn_embed.summary()

Add callbacks to stop training when the validation accuracy gets worse and to save only the best model.

In [None]:
# Callbacks that watch *validation* accuracy rather than training accuracy.
# Training accuracy usually keeps rising while the model overfits, so monitoring
# it would rarely stop training and would checkpoint the most-overfit epoch.
# Keras logs validation metrics under a `val_` prefix; the key below follows the
# 'acc' naming of older Keras releases.
# NOTE(review): newer Keras releases log 'accuracy' / 'val_accuracy' instead —
# switch the key if Keras warns that 'val_acc' is not available.
callbacks_list = [
    # Stop once validation accuracy has gone `patience` epoch(s) without improving.
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=1,
    ),
    # Write the checkpoint file only when validation accuracy reaches a new best.
    keras.callbacks.ModelCheckpoint(
        filepath='SNLI_best_model_learned_embed.h5',
        monitor='val_acc',
        save_best_only=True,
    )
]

In [None]:
# Train on the train split and validate on dev each epoch. `epochs=100` is the
# upper limit passed to fit; the callbacks in `callbacks_list` are attached to the run.
history = model_learn_embed.fit(
    x=[train_x1, train_x2],
    y=train['isSim'],
    epochs=100,
    callbacks=callbacks_list,
    validation_data=([dev_x1, dev_x2], dev['isSim']),
)

In [None]:
# Score the trained model on the held-out test split.
# NOTE(review): this evaluates the in-memory model as it stands after training,
# which is presumably not the same weights as the saved "best" checkpoint — confirm which is wanted.
model_learn_embed.evaluate(x=[test_x1, test_x2], y=test['isSim'])

In [None]:
# Show the names of the values reported by the model, to label the evaluate() output above.
model_learn_embed.metrics_names

Now let's try using pretrained word embeddings.

In [None]:
# Build the embedding matrix from the tokenizer's word index via the project helper.
# NOTE(review): presumably shaped (max_words, embedding_dim) and filled from
# pretrained vectors — confirm in load_data.get_embedding_matrix.
embedding_matrix = load_data.get_embedding_matrix(word_index, max_words, embedding_dim)

In [None]:
# Build the second model from the embedding matrix; the helper also returns a
# callback list, and the file name is passed in as the last argument.
# NOTE: this rebinds `callbacks_list`, replacing the list defined for the first model.
model_pretrain, callbacks_list = model.build_snli(
    max_words,
    embedding_dim,
    maxlen,
    embedding_matrix,
    'SNLI_best_model_pretrain_embed.h5',
)

In [None]:
# Train the pretrained-embedding model on the same train/dev data, using the
# callbacks returned by model.build_snli.
history_pre = model_pretrain.fit(
    x=[train_x1, train_x2],
    y=train['isSim'],
    epochs=100,
    callbacks=callbacks_list,
    validation_data=([dev_x1, dev_x2], dev['isSim']),
)