In [38]:
import numpy as np
import pickle
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Input, Flatten, Dropout, LSTM, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, merge
from keras.models import Model
import keras
from utilities import my_callbacks
from utilities.data_helper import compute_recall_ks, str2bool

In [32]:
# --- Model dimensions ---------------------------------------------------
# emb_dim sizes the embedding vectors; hidden_size sizes the LSTM encoder.
emb_dim, hidden_size = 100, 300
n_recurrent_layers = 1

# --- Optimisation settings ----------------------------------------------
batch_size, n_epochs = 256, 50
optimizer = 'adam'
lr = 0.001

# --- Filesystem locations -----------------------------------------------
input_dir = './dataset/'
embedding_file = 'embeddings/glove.6B.100d.txt'
model_fname = 'model/dual_encoder_lstm_classifier.h5'
save_model = True

# --- Reproducibility ----------------------------------------------------
# Seeds NumPy's global RNG only.
seed = 1337
np.random.seed(seed)

In [15]:
# Build a word -> vector lookup from the pretrained embedding text file.
# Each usable line is "<word> <float> <float> ..."; the first token is the
# key and the remaining tokens are parsed as float32 coefficients.
embeddings_index = {}
# The context manager guarantees the file is closed even if parsing raises.
with open(embedding_file, 'r') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Best-effort load: skip lines whose coefficients are not numeric.
            continue
        embeddings_index[word] = coefs

In [33]:
# Load the preprocessing parameters saved alongside the dataset.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
with open(input_dir + 'params.pkl', 'rb') as params_file:
    MAX_SEQUENCE_LENGTH, MAX_NB_WORDS, word_index = pickle.load(params_file)

In [34]:
# Vocabulary size is capped at MAX_NB_WORDS, plus one extra row
# (presumably index 0 is reserved for padding -- confirm against the
# tokenizer that produced word_index).
num_words = 1 + min(MAX_NB_WORDS, len(word_index))

# Start from all zeros; rows are filled in from the pretrained vectors later.
embedding_matrix = np.zeros(shape=(num_words, emb_dim))

In [35]:
# Copy each pretrained vector into the row given by the word's index.
# Indices at or beyond MAX_NB_WORDS are ignored; words that have no entry
# in embeddings_index keep their all-zero row.
for word, i in word_index.items():
    if i < MAX_NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [52]:
# Shared encoder: an embedding layer initialised from embedding_matrix
# (and left trainable) followed by a single LSTM that returns its final
# state of size hidden_size.
encoder = Sequential()
encoder.add(Embedding(output_dim=emb_dim,
                      input_dim=num_words,
                      input_length=MAX_SEQUENCE_LENGTH,
                      weights=[embedding_matrix],
                      mask_zero=True,
                      trainable=True))
encoder.add(LSTM(units=hidden_size))

context_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
response_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# The same encoder instance is applied to both inputs, so the context and
# the response are encoded with shared weights.
context_branch = encoder(context_input)
response_branch = encoder(response_input)

# Bias-free linear projection of the context encoding. Its width must equal
# the LSTM output width for the dot product below, so it is tied to
# hidden_size rather than a literal 300.
predict_branch = Dense(hidden_size, use_bias=False)(context_branch)

# Despite the name, this is a dot product (not a concatenation) between the
# projected context and the response encoding: one score per example.
concatenated = keras.layers.dot([predict_branch, response_branch], axes=1)
out = Dense(1, activation="sigmoid")(concatenated)

dual_encoder = Model([context_input, response_input], out)
# NOTE(review): the optimizer is passed by name, so the `lr` setting from the
# configuration cell is not applied here -- the optimizer's default is used.
dual_encoder.compile(loss='binary_crossentropy',
                     optimizer=optimizer)

In [53]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
dual_encoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 160)          0                                            
__________________________________________________________________________________________________
sequential_10 (Sequential)      (None, 300)          48690100    input_19[0][0]                   
                                                                 input_20[0][0]                   
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 160)          0                                            
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 300)          90000       sequential_10[1][0]              
__________

In [55]:
def _load_split(split_name):
    """Load one pickled dataset split from input_dir and close the file.

    Returns whatever object '<input_dir><split_name>.pkl' contains; the
    callers below unpack it as a 3-tuple.
    NOTE: pickle can execute arbitrary code on load; only use trusted files.
    """
    with open(input_dir + split_name + '.pkl', 'rb') as split_file:
        return pickle.load(split_file)


# Each split unpacks into three parallel collections (named *_c, *_r, *_l;
# presumably contexts, responses and labels -- confirm against preprocessing).
train_c, train_r, train_l = _load_split('train')
test_c, test_r, test_l = _load_split('test')
dev_c, dev_r, dev_l = _load_split('dev')

print('Found %s training samples.' % len(train_c))
print('Found %s dev samples.' % len(dev_c))
print('Found %s test samples.' % len(test_c))

Found 1000000 training samples.
Found 195600 dev samples.
Found 189200 test samples.


In [56]:
# Train one epoch at a time so that the test set can be scored after every
# epoch and training can stop early when the tracked score stops improving.
histories = my_callbacks.Histories()

bestAcc = 0.0
patience = 0
# Number of non-improving epochs tolerated; training stops once exceeded.
max_patience = 10

print("\tbatch_size={}, nb_epoch={}".format(batch_size, n_epochs))

# Upper bound is n_epochs + 1 so that exactly n_epochs epochs can run
# (range(1, n_epochs) stopped one epoch short of the advertised count).
for ep in range(1, n_epochs + 1):

    dual_encoder.fit([train_c, train_r], train_l,
            batch_size=batch_size, epochs=1, callbacks=[histories],
            validation_data=([dev_c, dev_r], dev_l), verbose=1)

    # NOTE(review): reads the first entry of histories.accs; this is the
    # current epoch's score only if Histories resets `accs` on every fit()
    # call -- confirm in utilities.my_callbacks.
    curAcc = histories.accs[0]
    if curAcc >= bestAcc:
        bestAcc = curAcc
        patience = 0
    else:
        patience = patience + 1

    # classify the test set
    y_pred = dual_encoder.predict([test_c, test_r])

    print("Perform on test set after Epoch: " + str(ep) + "...!")
    recall_k = compute_recall_ks(y_pred[:,0])

    # stop training once more than max_patience epochs in a row have
    # failed to match or beat the best score
    if patience > max_patience:
        print("Early stopping at epoch: "+ str(ep))
        break

# The model saved here is the one from the last epoch run, not necessarily
# the one that achieved bestAcc.
if save_model:
    print("Now saving the model... at {}".format(model_fname))
    dual_encoder.save(model_fname)

	batch_size=256, nb_epoch=50
Train on 1000000 samples, validate on 195600 samples
Epoch 1/1
    512/1000000 [..............................] - ETA: 16:32:50 - loss: 0.6873

KeyboardInterrupt: 