# Quora question pairs: training a model with attention

## Import packages

In [None]:
# Shell escape: print the NVIDIA driver's report for the GPU attached to this
# runtime (device name, driver/CUDA versions, memory and utilisation).
!nvidia-smi

Sat Jun 27 12:48:08 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import pandas as pd
import datetime, time, json
from keras.models import Model
from keras.layers import Input, Bidirectional, LSTM, dot, Flatten, Dense, Reshape, add, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Initialize global variables

In [None]:
# ---- Model / sequence dimensions -------------------------------------------
MAX_SEQUENCE_LENGTH = 25   # length of each question's token-id sequence fed to Input()
WORD_EMBEDDING_DIM = 300   # width of one word-embedding row
SENT_EMBEDDING_DIM = 128   # units of each LSTM inside the Bidirectional wrapper

# ---- Optimisation settings --------------------------------------------------
NB_EPOCHS = 25
BATCH_SIZE = 2048          # an alternative value of 516 was left disabled by the author
DROPOUT = 0.2              # rate used by every Dropout layer in the dense stack
VALIDATION_SPLIT = 0.2     # fraction handed to model.fit(validation_split=...)
# A TEST_SPLIT of 0.1 was left disabled by the author; the only place it is
# referenced is the commented-out train_test_split call.
RNG_SEED = 13371447        # NOTE(review): not referenced by any active code in the cells shown

# ---- Files ------------------------------------------------------------------
# Pre-computed training inputs (token-id arrays, labels, embedding table, vocabulary size).
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
# Destination of the ModelCheckpoint callback; note this is an absolute path.
MODEL_WEIGHTS_FILE = '/content/question_pairs_weights.h5'

## Load the dataset, embedding matrix and word count

In [None]:
# Load the pre-computed training arrays. The path is passed directly to
# np.load so NumPy opens *and closes* each file itself; wrapping the path in a
# bare open(...) would leave the file handle open until garbage collection.
q1_data = np.load(Q1_TRAINING_DATA_FILE)
q2_data = np.load(Q2_TRAINING_DATA_FILE)
labels = np.load(LABEL_TRAINING_DATA_FILE)
word_embedding_matrix = np.load(WORD_EMBEDDING_MATRIX_FILE)

# The vocabulary size is stored under the 'nb_words' key of a small JSON file.
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

## Assemble the training inputs (the train/test split is currently disabled)

In [None]:
# Put the two question arrays side by side along a new axis 1, so that
# X_train[:, 0] holds question 1 and X_train[:, 1] holds question 2.
X_train = np.stack([q1_data, q2_data], axis=1)
y_train = labels

# Per-question views that are fed to the two model inputs.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]

# Disabled by the author: hold-out test split via train_test_split.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)
# Q1_test = X_test[:,0]
# Q2_test = X_test[:,1]


In [None]:
# Load the dev-set counterparts of the training artefacts. Paths go straight
# to np.load so each file is opened and closed by NumPy instead of leaking an
# open handle from a bare open(...) call.
q1_data_dev = np.load("/content/q1_dev.npy")
q2_data_dev = np.load("/content/q2_dev.npy")
labels_dev = np.load("/content/label_dev.npy")
word_embedding_matrix_dev = np.load("/content/word_embedding_matrix_dev.npy")
with open("/content/nb_words_dev.json", 'r') as f:
    nb_words_dev = json.load(f)['nb_words']

# Same layout as the training arrays: axis 1 selects question 1 / question 2.
X_dev = np.stack((q1_data_dev, q2_data_dev), axis=1)
y_dev = labels_dev
Q1_dev = X_dev[:,0]
Q2_dev = X_dev[:,1]

In [None]:
# Build one embedding table for the model: the train rows come first and the
# dev rows are appended after them.
all_word_embeddings = np.vstack((word_embedding_matrix, word_embedding_matrix_dev))
all_nb_words = nb_words_dev + nb_words

# Because the dev rows sit *after* the train rows, a dev token id i must look
# up row (number of train rows + i) of the stacked table. Without this shift
# the dev questions would be embedded with vectors of unrelated train words.
# NOTE(review): assumes the dev ids index rows of word_embedding_matrix_dev
# (i.e. dev was tokenised separately from train) - confirm against the
# preprocessing step.
DEV_ID_OFFSET = word_embedding_matrix.shape[0]

# Derived from the raw dev arrays (not from Q1_dev/Q2_dev themselves) so that
# re-running this cell never applies the shift twice.
Q1_dev = q1_data_dev + DEV_ID_OFFSET
Q2_dev = q2_data_dev + DEV_ID_OFFSET
X_dev = np.stack((Q1_dev, Q2_dev), axis=1)

# Every shifted id must still address a row of the stacked table.
assert max(Q1_dev.max(), Q2_dev.max()) < all_word_embeddings.shape[0], \
    "dev token id falls outside the combined embedding matrix"

## Define the model

In [None]:
def _encode_question(token_ids):
    """Embed one question with the frozen word vectors and run it through a BiLSTM.

    A new Embedding layer and a new Bidirectional LSTM are created on every
    call, so the two question branches do not share weights. The forward and
    backward LSTM outputs are summed, and the full output sequence is returned.
    """
    embedded = Embedding(all_nb_words + 2,
                         WORD_EMBEDDING_DIM,
                         weights=[all_word_embeddings],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=False)(token_ids)
    recurrent = LSTM(SENT_EMBEDDING_DIM, return_sequences=True)
    return Bidirectional(recurrent, merge_mode="sum")(embedded)


# One integer token-id sequence per question.
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

q1 = _encode_question(question1)
q2 = _encode_question(question2)

# Interaction between the two encoded questions: contract both sequences over
# axis 1, then project the flattened result back to the shape of one encoded
# sequence so it can be added element-wise to q1.
attention = dot([q1, q2], axes=[1, 1])
attention = Flatten()(attention)
attention = Dense(MAX_SEQUENCE_LENGTH * SENT_EMBEDDING_DIM)(attention)
attention = Reshape((MAX_SEQUENCE_LENGTH, SENT_EMBEDDING_DIM))(attention)

# Only the question-1 encoding is combined with the attention output.
merged = add([q1, attention])
merged = Flatten()(merged)

# Four identical Dense(200, relu) -> Dropout -> BatchNormalization stages.
for _ in range(4):
    merged = Dense(200, activation='relu')(merged)
    merged = Dropout(DROPOUT)(merged)
    merged = BatchNormalization()(merged)

# Single sigmoid unit: the predicted is-duplicate probability.
is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer table (output shapes, parameter counts, connections)
# for the model compiled above.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 25, 300)      34107900    input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 25, 300)      34107900    input_4[0][0]                    
____________________________________________________________________________________________

## Train the model, checkpointing weights with best validation accuracy

In [None]:
# Overwrite the checkpoint file only when validation accuracy improves, so the
# file always holds the best epoch seen so far.
checkpoint = ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_accuracy', save_best_only=True)
callbacks = [checkpoint]

print("Starting training at", datetime.datetime.now())
t0 = time.time()
history = model.fit(
    x=[Q1_train, Q2_train],
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    verbose=2,  # one summary line per epoch
    callbacks=callbacks,
    validation_split=VALIDATION_SPLIT,
)
t1 = time.time()
print("Training ended at", datetime.datetime.now())

# Wall-clock duration of the fit call, in minutes.
elapsed_minutes = (t1 - t0) / 60.
print("Minutes elapsed: %f" % elapsed_minutes)

Starting training at 2020-06-27 16:26:36.554302
Train on 290541 samples, validate on 72636 samples
Epoch 1/25
 - 40s - loss: 0.5798 - accuracy: 0.6939 - val_loss: 0.5108 - val_accuracy: 0.7381
Epoch 2/25
 - 39s - loss: 0.4668 - accuracy: 0.7673 - val_loss: 0.4598 - val_accuracy: 0.7715
Epoch 3/25
 - 39s - loss: 0.4176 - accuracy: 0.7964 - val_loss: 0.4425 - val_accuracy: 0.7812
Epoch 4/25
 - 38s - loss: 0.3773 - accuracy: 0.8207 - val_loss: 0.4221 - val_accuracy: 0.8023
Epoch 5/25
 - 38s - loss: 0.3378 - accuracy: 0.8434 - val_loss: 0.4076 - val_accuracy: 0.8054
Epoch 6/25
 - 38s - loss: 0.2988 - accuracy: 0.8654 - val_loss: 0.4230 - val_accuracy: 0.8131
Epoch 7/25
 - 38s - loss: 0.2605 - accuracy: 0.8852 - val_loss: 0.4434 - val_accuracy: 0.8126
Epoch 8/25
 - 38s - loss: 0.2240 - accuracy: 0.9047 - val_loss: 0.4983 - val_accuracy: 0.8158
Epoch 9/25
 - 38s - loss: 0.1914 - accuracy: 0.9201 - val_loss: 0.5104 - val_accuracy: 0.8145
Epoch 10/25
 - 38s - loss: 0.1615 - accuracy: 0.9341 - 

## Plot training and validation accuracy

In [None]:
# One row per epoch (1-based) with the training and validation accuracy that
# Keras recorded in the History object.
acc = pd.DataFrame({'epoch': [i + 1 for i in history.epoch],
                    'training': history.history['accuracy'],
                    'validation': history.history['val_accuracy']})

# Draw both curves against the epoch number. figsize must be an ordered
# (width, height) tuple; the previously disabled call passed a set literal.
ax = acc.plot(x='epoch', figsize=(8, 5), grid=True)
ax.set_title("Training vs. validation accuracy")
ax.set_ylabel("accuracy")
ax.set_ylim([0.0, 1.0]);

## Print best validation accuracy and epoch

In [None]:
# Locate the best validation accuracy. np.argmax returns the FIRST epoch that
# reaches the maximum, which is the epoch ModelCheckpoint(save_best_only=True)
# keeps on disk (it only overwrites on a strict improvement). Taking
# max((val, idx)) would instead report the latest epoch when values tie.
val_accuracy_per_epoch = history.history['val_accuracy']
idx = int(np.argmax(val_accuracy_per_epoch))
max_val_acc = val_accuracy_per_epoch[idx]
print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

Maximum accuracy at epoch 19 = 0.8224


## Evaluate the model with best validation accuracy on the dev set

In [None]:
# Restore the checkpoint written during training, then score the dev pairs.
model.load_weights(MODEL_WEIGHTS_FILE)

# evaluate() yields the compiled loss followed by the compiled metrics
# (here: accuracy), in that order.
dev_scores = model.evaluate(x=[Q1_dev, Q2_dev], y=y_dev, verbose=0)
loss, accuracy = dev_scores
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 1.8867, accuracy = 0.6750


In [None]:
!cat question_pairs_weights.h5

cat: question_pairs_weights.h5: No such file or directory


In [None]:
# Show which metric series the History object recorded, one name per line.
for metric_name in history.history.keys():
    print(metric_name)

val_loss
val_accuracy
loss
accuracy
