In [1]:
# Execute the shared WikiQA setup notebook inside this kernel. According to the
# output it prints, it leaves `train_pack_raw`, `dev_pack_raw`, `test_pack_raw`,
# `ranking_task` and `glove_embedding` defined for the cells that follow.
# NOTE(review): later cells use `mz` and `np` without importing them in this
# notebook -- presumably init.ipynb imports them as well; confirm.
%run ./tutorials/wikiqa/init.ipynb

Using TensorFlow backend.


matchzoo version 2.1.0

data loading ...
data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`
`ranking_task` initialized with metrics [normalized_discounted_cumulative_gain@3(0.0), normalized_discounted_cumulative_gain@5(0.0), mean_average_precision(0.0)]
loading embedding ...
embedding loaded as `glove_embedding`


In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Restrict TensorFlow to the GPU with index "1" and let it grow its GPU memory
# allocation on demand rather than reserving everything at start-up.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(visible_device_list="1", allow_growth=True)
)
sess = tf.Session(config=config)
# Make this session the one Keras uses by default.
set_session(sess)

In [3]:
def load_filtered_data(preprocessor, data_type, seed=None):
    """Load one WikiQA split, preprocess it, filter unusable rows and shuffle.

    The split is loaded through ``mz.datasets.wiki_qa.load_data`` with
    ``task='ranking'``. The preprocessor is fitted only on the ``'train'``
    split; for ``'dev'`` and ``'test'`` it is applied as-is.

    Two filters are applied to the unpacked arrays, in this order:

    1. rows whose ``length_left`` or ``length_right`` is 0 are dropped;
    2. rows whose ``id_left`` has no row labelled 1 among the remaining rows
       are dropped.

    The surviving rows are then shuffled.

    Args:
        preprocessor: Object exposing ``fit_transform`` and ``transform``;
            each must return something with an ``unpack()`` method yielding
            ``(X, Y)``, where ``X`` maps column names to NumPy arrays and
            ``Y`` is a 2-D label array.
        data_type: One of ``'train'``, ``'dev'`` or ``'test'``.
        seed: Optional seed for the shuffle. ``None`` (the default) keeps the
            original behaviour of drawing from NumPy's global random state.

    Returns:
        Tuple ``(X, Y, preprocessor)`` with the filtered, shuffled data and
        the same preprocessor object that was passed in.

    Raises:
        AssertionError: If ``data_type`` is not a known split name.
    """
    assert data_type in ('train', 'dev', 'test'), (
        "data_type must be 'train', 'dev' or 'test', got %r" % (data_type,))
    data_pack = mz.datasets.wiki_qa.load_data(data_type, task='ranking')

    if data_type == 'train':
        X, Y = preprocessor.fit_transform(data_pack).unpack()
    else:
        X, Y = preprocessor.transform(data_pack).unpack()

    # Vectorised emptiness check. flatnonzero always yields an integer index
    # array, so a split in which every row is empty no longer crashes on a
    # float-typed empty index.
    has_text = ((np.asarray(X["length_left"]) != 0)
                & (np.asarray(X["length_right"]) != 0))
    non_empty_idx = np.flatnonzero(has_text)
    print("Removed empty data. Found ", (Y.shape[0] - non_empty_idx.shape[0]))

    for k in X.keys():
        X[k] = X[k][non_empty_idx]
    Y = Y[non_empty_idx]

    # Keep only questions that still have at least one positive answer.
    pos_mask = (Y == 1)[:, 0]
    # A set gives O(1) membership tests; the previous `qid in ndarray` scanned
    # the whole positive-id array once per row.
    pos_qids = set(np.asarray(X["id_left"][pos_mask]).tolist())
    keep_mask = np.array(
        [qid in pos_qids for qid in np.asarray(X["id_left"]).tolist()],
        dtype=bool)  # explicit dtype keeps an empty mask boolean
    keep_idx = np.flatnonzero(keep_mask)
    print("Removed questions with no pos label. Found ", np.count_nonzero(~keep_mask))

    print("shuffling...")
    # With no seed, fall back to the module-level generator exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(keep_idx)
    for k in X.keys():
        X[k] = X[k][keep_idx]
    Y = Y[keep_idx]

    return X, Y, preprocessor

In [4]:
# Pad/truncate questions to 20 tokens and answers to 40; keep stop words.
preprocessor = mz.preprocessors.BasicPreprocessor(
    fixed_length_left=20,
    fixed_length_right=40,
    remove_stop_words=False,
)

# The preprocessor is fitted on the training split and then reused unchanged
# for the dev and test splits.
train_X, train_Y, preprocessor = load_filtered_data(preprocessor, 'train')
val_X, val_Y, _ = load_filtered_data(preprocessor, 'dev')
pred_X, pred_Y, _ = load_filtered_data(preprocessor, 'test')

Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2118/2118 [00:00<00:00, 12754.26it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 18841/18841 [00:02<00:00, 6500.31it/s]
Processing text_right with append: 100%|██████████| 18841/18841 [00:00<00:00, 1215206.55it/s]
Building FrequencyFilter from a datapack.: 100%|██████████| 18841/18841 [00:00<00:00, 185258.28it/s]
Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 184455.70it/s]
Processing text_left with extend: 100%|██████████| 2118/2118 [00:00<00:00, 922581.36it/s]
Processing text_right with extend: 100%|██████████| 18841/18841 [00:00<00:00, 1082236.12it/s]
Building Vocabulary from a datapack.: 100%|██████████| 404432/404432 [00:00<00:00, 3795031.47it/s]
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2118/2118 [00:00<00:00, 13650.60it/s]
Processing

Removed empty data. Found  38


Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 296/296 [00:00<00:00, 14135.26it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval:   0%|          | 0/2708 [00:00<?, ?it/s]

Removed questions with no pos label. Found  11672
shuffling...


Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2708/2708 [00:00<00:00, 6731.87it/s]
Processing text_right with transform: 100%|██████████| 2708/2708 [00:00<00:00, 168473.93it/s]
Processing text_left with transform: 100%|██████████| 296/296 [00:00<00:00, 204701.40it/s]
Processing text_right with transform: 100%|██████████| 2708/2708 [00:00<00:00, 159066.95it/s]
Processing length_left with len: 100%|██████████| 296/296 [00:00<00:00, 442607.48it/s]
Processing length_right with len: 100%|██████████| 2708/2708 [00:00<00:00, 1038699.15it/s]
Processing text_left with transform: 100%|██████████| 296/296 [00:00<00:00, 149130.81it/s]
Processing text_right with transform: 100%|██████████| 2708/2708 [00:00<00:00, 140864.36it/s]
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 633/633 [00:00<00:00, 12189.39it/s]

Removed empty data. Found  2
Removed questions with no pos label. Found  1601
shuffling...



Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 5961/5961 [00:00<00:00, 7064.16it/s]
Processing text_right with transform: 100%|██████████| 5961/5961 [00:00<00:00, 187399.25it/s]
Processing text_left with transform: 100%|██████████| 633/633 [00:00<00:00, 259733.36it/s]
Processing text_right with transform: 100%|██████████| 5961/5961 [00:00<00:00, 160878.23it/s]
Processing length_left with len: 100%|██████████| 633/633 [00:00<00:00, 688714.51it/s]
Processing length_right with len: 100%|██████████| 5961/5961 [00:00<00:00, 1166965.98it/s]
Processing text_left with transform: 100%|██████████| 633/633 [00:00<00:00, 158526.06it/s]
Processing text_right with transform: 100%|██████████| 5961/5961 [00:00<00:00, 137558.64it/s]


Removed empty data. Found  18
Removed questions with no pos label. Found  3805
shuffling...


In [5]:
from keras.optimizers import Adam
import matchzoo

# Instantiate the ESIM model from MatchZoo's contrib package.
model = matchzoo.contrib.models.ESIM()

# The task may be either mz.tasks.Ranking() or a two-class
# mz.tasks.Classification; the classification variant is selected here.
model.params['task'] = mz.tasks.Classification(num_classes=2)
# Copy the fitted preprocessor's context into the params (the original note
# says this supplies `input_shapes` and `embedding_input_dim`).
model.params.update(preprocessor.context)

# Hyper-parameters, applied one by one in the order listed.
esim_settings = [
    ('mask_value', 0),
    ('lstm_dim', 300),
    ('embedding_output_dim', 300),
    ('embedding_trainable', False),
    ('dropout_rate', 0.5),
    ('mlp_num_units', 300),
    ('mlp_num_layers', 0),
    ('mlp_num_fan_out', 300),
    ('mlp_activation_func', 'tanh'),
    ('optimizer', Adam(lr=1e-4)),
]
for param_name, param_value in esim_settings:
    model.params[param_name] = param_value

model.guess_and_fill_missing_params()
model.build()
model.compile()
model.backend.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_left (InputLayer)          (None, 20)           0                                            
__________________________________________________________________________________________________
text_right (InputLayer)         (None, 40)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             5002500     text_left[0][0]                  
                                                                 text_right[0][0]                 
__________________________________________________________________________________________________
dropout_1 (Dropout)             multiple             0           embedding[0][0]                  
          

In [6]:
# Term -> index mapping stored in the fitted preprocessor's vocabulary unit.
term_index = preprocessor.context['vocab_unit'].state['term_index']
# Build the embedding matrix from GloVe using a constant-0 initializer, then
# hand it to the model.
embedding_matrix = glove_embedding.build_matrix(term_index, initializer=lambda: 0)
model.load_embedding_matrix(embedding_matrix)

In [8]:
# Train as a ranking task and report the ranking metrics after every epoch.
# NOTE(review): this cell carries execution count 8 but sits before the cell
# with count 7, which one-hot encodes `train_Y`/`val_Y`. The recorded output
# was therefore probably produced with one-hot labels; on a fresh
# "Restart & Run All" this cell would see the labels as returned by
# `load_filtered_data` instead -- confirm it still runs in file order.
# NOTE(review): the task is reassigned here after `model.build()` and
# `model.compile()` already ran with a Classification task; whether the
# compiled loss/output layer picks up the change is not visible here -- confirm.
model.params['task'] = mz.tasks.Ranking()
# NOTE(review): the callback scores on `pred_X`/`pred_Y`, which were loaded
# from the 'test' split, although the log lines it prints say "Validation".
evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                           x=pred_X,
                                           y=pred_Y,
                                           once_every=1,
                                           batch_size=len(pred_Y))
history = model.fit(x = [train_X['text_left'],
                         train_X['text_right']],                  # model inputs are (None, 20) and (None, 40)
                    y = train_Y,                                  # labels; shape depends on cell order (see note above)
                    validation_data = (val_X, val_Y),
                    callbacks=[evaluate],
                    batch_size = 32,
                    epochs = 10)

Train on 8650 samples, validate on 1130 samples
Epoch 1/10
Validation: mean_average_precision(0.0): 0.6377925262180991
Epoch 2/10
Validation: mean_average_precision(0.0): 0.6323746460063332
Epoch 3/10
Validation: mean_average_precision(0.0): 0.6447892278707743
Epoch 4/10
Validation: mean_average_precision(0.0): 0.6645210508066117
Epoch 5/10
Validation: mean_average_precision(0.0): 0.6622282952529867
Epoch 6/10
Validation: mean_average_precision(0.0): 0.6654279587941297
Epoch 7/10
Validation: mean_average_precision(0.0): 0.6668269018575894
Epoch 8/10
Validation: mean_average_precision(0.0): 0.6723704781393599
Epoch 9/10
Validation: mean_average_precision(0.0): 0.6552521148587158
Epoch 10/10
Validation: mean_average_precision(0.0): 0.6695447388956829


In [7]:
# train as classification task 

from keras.utils import to_categorical
# One-hot encode the binary relevance labels for the two-class classifier.
# The guard makes this cell safe to re-run: calling to_categorical on labels
# that are already one-hot (shape (n, 2)) would add a third axis. Passing
# num_classes explicitly keeps the width at 2 even if a split happened to
# contain a single class.
if train_Y.ndim == 1 or train_Y.shape[-1] == 1:
    train_Y = to_categorical(train_Y, num_classes=2)
if val_Y.ndim == 1 or val_Y.shape[-1] == 1:
    val_Y = to_categorical(val_Y, num_classes=2)

model.params['task'] = mz.tasks.Classification(num_classes=2)

history = model.fit(x = [train_X['text_left'],
                         train_X['text_right']],                  # model inputs are (None, 20) and (None, 40)
                    y = train_Y,                                  # one-hot labels, (n_train, 2)
                    validation_data = (val_X, val_Y),
                    batch_size = 32,
                    epochs = 10)

Train on 8650 samples, validate on 1130 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
