In [1]:
import keras
import pandas as pd
import numpy as np
import matchzoo as mz

Using TensorFlow backend.


In [2]:
def read_data(path, include_label, encoding='utf-8'):
    """Read a WikiQA-style TSV file into a list of tuples.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line must contain exactly seven tab-separated
    columns, which are unpacked positionally as
    ``(qid, q, _, _, did, d, label)``; the third and fourth columns are
    ignored.

    Args:
        path: Path of the TSV file to read.
        include_label: If true, each returned tuple is
            ``(qid, did, q, d, float(label))``; otherwise the label column
            is dropped and each tuple is ``(qid, did, q, d)``.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        A list with one tuple per data row, in file order. A file that is
        empty or only contains the header yields an empty list.

    Raises:
        ValueError: If a non-blank row does not have exactly seven columns,
            or if a label cannot be converted to ``float``.
    """
    expected_columns = 7

    def scan_file():
        with open(path, encoding=encoding) as in_file:
            # A default is required here: a bare next() on an empty file
            # raises StopIteration inside this generator, which Python 3.7+
            # turns into a RuntimeError (PEP 479).
            next(in_file, None)  # skip header
            # Data rows start on line 2 because line 1 is the header.
            for line_no, raw_line in enumerate(in_file, start=2):
                line = raw_line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                fields = line.split('\t')
                if len(fields) != expected_columns:
                    raise ValueError(
                        '%s, line %d: expected %d tab-separated columns, got %d'
                        % (path, line_no, expected_columns, len(fields))
                    )
                yield fields

    if include_label:
        return [(qid, did, q, d, float(label))
                for qid, q, _, _, did, d, label in scan_file()]
    return [(qid, did, q, d) for qid, q, _, _, did, d, _ in scan_file()]

In [3]:
# Directory holding the WikiQA corpus TSV files; defined once so both splits
# are guaranteed to be read from the same location.
WIKIQA_DIR = '../../../data/WikiQA/WikiQACorpus'

# Training rows keep their label; the rows used for prediction drop it.
train = read_data(WIKIQA_DIR + '/WikiQA-train.tsv', include_label=True)
predict = read_data(WIKIQA_DIR + '/WikiQA-test.tsv', include_label=False)

In [4]:
# Number of leading rows of each split that are packed, which keeps the demo
# fast. Set to None to pack the full lists (a `[:None]` slice keeps every row).
N_SAMPLES = 1000

train_pack = mz.pack(train[:N_SAMPLES])
predict_pack = mz.pack(predict[:N_SAMPLES])

In [5]:
# Create the DRMM-TKS preprocessor and fit it on the training pack only.
# The progress log printed by this cell shows a tokenize / lowercase /
# punctuation-removal / stop-word-removal chain followed by a vocabulary fit.
# The fit() call is the cell's last expression, so its return value (a
# DRMMTKSPreprocessor, per the repr below) is displayed.
preprocessor = mz.preprocessors.DRMMTKSPreprocessor()
preprocessor.fit(train_pack)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit: 100%|██████████| 96/96 [00:00<00:00, 3641.88it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3401.25it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 118671.73it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 289601.88it/s]
Fitting vocabulary unit.: 100%|██████████| 13287/13287 [00:00<00:00, 1805069.55it/s]


<matchzoo.preprocessors.drmmtks_preprocessor.DRMMTKSPreprocessor at 0x11a753668>

In [6]:
# Run the fitted preprocessor over both packs, training pack first and
# prediction pack second, and bind the results in that same order.
train_pack_processed, predict_pack_processed = (
    preprocessor.transform(pack) for pack in (train_pack, predict_pack)
)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => VocabularyUnit: 100%|██████████| 96/96 [00:00<00:00, 5662.08it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => VocabularyUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3284.74it/s]
Processing text_left with chain_transform of FixedLengthUnit: 100%|██████████| 96/96 [00:00<00:00, 61230.72it/s]
Processing text_right with chain_transform of FixedLengthUnit: 100%|██████████| 1000/1000 [00:00<00:00, 73238.65it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => VocabularyUnit: 100%|██████████| 108/108 [00:00<00:00, 5857.81it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => VocabularyUnit: 100%|██████████| 992/992 [00:00<00:00, 3367.20it/s]
Processing text_left with chain

In [7]:
# Wrap the processed training pack in a generator that serves batches of 32.
# NOTE(review): the first batch shown in the next cell is not in file order,
# so the generator presumably shuffles by default; confirm against the
# MatchZoo documentation.
train_generator = mz.DataGenerator(train_pack_processed, batch_size=32)

In [8]:
# Demonstrate the ways a batch can be requested from the generator:
# by a non-negative index, by a negative index, and by a slice.
first_batch = train_generator[0]
last_batch = train_generator[-1]
batch_slice = train_generator[2:5]
# A batch unpacks into two parts, bound here to X and y.
X, y = first_batch
# Display X; the output below shows a dict of arrays keyed by field name.
X

{'id_left': array(['Q122', 'Q40', 'Q70', 'Q90', 'Q44', 'Q19', 'Q16', 'Q106', 'Q133',
        'Q78', 'Q133', 'Q62', 'Q32', 'Q58', 'Q41', 'Q40', 'Q58', 'Q118',
        'Q74', 'Q26', 'Q116', 'Q122', 'Q15', 'Q58', 'Q108', 'Q76', 'Q111',
        'Q77', 'Q28', 'Q29', 'Q58', 'Q41'], dtype='<U4'),
 'text_left': array([[   0,    0,    0,    0,    0,    0,    0,    0, 1940, 3766],
        [   0,    0,    0,    0,    0,  600, 4132, 3135,  701, 2812],
        [   0,    0,    0,    0,    0,    0,  292,  279, 1345,  753],
        [   0,    0,    0,    0,    0,    0,    0,    0, 3052, 2782],
        [   0,    0,    0,    0,    0,    0, 4521, 4223, 4590, 4988],
        [   0,    0,  292, 4381, 2461, 3067, 4296, 3508, 2603, 1296],
        [   0,    0,    0,    0,    0,    0, 3954, 3908, 4295, 1088],
        [   0,    0,    0,    0, 4538, 4490, 4766, 4580, 2521, 2173],
        [   0,    0,    0,    0,    0,    0,    0,    0, 3487, 4046],
        [   0,    0,    0,  292,  820, 2362, 3771, 4729,  292,  17

In [9]:
# Inspect the first raw training record. unpack() returns a pair: a dict of
# id/text arrays and an array holding the label (see the output below).
train_pack[0].unpack()

({'id_left': array(['Q1'], dtype='<U2'),
  'text_left': array(['how are glacier caves formed?'], dtype='<U29'),
  'id_right': array(['D1-0'], dtype='<U4'),
  'text_right': array(['A partly submerged glacier cave on Perito Moreno Glacier .'],
        dtype='<U58')},
 array([0.]))

In [10]:
# Inspect the same record after preprocessing. In the output below the text
# fields are integer arrays of length 10 (left) and 20 (right), with zeros
# filling the leading positions.
train_pack_processed[0].unpack()

({'id_left': array(['Q1'], dtype='<U2'),
  'text_left': array([[   0,    0,    0,    0,    0,    0,    0,  228,  748, 2221]]),
  'id_right': array(['D1-0'], dtype='<U4'),
  'text_right': array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,  981, 1696,  228, 2969, 4555, 2195,  228]])},
 array([0.]))

In [11]:
# Ranking task scored with MAE, MAP, precision (once by name, once with an
# explicit cutoff of 3) and NDCG at cutoffs 1, 3 and 5.
ranking_task = mz.tasks.Ranking()
ranking_task.metrics = ['mae', 'map', 'precision', mz.metrics.Precision(k=3)] + [
    mz.metrics.NormalizedDiscountedCumulativeGain(k=cutoff)
    for cutoff in (1, 3, 5)
]

In [12]:
# Configure, build and compile the DRMM-TKS model.
model = mz.models.DRMMTKSModel()
# The input shapes and vocabulary size come from the preprocessor's context,
# so the model matches the data it was fitted on.
input_shapes = preprocessor.context['input_shapes']
vocab_size = preprocessor.context['vocab_size']
model.params['input_shapes'] = input_shapes
model.params['task'] = ranking_task
model.params['vocab_size'] = vocab_size
# Presumably fills every parameter not set explicitly above; verify against
# the MatchZoo documentation.
model.guess_and_fill_missing_params()
# build() and compile() run after all parameters are set; the layer shapes
# printed below are emitted while this cell runs.
model.build()
model.compile()

[Layer]: input_left	[Shape]: [None, 10]

[Layer]: input_right	[Shape]: [None, 20]

[Layer]: embed_left	[Shape]: [None, 10, 300]

[Layer]: embed_right	[Shape]: [None, 20, 300]

[Layer]: attention_probs	[Shape]: [None, 10, 1]

[Layer]: matching_matrix	[Shape]: [None, 10, 20]

[Layer]: matching_topk	[Shape]: [None, 10, 10]

[Layer]: dense_output	[Shape]: [None, 10, 1]

[Layer]: dot_score	[Shape]: [None, 1, 1]

[Layer]: flatten_score	[Shape]: [None, None]



In [13]:
# Split the processed training pack into its two parts (inputs and labels),
# then train for 10 epochs with mini-batches of 32.
train_inputs, train_labels = train_pack_processed.unpack()
model.fit(train_inputs, train_labels, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11b7a3860>

In [14]:
# Score the first ten processed prediction rows. Only the first element
# returned by unpack() (the model inputs) is passed to the model.
predict_sample = predict_pack_processed[:10]
sample_inputs = predict_sample.unpack()[0]
model.predict(sample_inputs)

array([[0.02800608],
       [0.02721103],
       [0.0359645 ],
       [0.02861269],
       [0.02773894],
       [0.04422832],
       [0.02614155],
       [0.02602078],
       [0.0238413 ],
       [0.0248255 ]], dtype=float32)

In [15]:
# Compute the configured metrics. Note that this evaluates on the same
# processed training pack the model was fitted on, so the numbers below are
# training-set scores, not held-out performance.
eval_inputs, eval_labels = train_pack_processed.unpack()
model.evaluate(eval_inputs, eval_labels)



{'loss': 0.03668390703201294,
 'mean_absolute_error': 0.07313306993246078,
 'mean_average_precision(0)': 0.14736887290341238,
 'precision@1(0)': 0.08333333333333333,
 'precision@3(0)': 0.06249999999999999,
 'normalized_discounted_cumulative_gain@1(0)': 0.08333333333333333,
 'normalized_discounted_cumulative_gain@3(0)': 0.12877154152851175,
 'normalized_discounted_cumulative_gain@5(0)': 0.16426982801736845}