In [1]:
import keras
import pandas as pd
import numpy as np
import matchzoo as mz

Using TensorFlow backend.


## Read Data 

In [2]:
def read_data(path, include_label):
    """Load (question, answer) records from a WikiQA-style TSV file.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line must hold seven tab-separated fields: question
    id, question text, two fields that are ignored, answer id, answer text
    and label.

    :param path: Path of the TSV file to read (decoded as UTF-8).
    :param include_label: When true, each record also carries the label
        converted to ``float``; when false the label column is dropped.
    :return: A list of ``(qid, did, q, d, label)`` tuples when
        ``include_label`` is true, otherwise ``(qid, did, q, d)`` tuples.
        An empty file yields an empty list.
    :raises ValueError: If a non-blank row does not split into exactly
        seven fields, or a label cannot be parsed as a float.
    """
    def scan_file():
        # Decode explicitly so the result does not depend on the platform's
        # default locale encoding.
        with open(path, encoding='utf-8') as in_file:
            # Skip the header. The default argument keeps an empty file from
            # raising StopIteration inside this generator (which Python 3.7+
            # turns into a RuntimeError).
            next(in_file, None)
            for line in in_file:
                stripped = line.strip()
                if not stripped:
                    # Blank lines (e.g. a trailing newline at EOF) carry no
                    # record and would otherwise break the 7-way unpacking.
                    continue
                yield stripped.split('\t')

    if include_label:
        return [(qid, did, q, d, float(label))
                for qid, q, _, _, did, d, label in scan_file()]
    return [(qid, did, q, d) for qid, q, _, _, did, d, _ in scan_file()]

# Directory holding the WikiQA TSV splits; edit this single constant to
# point the notebook at a different copy of the corpus.
WIKIQA_DIR = 'data/WikiQA/WikiQACorpus'

# The training split keeps its labels; the test split is read without them
# and is only used for prediction.
train = read_data(WIKIQA_DIR + '/WikiQA-train.tsv', include_label=True)
predict = read_data(WIKIQA_DIR + '/WikiQA-test.tsv', include_label=False)

## Pack Data

In [3]:
# Pack only the first 1000 records of each split
# (presumably to keep the demo quick -- the full lists are not used below).
train_pack = mz.pack(train[:1000])
predict_pack = mz.pack(predict[:1000])

In [4]:
# Preview the left-hand table of the pack: one row per id_left with its text_left.
train_pack.left.head()

Unnamed: 0_level_0,text_left
id_left,Unnamed: 1_level_1
Q1,how are glacier caves formed?
Q2,How are the directions of the velocity and for...
Q5,how did apollo creed die
Q6,how long is the term for federal judges
Q7,how a beretta model 21 pistols magazines works


In [5]:
# Preview the right-hand table of the pack: one row per id_right with its text_right.
train_pack.right.head()

Unnamed: 0_level_0,text_right
id_right,Unnamed: 1_level_1
D1-0,A partly submerged glacier cave on Perito More...
D1-1,The ice facade is approximately 60 m high
D1-2,Ice formations in the Titlis glacier cave
D1-3,A glacier cave is a cave formed within the ice...
D1-4,"Glacier caves are often called ice caves , but..."


In [6]:
# Preview the relation table that pairs id_left with id_right and stores the label.
train_pack.relation.head()

Unnamed: 0,id_left,id_right,label
0,Q1,D1-0,0.0
1,Q1,D1-1,0.0
2,Q1,D1-2,0.0
3,Q1,D1-3,1.0
4,Q1,D1-4,0.0


In [7]:
# Preview the combined view: ids, both texts and the label in a single frame.
train_pack.frame[:].head()

Unnamed: 0,id_left,text_left,id_right,text_right,label
0,Q1,how are glacier caves formed?,D1-0,A partly submerged glacier cave on Perito More...,0.0
1,Q1,how are glacier caves formed?,D1-1,The ice facade is approximately 60 m high,0.0
2,Q1,how are glacier caves formed?,D1-2,Ice formations in the Titlis glacier cave,0.0
3,Q1,how are glacier caves formed?,D1-3,A glacier cave is a cave formed within the ice...,1.0
4,Q1,how are glacier caves formed?,D1-4,"Glacier caves are often called ice caves , but...",0.0


In [8]:
# NOTE: inplace=True mutates train_pack itself, so every later cell that uses
# train_pack also sees the added length_left / length_right columns.
train_pack.append_text_length(inplace=True)
train_pack.frame[:].head()

Processing length_left with len: 100%|██████████| 96/96 [00:00<00:00, 110467.27it/s]
Processing length_right with len: 100%|██████████| 1000/1000 [00:00<00:00, 197230.51it/s]


Unnamed: 0,id_left,text_left,length_left,id_right,text_right,length_right,label
0,Q1,how are glacier caves formed?,29,D1-0,A partly submerged glacier cave on Perito More...,58,0.0
1,Q1,how are glacier caves formed?,29,D1-1,The ice facade is approximately 60 m high,41,0.0
2,Q1,how are glacier caves formed?,29,D1-2,Ice formations in the Titlis glacier cave,41,0.0
3,Q1,how are glacier caves formed?,29,D1-3,A glacier cave is a cave formed within the ice...,61,1.0
4,Q1,how are glacier caves formed?,29,D1-4,"Glacier caves are often called ice caves , but...",128,0.0


In [9]:
# Preview the prediction pack; it was read with include_label=False, so there is no label column.
predict_pack.frame[:].head()

Unnamed: 0,id_left,text_left,id_right,text_right
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-0,African immigration to the United States refer...
1,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-1,The term African in the scope of this article ...
2,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-2,From the Immigration and Nationality Act of 19...
3,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-3,African immigrants in the United States come f...
4,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-4,"They include people from different national, l..."


## Preprocess Data

In [10]:
# Create the DSSM preprocessor and fit it on the training pack only.
# The fitted instance is what later transforms both packs.
preprocessor = mz.preprocessors.DSSMPreprocessor()
preprocessor.fit(train_pack)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 4095.71it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3584.46it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 90748.97it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 229096.79it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 2647762.53it/s]


<matchzoo.preprocessors.dssm_preprocessor.DSSMPreprocessor at 0x116b5cb00>

In [11]:
# Apply the already-fitted preprocessor to both packs so they share one vocabulary.
train_pack_processed = preprocessor.transform(train_pack)
predict_pack_processed = preprocessor.transform(predict_pack)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 96/96 [00:00<00:00, 4766.20it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 1000/1000 [00:00<00:00, 2808.77it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 108/108 [00:00<00:00, 5575.40it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 992/992 [00:00<00:00, 3071.27it/s]


## Generate Data

In [12]:
# Wrap the processed training pack in a generator that serves batches of 32.
train_generator = mz.DataGenerator(train_pack_processed, batch_size=32)

In [13]:
# The generator supports integer, negative and slice indexing.
first_batch = train_generator[0]
last_batch = train_generator[-1]
batch_slice = train_generator[2:5]
# Each batch unpacks into a pair: X (inputs) and y (labels).
X, y = first_batch

In [14]:
# Inputs of the first batch: a dict of arrays keyed by field name.
X

{'id_left': array(['Q99', 'Q1', 'Q39', 'Q18', 'Q17', 'Q97', 'Q44', 'Q120', 'Q65',
        'Q125', 'Q62', 'Q107', 'Q83', 'Q46', 'Q67', 'Q19', 'Q90', 'Q50',
        'Q40', 'Q125', 'Q22', 'Q76', 'Q40', 'Q25', 'Q78', 'Q1', 'Q97',
        'Q70', 'Q26', 'Q23', 'Q89', 'Q107'], dtype='<U4'),
 'text_left': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'length_left': array([22, 29, 56, 25, 42, 36, 44, 36, 42, 41, 36, 46, 28, 31, 34, 70, 20,
        38, 42, 41, 51, 32, 42, 31, 59, 29, 36, 34, 22, 35, 29, 46]),
 'id_right': array(['D99-10', 'D1-2', 'D39-8', 'D18-3', 'D17-12', 'D97-19', 'D44-16',
        'D120-11', 'D65-14', 'D125-3', 'D62-2', 'D107-15', 'D83-4',
        'D46-5', 'D67-1', 'D19-10', 'D90-11', 'D50-20', 'D40-20', 'D125-4',
        'D22-4', 'D76-2', 'D40-19', 'D25-10', 'D78-0', 'D1-3',

In [15]:
# Labels of the first batch (one value per row of the batch).
y

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [16]:
# A slice of the generator unpacks like a single batch; here the three
# selected batches of 32 rows give 96 labels. Note that X and y are rebound.
X, y = batch_slice
y.shape

(96,)

## Model Building

In [17]:
# Ranking task evaluated with MAE, MAP, precision (default cut-off and k=3),
# plus DCG and NDCG at cut-offs 1, 3 and 5.
ranking_task = mz.tasks.Ranking()
ranking_task.metrics = (
    ['mae', 'map', 'precision', mz.metrics.Precision(k=3)]
    + [mz.metrics.DiscountedCumulativeGain(k=cutoff) for cutoff in (1, 3, 5)]
    + [mz.metrics.NormalizedDiscountedCumulativeGain(k=cutoff) for cutoff in (1, 3, 5)]
)
ranking_task.metrics

['mae',
 mean_average_precision(0),
 precision@1(0),
 precision@3(0),
 discounted_cumulative_gain@1(0),
 discounted_cumulative_gain@3(0),
 discounted_cumulative_gain@5(0),
 normalized_discounted_cumulative_gain@1(0),
 normalized_discounted_cumulative_gain@3(0),
 normalized_discounted_cumulative_gain@5(0)]

In [18]:
# Configure the DSSM model: attach the ranking task, reuse the input shapes
# stored in the fitted preprocessor's context, fill in whatever parameters
# are still missing, then build and compile.
model = mz.models.DSSMModel()
model.params['task'] = ranking_task
model.params['input_shapes'] = preprocessor.context['input_shapes']
model.guess_and_fill_missing_params()
model.build()
model.compile()

## Model Training

In [19]:
# Train for 2 epochs directly on the arrays returned by unpack().
model.fit(*train_pack_processed.unpack(), epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1171ff6d8>

In [20]:
# Continue training the same model for 2 more epochs, this time fed by the
# batch generator through 4 multiprocessing workers.
model.fit_generator(train_generator, epochs=2, workers=4, use_multiprocessing=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x117204b38>

### Training with Dynamic Data Generator

In [21]:
# Preprocess without the word-hashing step, so the stored pack keeps the
# un-hashed representation.
pp_no_hashing = mz.preprocessors.DSSMPreprocessor(with_word_hashing=False)
train_pack_processed_no_hashing = pp_no_hashing.fit_transform(train_pack)
# Build the hashing unit separately, from the term index of the preprocessor fitted just above.
word_hashing_unit = mz.processor_units.WordHashingUnit(pp_no_hashing.context['term_index'])
# The generator receives the hashing function together with the un-hashed pack;
# presumably it applies the function when batches are produced -- confirm
# against the DynamicDataGenerator documentation.
dynamic_data_generator = mz.data_generator.DynamicDataGenerator(word_hashing_unit.transform,
                                                                train_pack_processed_no_hashing)
# Continues training the existing `model` for another 2 epochs.
model.fit_generator(dynamic_data_generator, epochs=2, workers=4, use_multiprocessing=True)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 3043.07it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3349.26it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 109060.99it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 385081.16it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 2278002.93it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 6783.58it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3445.69it

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x117126da0>

## Model Predicting 

In [22]:
# Score the first 10 rows of the processed prediction pack; element 0 of
# unpack() is what gets passed to the model as input.
model.predict(predict_pack_processed[:10].unpack()[0])

array([[ 0.1515378 ],
       [ 0.02516037],
       [ 0.0464203 ],
       [ 0.12370427],
       [-0.00847982],
       [ 0.17810191],
       [ 0.17945188],
       [ 0.14769161],
       [ 0.15018097],
       [ 0.15429482]], dtype=float32)

## Model Evaluating

In [23]:
# NOTE: this evaluates on the training pack (the prediction pack was read
# without labels), so the metrics below are not a held-out estimate.
model.evaluate(*train_pack_processed.unpack())



{'loss': 0.014845640614628793,
 'mean_absolute_error': 0.047846145391464236,
 'mean_average_precision(0)': 0.3333333333333333,
 'precision@1(0)': 0.3333333333333333,
 'precision@3(0)': 0.13194444444444445,
 'discounted_cumulative_gain@1(0)': 0.4808983469629878,
 'discounted_cumulative_gain@3(0)': 0.5338530547483661,
 'discounted_cumulative_gain@5(0)': 0.5338530547483661,
 'normalized_discounted_cumulative_gain@1(0)': 0.3333333333333333,
 'normalized_discounted_cumulative_gain@3(0)': 0.3333333333333333,
 'normalized_discounted_cumulative_gain@5(0)': 0.3333333333333333}

## Misc

### Load Embedding Matrix

In [29]:
import keras.layers
from matchzoo import engine

class SimpleEmbeddingModel(engine.BaseModel):
    """Minimal model containing an embedding layer.

    The two text inputs are concatenated, passed through an embedding layer
    named ``'embedding'`` and a 32-unit ReLU dense layer, and finally through
    the layer returned by ``_make_output_layer()``.
    """

    @classmethod
    def get_default_params(cls) -> engine.ParamTable:
        """:return: model default parameters, requested with ``with_embedding=True``."""
        return super().get_default_params(with_embedding=True)

    def build(self):
        """Assemble the Keras graph and store it in ``self._backend``."""
        input_names = ['text_left', 'text_right']
        # One Input per text side, paired with the configured input shapes.
        x_in = [keras.layers.Input(name=name, shape=shape)
                for name, shape in zip(input_names, self._params['input_shapes'])]
        x = keras.layers.concatenate(x_in)
        # Use the public `keras.layers.Embedding` entry point instead of the
        # `keras.layers.embeddings` submodule, which is an implementation
        # detail of the package layout. 'embedding_shape' is unpacked into
        # the layer's first two positional arguments.
        x = keras.layers.Embedding(*self._params['embedding_shape'], name='embedding')(x)
        x = keras.layers.Dense(32, activation='relu')(x)
        x_out = self._make_output_layer()(x)
        self._backend = keras.models.Model(inputs=x_in, outputs=x_out)

In [30]:
# NOTE: this rebinds `model`; up to this point the name referred to the DSSM
# model trained above.
model = SimpleEmbeddingModel()
model.params['task'] = ranking_task
# Embedding table: (size of the fitted term index + 1) rows by 300 columns.
# The extra row presumably reserves an index for padding/unknown -- TODO confirm.
model.params['embedding_shape'] = (len(preprocessor.context['term_index']) + 1, 300)
model.guess_and_fill_missing_params()
model.build()
model.compile()

In [31]:
vocab = preprocessor.context['term_index']
# Placeholder weights: uniform values in [-0.2, 0.2) with the same shape as
# the model's embedding table. A dedicated, seeded generator makes the matrix
# identical on every Restart & Run All without touching NumPy's global state.
embedding_rng = np.random.RandomState(42)
embedding_matrix = embedding_rng.uniform(-0.2, 0.2, model.params['embedding_shape'])
model.load_embedding_matrix(embedding_matrix)