In [1]:
import keras
import pandas as pd
import numpy as np
import matchzoo as mz

Using TensorFlow backend.


## Read Data 

In [2]:
def read_data(path, include_label, encoding='utf-8'):
    """Read a tab-separated, 7-column file into a list of tuples.

    The first line is treated as a header and skipped. Of the seven columns,
    the 1st, 2nd, 5th, 6th and 7th are used (bound below as qid, q, did, d,
    label); the 3rd and 4th are ignored.

    :param path: Path of the TSV file to read.
    :param include_label: If true, each tuple is
        ``(qid, did, q, d, float(label))``; otherwise ``(qid, did, q, d)``.
    :param encoding: Text encoding used to open the file. Given explicitly so
        the result does not depend on the platform's default encoding.
    :return: List of tuples, one per non-blank data line, in file order.
        An empty file yields an empty list.
    :raises ValueError: If a data line does not have exactly 7 columns, or if
        a label cannot be converted to float.
    """
    expected_columns = 7

    def scan_file():
        with open(path, encoding=encoding) as in_file:
            # Skip the header. The default avoids StopIteration escaping from
            # inside a generator, which surfaces as RuntimeError (PEP 479).
            next(in_file, None)
            for line_no, line in enumerate(in_file, start=2):
                stripped = line.strip()
                if not stripped:
                    continue  # tolerate blank lines, e.g. a trailing newline
                fields = stripped.split('\t')
                if len(fields) != expected_columns:
                    raise ValueError(
                        '%s:%d: expected %d tab-separated columns, got %d'
                        % (path, line_no, expected_columns, len(fields)))
                yield fields

    if include_label:
        return [(qid, did, q, d, float(label))
                for qid, q, _, _, did, d, label in scan_file()]
    return [(qid, did, q, d)
            for qid, q, _, _, did, d, _ in scan_file()]

# Both splits are read from the same corpus directory; labels are kept for
# the training split and dropped for the test split.
wikiqa_dir = 'data/WikiQA/WikiQACorpus/'
train = read_data(wikiqa_dir + 'WikiQA-train.tsv', include_label=True)
predict = read_data(wikiqa_dir + 'WikiQA-test.tsv', include_label=False)

## Pack Data

In [3]:
# Only the first `sample_limit` rows of each split are packed.
sample_limit = 1000
train_pack = mz.pack(train[:sample_limit])
predict_pack = mz.pack(predict[:sample_limit])

## Preprocess Data

In [4]:
# Create the DSSM preprocessor and fit it on the training pack. The fit log
# printed below includes a "Fitting vocabulary unit" step.
preprocessor = mz.preprocessors.DSSMPreprocessor()
# Last expression of the cell: the notebook displays whatever fit() returns.
preprocessor.fit(train_pack)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 3241.06it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3947.95it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 137942.17it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 293082.52it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 2895744.78it/s]


<matchzoo.preprocessors.dssm_preprocessor.DSSMPreprocessor at 0x117e8b780>

In [5]:
# Apply the fitted preprocessor to the training pack first, then to the
# prediction pack.
train_pack_processed, predict_pack_processed = [
    preprocessor.transform(pack) for pack in (train_pack, predict_pack)
]

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 96/96 [00:00<00:00, 4425.20it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3074.30it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 108/108 [00:00<00:00, 5678.21it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 992/992 [00:00<00:00, 3256.48it/s]


## Generate Data

In [6]:
# Wrap the processed training pack in a data generator configured with
# batch_size=32; it is passed to model.fit_generator further down.
train_generator = mz.DataGenerator(train_pack_processed, batch_size=32)

## Model Building

In [7]:
# Ranking task whose metrics mix string aliases with metric objects.
ranking_task = mz.tasks.Ranking()
gain_cutoffs = (1, 3, 5)  # k values shared by the DCG and NDCG metrics
ranking_task.metrics = (
    ['mae', 'map', 'precision', mz.metrics.Precision(k=3)]
    + [mz.metrics.DiscountedCumulativeGain(k=k) for k in gain_cutoffs]
    + [mz.metrics.NormalizedDiscountedCumulativeGain(k=k) for k in gain_cutoffs]
)
# Display the metrics as stored on the task.
ranking_task.metrics

['mae',
 mean_average_precision(0),
 precision@1(0),
 precision@3(0),
 discounted_cumulative_gain@1(0),
 discounted_cumulative_gain@3(0),
 discounted_cumulative_gain@5(0),
 normalized_discounted_cumulative_gain@1(0),
 normalized_discounted_cumulative_gain@3(0),
 normalized_discounted_cumulative_gain@5(0)]

In [8]:
# Configure a DSSM model for the ranking task, then build and compile it.
model = mz.models.DSSMModel()
model.params['task'] = ranking_task
# Input shapes are taken from the context of the fitted preprocessor.
model.params['input_shapes'] = preprocessor.context['input_shapes']
# Presumably fills every parameter not set above -- confirm against the
# MatchZoo documentation.
model.guess_and_fill_missing_params()
model.build()
model.compile()

## Model Training

In [9]:
# Train for 2 epochs on the processed training pack: whatever unpack()
# returns is splatted into fit()'s positional arguments (presumably inputs
# and labels -- TODO confirm).
model.fit(*train_pack_processed.unpack(), epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x117e82da0>

In [10]:
# Call fit_generator on the same model object that was fitted above, this time
# fed by train_generator with 4 workers and multiprocessing enabled.
model.fit_generator(train_generator, epochs=2, workers=4, use_multiprocessing=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x122210550>

### Training with Dynamic Data Generator

In [11]:
# Preprocess with word hashing disabled, and hand the hashing transform to
# DynamicDataGenerator instead (presumably so hashing is applied while batches
# are generated -- TODO confirm).
pp_no_hashing = mz.preprocessors.DSSMPreprocessor(with_word_hashing=False)
train_pack_processed_no_hashing = pp_no_hashing.fit_transform(train_pack)
# The hashing unit is built from the term index found in pp_no_hashing's context.
word_hashing_unit = mz.processor_units.WordHashingUnit(pp_no_hashing.context['term_index'])
dynamic_data_generator = mz.data_generator.DynamicDataGenerator(word_hashing_unit.transform,
                                                                train_pack_processed_no_hashing)
model.fit_generator(dynamic_data_generator, epochs=2, workers=4, use_multiprocessing=True)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 4224.23it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3812.05it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 137989.44it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 338059.48it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 2660636.10it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 8476.20it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 4042.41it

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x117e9c550>

## Model Prediction

In [12]:
# Score the first 10 entries of the processed prediction pack; only element 0
# of unpack()'s result is passed to predict().
model.predict(predict_pack_processed[:10].unpack()[0])

array([[-0.82203054],
       [-0.5365823 ],
       [-0.64885724],
       [-0.706661  ],
       [-0.5631074 ],
       [-0.7180826 ],
       [-0.64169425],
       [-0.6807523 ],
       [-0.74259025],
       [-0.4627969 ]], dtype=float32)

## Model Evaluation

In [13]:
# NOTE(review): this evaluates on the same processed training pack that was
# passed to model.fit above, so the metrics are not a held-out estimate.
model.evaluate(*train_pack_processed.unpack())



{'loss': 0.008125047847628594,
 'mean_absolute_error': 0.06410697138309479,
 'mean_average_precision(0)': 0.3333333333333333,
 'precision@1(0)': 0.3333333333333333,
 'precision@3(0)': 0.13194444444444445,
 'discounted_cumulative_gain@1(0)': 0.4808983469629878,
 'discounted_cumulative_gain@3(0)': 0.5338530547483661,
 'discounted_cumulative_gain@5(0)': 0.5338530547483661,
 'normalized_discounted_cumulative_gain@1(0)': 0.3333333333333333,
 'normalized_discounted_cumulative_gain@3(0)': 0.3333333333333333,
 'normalized_discounted_cumulative_gain@5(0)': 0.3333333333333333}

## Misc

### Load Embedding Matrix

In [14]:
import keras.layers
from matchzoo import engine

class SimpleEmbeddingModel(engine.BaseModel):
    """Small model used to demonstrate loading an embedding matrix.

    The two text inputs are concatenated, fed through an embedding layer named
    ``'embedding'`` and a 32-unit ReLU dense layer, and finally through the
    layer returned by ``_make_output_layer()``.
    """

    @classmethod
    def get_default_params(cls) -> engine.ParamTable:
        """:return: model default parameters, requested with ``with_embedding=True``."""
        return super().get_default_params(with_embedding=True)

    def build(self):
        """Model structure.

        Reads ``input_shapes`` and ``embedding_shape`` from the model params
        and stores the resulting Keras model on ``self._backend``.
        """
        input_names = ['text_left', 'text_right']
        x_in = [keras.layers.Input(name=name, shape=shape)
                for name, shape in zip(input_names,
                                       self._params['input_shapes'])]
        x = keras.layers.concatenate(x_in)
        # Use the public `keras.layers.Embedding` entry point rather than the
        # `keras.layers.embeddings` submodule, which is an implementation
        # detail of the package layout.
        x = keras.layers.Embedding(*self._params['embedding_shape'],
                                   name='embedding')(x)
        x = keras.layers.Dense(32, activation='relu')(x)
        x_out = self._make_output_layer()(x)
        self._backend = keras.models.Model(inputs=x_in, outputs=x_out)

In [15]:
# Build and compile a SimpleEmbeddingModel. Note that this rebinds `model`,
# so the name no longer refers to the DSSM model used in the earlier cells.
model = SimpleEmbeddingModel()
model.params['task'] = ranking_task
# Embedding shape: (number of entries in the preprocessor's term index + 1, 300).
# The +1 presumably reserves one extra index -- TODO confirm.
model.params['embedding_shape'] = (len(preprocessor.context['term_index']) + 1, 300)
model.guess_and_fill_missing_params()
model.build()
model.compile()

In [16]:
vocab = preprocessor.context['term_index']
# Random stand-in for a real embedding: uniform values in [-0.2, 0.2) with the
# shape the model was configured with.
embedding_shape = model.params['embedding_shape']
embedding_matrix = np.random.uniform(low=-0.2, high=0.2, size=embedding_shape)
model.load_embedding_matrix(embedding_matrix)

## Automation: Hyper-parameter Search with Director

In [17]:
# Shuffle the training pack before it is split in the next cell.
# NOTE(review): inplace=True mutates train_pack itself, so re-running this
# cell shuffles the already-shuffled pack again.
train_pack.shuffle(inplace=True)

In [18]:
# Split point at 90% of the pack (truncated to an int): everything before it
# becomes the training part, the rest the test part.
divider = int(len(train_pack) * 0.9)
train_pack_divided, test_pack_divided = train_pack[:divider], train_pack[divider:]

In [19]:
def make_named_dssm(name):
    """Create a DSSMModel and store `name` under its 'name' param."""
    dssm = mz.models.DSSMModel()
    dssm.params['name'] = name
    return dssm


dssm_model_a = make_named_dssm('Model A')
dssm_model_b = make_named_dssm('Model B')

In [20]:
director = mz.Director()
# Settings are applied in the listed order.
director_settings = (
    ('models', [dssm_model_a, dssm_model_b]),
    ('evals_per_model', 5),
    ('train_pack', train_pack_divided),
    ('test_pack', test_pack_divided),
    ('task', ranking_task),
)
for param_name, param_value in director_settings:
    director.params[param_name] = param_value

In [21]:
# Run the director. The result is iterated per model and then per trial in
# the next cell.
all_trials = director.action()

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 84/84 [00:00<00:00, 3651.57it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 900/900 [00:00<00:00, 3566.26it/s]
Processing text_left with extend: 100%|██████████| 84/84 [00:00<00:00, 70889.65it/s]
Processing text_right with extend: 100%|██████████| 900/900 [00:00<00:00, 353386.41it/s]
Fitting vocabulary unit.: 100%|██████████| 75505/75505 [00:00<00:00, 2654709.57it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 84/84 [00:00<00:00, 5305.09it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 

Epoch 1/1


tpe_transform took 0.000451 seconds
TPE using 1/1 trials with best loss 0.419784


Epoch 1/1


tpe_transform took 0.000815 seconds
TPE using 2/2 trials with best loss 0.076754


Epoch 1/1


tpe_transform took 0.000540 seconds
TPE using 3/3 trials with best loss 0.076754


Epoch 1/1


tpe_transform took 0.000411 seconds
TPE using 4/4 trials with best loss 0.076754


Epoch 1/1


Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 84/84 [00:00<00:00, 7914.85it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 900/900 [00:00<00:00, 2603.80it/s]
Processing text_left with extend: 100%|██████████| 84/84 [00:00<00:00, 118347.85it/s]
Processing text_right with extend: 100%|██████████| 900/900 [00:00<00:00, 244819.61it/s]
Fitting vocabulary unit.: 100%|██████████| 75505/75505 [00:00<00:00, 2221114.33it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 84/84 [00:00<00:00, 5090.32it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████|

Epoch 1/1


tpe_transform took 0.000458 seconds
TPE using 1/1 trials with best loss 0.068965


Epoch 1/1


tpe_transform took 0.000487 seconds
TPE using 2/2 trials with best loss 0.068965


Epoch 1/1


tpe_transform took 0.000477 seconds
TPE using 3/3 trials with best loss 0.068965


Epoch 1/1


tpe_transform took 0.000478 seconds
TPE using 4/4 trials with best loss 0.068965


Epoch 1/1


In [22]:
# Walk every trial of every model in order and print its model name, the
# sampled hyper-parameters and the loss, followed by a blank line.
for trial in (t for trials_of_model in all_trials for t in trials_of_model):
    model_name = trial['model_params']['name']
    sampled = trial['sampled_params']
    loss = trial['loss']
    print('Model Name:', model_name)
    print("Hyper Parameter Sampled:", sampled)
    print('Loss: ', loss)
    print()

Model Name: Model A
Hyper Parameter Sampled: {'dim_hidden': 484.0}
Loss:  0.41978442668914795

Model Name: Model A
Hyper Parameter Sampled: {'dim_hidden': 467.0}
Loss:  0.07675425708293915

Model Name: Model A
Hyper Parameter Sampled: {'dim_hidden': 317.0}
Loss:  0.2551075220108032

Model Name: Model A
Hyper Parameter Sampled: {'dim_hidden': 136.0}
Loss:  0.17611287534236908

Model Name: Model A
Hyper Parameter Sampled: {'dim_hidden': 319.0}
Loss:  0.056967247277498245

Model Name: Model B
Hyper Parameter Sampled: {'dim_hidden': 398.0}
Loss:  0.06896458566188812

Model Name: Model B
Hyper Parameter Sampled: {'dim_hidden': 428.0}
Loss:  0.4359401762485504

Model Name: Model B
Hyper Parameter Sampled: {'dim_hidden': 428.0}
Loss:  0.07799652218818665

Model Name: Model B
Hyper Parameter Sampled: {'dim_hidden': 487.0}
Loss:  0.2389742136001587

Model Name: Model B
Hyper Parameter Sampled: {'dim_hidden': 96.0}
Loss:  0.211103618144989

