In [1]:
import keras
import pandas as pd
import numpy as np
import matchzoo as mz

Using TensorFlow backend.


## Read Data 

In [2]:
def read_data(path, include_label, encoding='utf-8'):
    """Load question / answer-sentence rows from a WikiQA-style TSV file.

    The first line is treated as a header and skipped.  Every remaining
    non-blank line must hold seven tab-separated columns, which are unpacked
    positionally as ``qid, question, _, _, did, sentence, label`` (the third
    and fourth columns are ignored).

    Args:
        path: Path of the TSV file to read.
        include_label: When true, the last column is converted with ``float``
            and appended to every returned tuple; when false that column is
            ignored (and may be empty).
        encoding: Text encoding used to decode the file.  Defaults to UTF-8
            instead of the platform locale so results do not vary by machine.

    Returns:
        A list of ``(qid, did, question, sentence, label)`` tuples when
        ``include_label`` is true, otherwise a list of
        ``(qid, did, question, sentence)`` tuples.  An empty file yields ``[]``.

    Raises:
        ValueError: If a row does not have exactly seven columns, or a label
            cannot be parsed as a float.
    """
    def scan_file():
        with open(path, encoding=encoding) as in_file:
            # Skip the header.  The default keeps an empty file from raising
            # StopIteration inside this generator (a RuntimeError since PEP 479).
            next(in_file, None)
            for line in in_file:
                # Remove only the line terminator: a bare strip() would also eat
                # tabs and thereby drop empty leading/trailing columns.
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue  # ignore blank lines, e.g. a trailing newline
                yield line.split('\t')

    if include_label:
        return [(qid, did, q, d, float(label))
                for qid, q, _, _, did, d, label in scan_file()]
    return [(qid, did, q, d) for qid, q, _, _, did, d, _ in scan_file()]

# Labelled rows used for training / validation.
train = read_data(
    'data/WikiQA/WikiQACorpus/WikiQA-train.tsv',
    include_label=True
)
# Rows used for prediction only; the label column is dropped.
predict = read_data(
    'data/WikiQA/WikiQACorpus/WikiQA-test.tsv',
    include_label=False
)

## Preprocess Data

In [3]:
# Rows [0, 1000) of the labelled data become the training pack and rows
# [1000, 2000) the validation pack; only the first 1000 prediction rows are packed.
train_pack, valid_pack = (
    mz.pack(train[start:stop])
    for start, stop in ((None, 1000), (1000, 2000))
)
predict_pack = mz.pack(predict[:1000])

In [4]:
# Create the DSSM preprocessor and fit it on the training pack only.
# fit() is deliberately the last bare expression so the cell displays its
# return value (the preprocessor repr in the recorded output).
preprocessor = mz.preprocessors.DSSMPreprocessor()
preprocessor.fit(train_pack)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 3605.06it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 2858.04it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 97565.59it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 208817.29it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 1775844.12it/s]


<matchzoo.preprocessors.dssm_preprocessor.DSSMPreprocessor at 0x121d1d748>

In [5]:
# Apply the fitted preprocessor to each pack, in the order train -> valid -> predict.
train_pack_processed, valid_pack_processed, predict_pack_processed = (
    preprocessor.transform(pack)
    for pack in (train_pack, valid_pack, predict_pack)
)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 96/96 [00:00<00:00, 3866.16it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 1000/1000 [00:00<00:00, 2274.07it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 101/101 [00:00<00:00, 4319.75it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 988/988 [00:00<00:00, 2219.14it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 108/108 [00:0

## Generate Data

In [6]:
# Pair-wise batch generator over the processed training pack.
train_generator = mz.PairDataGenerator(
    train_pack_processed,
    num_dup=5,
    num_neg=1,
    batch_size=32
)

## Model Building

In [7]:
# Ranking task optimised with RankHingeLoss (num_neg=1, margin=1.0).
ranking_task = mz.tasks.Ranking(
    loss=mz.losses.RankHingeLoss(num_neg=1, margin=1.0)
)

# Metrics attached to the task: three given by name, precision@3, and
# DCG / NDCG at cut-offs 1, 3 and 5 (same order as listing them one by one).
ranking_task.metrics = (
    ['mae', 'map', 'precision', mz.metrics.Precision(k=3)]
    + [mz.metrics.DiscountedCumulativeGain(k=k) for k in (1, 3, 5)]
    + [mz.metrics.NormalizedDiscountedCumulativeGain(k=k) for k in (1, 3, 5)]
)

# A notebook cell renders only its final expression, so a bare
# `ranking_task.metrics` placed before this line would display nothing.
# Use display(ranking_task.metrics) if the metric list should be shown too.
ranking_task.loss

<matchzoo.losses.rank_hinge_loss.RankHingeLoss at 0x1229bfa58>

In [8]:
# Instantiate DSSM, give it the ranking task and the input shapes stored in the
# fitted preprocessor's context, then let MatchZoo fill the remaining
# parameters before building and compiling. The call order is kept as-is.
model = mz.models.DSSMModel()
model.params['task'] = ranking_task
model.params['input_shapes'] = preprocessor.context['input_shapes']
model.guess_and_fill_missing_params()  # logs what it sets (recorded output: "name")
model.build()
model.compile()

Parameter "name" set to DSSMModel.


## Model Training

In [9]:
# Unpack the processed validation pack into the (x, y) pair that is handed to
# the EvaluateOnCall callback below.
val_x, val_y = valid_pack_processed.unpack()

In [10]:
# Callback that scores the model on the validation data with the task metrics.
# NOTE(review): the recorded logs show validation on epochs 1, 4 and 7, which
# suggests valid_steps=3 means "every third epoch" -- confirm against MatchZoo.
evaluate = model.EvaluateOnCall(
    x=val_x,
    y=val_y,
    valid_steps=3,
    metrics=ranking_task.metrics,
    batch_size=32
)

In [11]:
# Train directly on the unpacked (x, y) of the processed training pack,
# running the validation callback during training.
model.fit(
    *train_pack_processed.unpack(),
    epochs=10,
    batch_size=32,
    callbacks=[evaluate]
)

Epoch 1/10
Validation: loss:1.011835 - mean_absolute_error:1.426251 - mean_average_precision(0):0.110806 - precision@1(0):0.019802 - precision@3(0):0.052805 - discounted_cumulative_gain@1(0):0.028568 - discounted_cumulative_gain@3(0):0.139778 - discounted_cumulative_gain@5(0):0.209218 - normalized_discounted_cumulative_gain@1(0):0.019802 - normalized_discounted_cumulative_gain@3(0):0.079349 - normalized_discounted_cumulative_gain@5(0):0.115826
Epoch 2/10
Epoch 3/10
Epoch 4/10
Validation: loss:0.999912 - mean_absolute_error:1.631917 - mean_average_precision(0):0.150054 - precision@1(0):0.079208 - precision@3(0):0.079208 - discounted_cumulative_gain@1(0):0.114273 - discounted_cumulative_gain@3(0):0.245378 - discounted_cumulative_gain@5(0):0.303766 - normalized_discounted_cumulative_gain@1(0):0.079208 - normalized_discounted_cumulative_gain@3(0):0.134400 - normalized_discounted_cumulative_gain@5(0):0.163959
Epoch 5/10
Epoch 6/10
Epoch 7/10
Validation: loss:1.000051 - mean_absolute_error:1

<keras.callbacks.History at 0x1230e74e0>

In [12]:
# Continue training from the pair generator, reusing the same validation
# callback; batches are produced by 4 workers with multiprocessing enabled.
model.fit_generator(
    train_generator,
    epochs=5,
    callbacks=[evaluate],
    workers=4,
    use_multiprocessing=True
)

Epoch 1/5
Validation: loss:1.000014 - mean_absolute_error:1.640395 - mean_average_precision(0):0.155893 - precision@1(0):0.089109 - precision@3(0):0.085809 - discounted_cumulative_gain@1(0):0.128557 - discounted_cumulative_gain@3(0):0.263063 - discounted_cumulative_gain@5(0):0.309774 - normalized_discounted_cumulative_gain@1(0):0.089109 - normalized_discounted_cumulative_gain@3(0):0.138787 - normalized_discounted_cumulative_gain@5(0):0.167775
Epoch 2/5
Epoch 3/5
Epoch 4/5
Validation: loss:0.999956 - mean_absolute_error:1.610086 - mean_average_precision(0):0.158533 - precision@1(0):0.099010 - precision@3(0):0.082508 - discounted_cumulative_gain@1(0):0.142841 - discounted_cumulative_gain@3(0):0.263063 - discounted_cumulative_gain@5(0):0.316552 - normalized_discounted_cumulative_gain@1(0):0.099010 - normalized_discounted_cumulative_gain@3(0):0.142804 - normalized_discounted_cumulative_gain@5(0):0.171310
Epoch 5/5


<keras.callbacks.History at 0x1230e9eb8>

### Training with Dynamic Data Generator

In [14]:
# Preprocess without the word-hashing step ...
pp_no_hashing = mz.preprocessors.DSSMPreprocessor(with_word_hashing=False)
train_pack_processed_no_hashing = pp_no_hashing.fit_transform(train_pack)

# ... and build a standalone hashing unit from the fitted vocabulary instead.
word_hashing_unit = mz.processor_units.WordHashingUnit(
    pp_no_hashing.context['vocab_unit'].state['term_index']
)

# The generator receives the hashing transform together with the un-hashed pack.
# NOTE(review): presumably the transform is applied as batches are produced -- confirm.
dynamic_data_generator = mz.data_generator.DynamicDataGenerator(
    word_hashing_unit.transform,
    train_pack_processed_no_hashing
)
model.fit_generator(
    dynamic_data_generator,
    epochs=2,
    workers=4,
    use_multiprocessing=True
)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 5329.13it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 2799.44it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 108942.96it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 309200.44it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 1691129.58it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 5307.36it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 2840.66it

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11fecee48>

## Model Predicting 

In [15]:
# Score the first 10 rows of the processed prediction pack; element [0] of
# unpack() is the model input (the second element is not used here).
model.predict(predict_pack_processed[:10].unpack()[0])

array([[-0.46837702],
       [-0.4721158 ],
       [-0.4679667 ],
       [-0.471246  ],
       [-0.47275394],
       [-0.46788743],
       [ 0.470007  ],
       [ 0.4704158 ],
       [ 0.47078598],
       [ 0.46921322]], dtype=float32)

## Model Evaluating

In [16]:
# NOTE: this evaluates on the processed *training* pack, not on the held-out
# validation pack, so the reported numbers are training-set metrics.
model.evaluate(*train_pack_processed.unpack())



{'loss': 1.0000682692527771,
 'mean_absolute_error': 0.4665798506736755,
 'mean_average_precision(0)': 0.16003573972323973,
 'precision@1(0)': 0.10416666666666667,
 'precision@3(0)': 0.06944444444444443,
 'discounted_cumulative_gain@1(0)': 0.15028073342593368,
 'discounted_cumulative_gain@3(0)': 0.23722683177529777,
 'discounted_cumulative_gain@5(0)': 0.27474309210176595,
 'normalized_discounted_cumulative_gain@1(0)': 0.10416666666666667,
 'normalized_discounted_cumulative_gain@3(0)': 0.14227239476773954,
 'normalized_discounted_cumulative_gain@5(0)': 0.16827668483818384}

## Automation

In [17]:
# Shuffles train_pack in place: re-running this cell reshuffles again, and the
# split in the following cell depends on the resulting order.
# NOTE(review): no seed is passed here -- confirm whether the split needs to be reproducible.
train_pack.shuffle(inplace=True)

In [18]:
# 90% / 10% split of the (shuffled) training pack.
divider = int(0.9 * len(train_pack))
train_pack_divided, test_pack_divided = train_pack[:divider], train_pack[divider:]

In [19]:
director = mz.Director()

# One fresh instance of every model class to tune, in this order.
director.params['models'] = [
    model_class()
    for model_class in (
        mz.models.DenseBaselineModel,
        mz.models.DSSMModel,
        mz.models.ArcIModel,
        mz.models.KNRMModel,
        mz.models.ConvKNRMModel,
        mz.models.DUETModel,
    )
]

# Three evaluations per model, on the 90/10 split, for the shared ranking task.
director.params['evals_per_model'] = 3
director.params['train_pack'] = train_pack_divided
director.params['test_pack'] = test_pack_divided
director.params['task'] = ranking_task

In [20]:
# Run the director over all configured models; with verbose=1 it logs the
# start and finish of each model (see the recorded output). The returned
# trials are iterated in the next cell.
trials = director.action(verbose=1)

Start tunning model #1 (total: 6).
Model class: <class 'matchzoo.models.dense_baseline_model.DenseBaselineModel'>
Finish tuning model #1 (total: 6)

Start tunning model #2 (total: 6).
Model class: <class 'matchzoo.models.dssm_model.DSSMModel'>
Finish tuning model #2 (total: 6)

Start tunning model #3 (total: 6).
Model class: <class 'matchzoo.models.arci_model.ArcIModel'>
Finish tuning model #3 (total: 6)

Start tunning model #4 (total: 6).
Model class: <class 'matchzoo.models.knrm_model.KNRMModel'>
Finish tuning model #4 (total: 6)

Start tunning model #5 (total: 6).
Model class: <class 'matchzoo.models.conv_knrm_model.ConvKNRMModel'>
Finish tuning model #5 (total: 6)

Start tunning model #6 (total: 6).
Model class: <class 'matchzoo.models.duet_model.DUETModel'>
Finish tuning model #6 (total: 6)



In [21]:
# Print the model name, sampled hyper-parameters and loss of every trial,
# followed by one empty line between trials.
for trial in trials:
    print('Model Name:', trial['model_params']['name'])
    print("Hyper Parameter Sampled:", trial['sampled_params'])
    print('Loss:', trial['loss'], end='\n\n')

Model Name: DenseBaselineModel
Hyper Parameter Sampled: {'num_dense_layers': 4.0, 'num_dense_units': 885.0}
Loss: 62.716712951660156

Model Name: DenseBaselineModel
Hyper Parameter Sampled: {'num_dense_layers': 2.0, 'num_dense_units': 427.0}
Loss: 137.76612854003906

Model Name: DenseBaselineModel
Hyper Parameter Sampled: {'num_dense_layers': 4.0, 'num_dense_units': 858.0}
Loss: 56.304771423339844

Model Name: DSSMModel
Hyper Parameter Sampled: {'dim_hidden': 378.0}
Loss: 1.0136383771896362

Model Name: DSSMModel
Hyper Parameter Sampled: {'dim_hidden': 93.0}
Loss: 1.0372545719146729

Model Name: DSSMModel
Hyper Parameter Sampled: {'dim_hidden': 285.0}
Loss: 1.0003693103790283

Model Name: ArcIModel
Hyper Parameter Sampled: {'dropout_rate': 0.38, 'optimizer': 'rmsprop', 'padding': 'same'}
Loss: 0.9797305464744568

Model Name: ArcIModel
Hyper Parameter Sampled: {'dropout_rate': 0.1, 'optimizer': 'adam', 'padding': 'valid'}
Loss: 0.9923397898674011

Model Name: ArcIModel
Hyper Parameter S

## Misc

### Load Embedding Matrix

In [22]:
# Rebinds `model`: from here on it refers to a new ArcI model, not the DSSM one.
model = mz.models.ArcIModel()
model.params['task'] = ranking_task
# Vocabulary taken from the preprocessor fitted earlier in the notebook.
term_index = preprocessor.context['vocab_unit'].state['term_index']
# NOTE(review): the +1 presumably reserves one extra embedding row (e.g. padding / OOV) -- confirm.
model.params['embedding_input_dim'] = len(term_index) + 1
model.guess_and_fill_missing_params()  # logs the parameters it fills in (see recorded output)
model.build()
model.compile()

Parameter "name" set to ArcIModel.
Parameter "input_shapes" set to [(30,), (30,)].
Parameter "embedding_trainable" set to True.


In [23]:
vocab = preprocessor.context['vocab_unit'].state['term_index']

# Random stand-in for a pre-trained embedding: values drawn uniformly from
# [-0.2, 0.2) with the (input_dim, output_dim) shape the model was built with.
embedding_matrix = np.random.uniform(
    low=-0.2,
    high=0.2,
    size=(
        model.params['embedding_input_dim'],
        model.params['embedding_output_dim'],
    )
)
model.load_embedding_matrix(embedding_matrix)