In [1]:
import keras
import pandas as pd
import numpy as np
import matchzoo as mz

Using TensorFlow backend.


## Prepare Data 

In [2]:
# Cap every split at its first NUM_EXAMPLES entries; change the value in one place.
NUM_EXAMPLES = 1000

train_pack = mz.datasets.wiki_qa.load_data(stage='train')[:NUM_EXAMPLES]
dev_pack = mz.datasets.wiki_qa.load_data(stage='dev')[:NUM_EXAMPLES]
# The test split has its labels dropped before slicing; it is used for prediction only.
predict_pack = mz.datasets.wiki_qa.load_data(stage='test').drop_label()[:NUM_EXAMPLES]

In [3]:
# Create the DSSM preprocessor and fit it on the training pack only.
# Its fitted state is read later through `preprocessor.context`
# (the 'input_shapes' and 'vocab_unit' entries).
preprocessor = mz.preprocessors.DSSMPreprocessor()
# Kept as the cell's last expression, so the value returned by `fit` is displayed.
preprocessor.fit(train_pack)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 5011.18it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3607.49it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 108854.61it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 296836.80it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 2779480.30it/s]


<matchzoo.preprocessors.dssm_preprocessor.DSSMPreprocessor at 0x119b5c828>

In [4]:
# Run the fitted preprocessor over each split, in the order train -> dev -> predict.
train_pack_processed, dev_pack_processed, predict_pack_processed = (
    preprocessor.transform(pack)
    for pack in (train_pack, dev_pack, predict_pack)
)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 96/96 [00:00<00:00, 3350.89it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 1000/1000 [00:00<00:00, 2041.49it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 97/97 [00:00<00:00, 4056.55it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 991/991 [00:00<00:00, 2975.38it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 108/108 [00:00<

In [5]:
# Wrap the processed training pack in a pair generator (consumed by `fit_generator` later).
# `num_neg=1` matches the `num_neg` passed to `RankHingeLoss` in the Build Model section.
train_generator = mz.PairDataGenerator(
    train_pack_processed,
    num_dup=5,
    num_neg=1,
    batch_size=32,
)

## Build Model

In [6]:
# Ranking task optimised with a rank hinge loss (num_neg=1, margin=1.0).
ranking_task = mz.tasks.Ranking(
    loss=mz.losses.RankHingeLoss(num_neg=1, margin=1.0)
)

# Metrics are given either as string aliases or as metric instances with an explicit cutoff `k`.
ranking_task.metrics = [
    'mae', 'map', 'precision',
    mz.metrics.Precision(k=3),
    mz.metrics.DiscountedCumulativeGain(k=1),
    mz.metrics.DiscountedCumulativeGain(k=3),
    mz.metrics.DiscountedCumulativeGain(k=5),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=1),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=5)
]

# Only the last expression of a cell is displayed, so a bare `ranking_task.metrics`
# before this line showed nothing; inspect the metric list in its own cell instead.
ranking_task.loss

<matchzoo.losses.rank_hinge_loss.RankHingeLoss at 0x1247ffef0>

In [7]:
# Build a DSSM model for the ranking task configured above.
model = mz.models.DSSMModel()
model.params['task'] = ranking_task
# Input shapes are taken from the fitted preprocessor's context.
model.params['input_shapes'] = preprocessor.context['input_shapes']
# Fill in parameters that were not set explicitly (the cell output reports "name" being set).
model.guess_and_fill_missing_params()
# Parameters must be complete before building; compile follows build.
model.build()
model.compile()

Parameter "name" set to DSSMModel.


## Train Model

In [8]:
# Unpack the processed dev pack into the (x, y) pair handed to the validation callback.
dev_x, dev_y = dev_pack_processed.unpack()

In [9]:
# Callback that evaluates the model on the dev split while training runs.
# With valid_steps=2, the recorded training logs show a validation line every second epoch.
evaluate = model.EvaluateOnCall(
    model,
    x=dev_x,
    y=dev_y,
    valid_steps=2,
    batch_size=32,
)

In [10]:
# Train directly on the unpacked training pack, running the `evaluate` callback.
# NOTE(review): the model was compiled with a rank hinge loss (num_neg=1); confirm that
# feeding un-paired data here is intended, as opposed to the PairDataGenerator used next.
model.fit(*train_pack_processed.unpack(), epochs=10, batch_size=32, callbacks=[evaluate])

Epoch 1/10


Validation: loss:0.998804 - mean_absolute_error:0.200083 - mean_average_precision(0):0.119550 - precision@1(0):0.072165 - precision@3(0):0.054983 - discounted_cumulative_gain@1(0):0.104112 - discounted_cumulative_gain@3(0):0.174936 - discounted_cumulative_gain@5(0):0.199254 - normalized_discounted_cumulative_gain@1(0):0.072165 - normalized_discounted_cumulative_gain@3(0):0.112758 - normalized_discounted_cumulative_gain@5(0):0.125955


Epoch 2/10
Epoch 3/10


Validation: loss:0.970272 - mean_absolute_error:0.289726 - mean_average_precision(0):0.098011 - precision@1(0):0.051546 - precision@3(0):0.044674 - discounted_cumulative_gain@1(0):0.074366 - discounted_cumulative_gain@3(0):0.139700 - discounted_cumulative_gain@5(0):0.176178 - normalized_discounted_cumulative_gain@1(0):0.051546 - normalized_discounted_cumulative_gain@3(0):0.085379 - normalized_discounted_cumulative_gain@5(0):0.107228


Epoch 4/10
Epoch 5/10


Validation: loss:1.005884 - mean_absolute_error:0.375100 - mean_average_precision(0):0.104454 - precision@1(0):0.051546 - precision@3(0):0.051546 - discounted_cumulative_gain@1(0):0.074366 - discounted_cumulative_gain@3(0):0.158468 - discounted_cumulative_gain@5(0):0.177033 - normalized_discounted_cumulative_gain@1(0):0.051546 - normalized_discounted_cumulative_gain@3(0):0.096613 - normalized_discounted_cumulative_gain@5(0):0.109482


Epoch 6/10
Epoch 7/10


Validation: loss:1.006295 - mean_absolute_error:0.389762 - mean_average_precision(0):0.114078 - precision@1(0):0.061856 - precision@3(0):0.044674 - discounted_cumulative_gain@1(0):0.089239 - discounted_cumulative_gain@3(0):0.151032 - discounted_cumulative_gain@5(0):0.188161 - normalized_discounted_cumulative_gain@1(0):0.061856 - normalized_discounted_cumulative_gain@3(0):0.097963 - normalized_discounted_cumulative_gain@5(0):0.119625


Epoch 8/10
Epoch 9/10


Validation: loss:0.976474 - mean_absolute_error:0.286072 - mean_average_precision(0):0.094837 - precision@1(0):0.041237 - precision@3(0):0.037801 - discounted_cumulative_gain@1(0):0.059493 - discounted_cumulative_gain@3(0):0.121285 - discounted_cumulative_gain@5(0):0.164168 - normalized_discounted_cumulative_gain@1(0):0.041237 - normalized_discounted_cumulative_gain@3(0):0.072629 - normalized_discounted_cumulative_gain@5(0):0.099093


Epoch 10/10


<keras.callbacks.History at 0x119b93dd8>

In [11]:
# Keep training the same model, this time drawing batches from the pair generator
# with 4 workers in multiprocessing mode.
model.fit_generator(train_generator, epochs=5, callbacks=[evaluate],
                    workers=4, use_multiprocessing=True)

Epoch 1/5


Validation: loss:1.002492 - mean_absolute_error:0.316171 - mean_average_precision(0):0.119442 - precision@1(0):0.061856 - precision@3(0):0.044674 - discounted_cumulative_gain@1(0):0.089239 - discounted_cumulative_gain@3(0):0.151032 - discounted_cumulative_gain@5(0):0.211828 - normalized_discounted_cumulative_gain@1(0):0.061856 - normalized_discounted_cumulative_gain@3(0):0.099241 - normalized_discounted_cumulative_gain@5(0):0.138121


Epoch 2/5
Epoch 3/5


Validation: loss:0.999009 - mean_absolute_error:0.329531 - mean_average_precision(0):0.124680 - precision@1(0):0.072165 - precision@3(0):0.044674 - discounted_cumulative_gain@1(0):0.104112 - discounted_cumulative_gain@3(0):0.154573 - discounted_cumulative_gain@5(0):0.215370 - normalized_discounted_cumulative_gain@1(0):0.072165 - normalized_discounted_cumulative_gain@3(0):0.105148 - normalized_discounted_cumulative_gain@5(0):0.141672


Epoch 4/5
Epoch 5/5


Validation: loss:0.992476 - mean_absolute_error:0.332337 - mean_average_precision(0):0.122589 - precision@1(0):0.061856 - precision@3(0):0.051546 - discounted_cumulative_gain@1(0):0.089239 - discounted_cumulative_gain@3(0):0.165905 - discounted_cumulative_gain@5(0):0.213890 - normalized_discounted_cumulative_gain@1(0):0.061856 - normalized_discounted_cumulative_gain@3(0):0.109550 - normalized_discounted_cumulative_gain@5(0):0.139551


<keras.callbacks.History at 0x119b93518>

### Training with Dynamic Data Generator

In [12]:
# Preprocess without word hashing; hashing is handed to the data generator instead
# (presumably so it is applied lazily while batches are produced -- TODO confirm).
pp_no_hashing = mz.preprocessors.DSSMPreprocessor(with_word_hashing=False)
train_pack_processed_no_hashing = pp_no_hashing.fit_transform(train_pack)

# Build the hashing unit from the term index fitted by `pp_no_hashing`.
word_hashing_unit = mz.processor_units.WordHashingUnit(
    pp_no_hashing.context['vocab_unit'].state['term_index']
)

# The generator receives the hashing function and the un-hashed training pack.
dynamic_data_generator = mz.data_generator.DynamicDataGenerator(
    word_hashing_unit.transform,
    train_pack_processed_no_hashing,
)
model.fit_generator(
    dynamic_data_generator,
    epochs=2,
    workers=4,
    use_multiprocessing=True,
)

Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 6206.70it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 2777.65it/s]
Processing text_left with extend: 100%|██████████| 96/96 [00:00<00:00, 82056.90it/s]
Processing text_right with extend: 100%|██████████| 1000/1000 [00:00<00:00, 373990.55it/s]
Fitting vocabulary unit.: 100%|██████████| 84935/84935 [00:00<00:00, 2402066.05it/s]
Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 96/96 [00:00<00:00, 5845.47it/s]
Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 1000/1000 [00:00<00:00, 3576.16it/

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x124930550>

## Model Prediction

In [13]:
# Score the first 10 entries of the processed prediction pack. Only element [0] of
# `unpack()` (the model input) is used; the labels were dropped when the pack was loaded.
model.predict(predict_pack_processed[:10].unpack()[0])

array([[0.46259156],
       [0.46258917],
       [0.4625917 ],
       [0.46259177],
       [0.4625912 ],
       [0.46259165],
       [0.46257597],
       [0.46257755],
       [0.46258023],
       [0.46258736]], dtype=float32)

## Model Evaluation

In [14]:
# Report the loss and every metric configured on the ranking task.
# Note: this evaluates on the processed *training* pack, not on the dev pack.
model.evaluate(*train_pack_processed.unpack())



{'loss': 0.9582958526611328,
 'mean_absolute_error': 0.43928235244750974,
 mean_average_precision(0): 0.20924723424723424,
 precision@1(0): 0.15625,
 precision@3(0): 0.09375,
 discounted_cumulative_gain@1(0): 0.22542110013890052,
 discounted_cumulative_gain@3(0): 0.33133051570965705,
 discounted_cumulative_gain@5(0): 0.3617159505927088,
 normalized_discounted_cumulative_gain@1(0): 0.15625,
 normalized_discounted_cumulative_gain@3(0): 0.2047359246651801,
 normalized_discounted_cumulative_gain@5(0): 0.223416573897238}

## Automation (under construction)

In [15]:
# train_pack.shuffle(inplace=True)

# divider = int(len(train_pack) * 0.9)
# train_pack_divided = train_pack[:divider]
# test_pack_divided = train_pack[divider:]

# director = mz.Director()
# director.params['models'] = [
#     mz.models.DenseBaselineModel(),
#     mz.models.DSSMModel(),
#     mz.models.ArcIModel(),
#     mz.models.KNRMModel(),
#     mz.models.ConvKNRMModel(),
#     mz.models.DUETModel()
# ]
# director.params['evals_per_model'] = 3
# director.params['train_pack'] = train_pack_divided
# director.params['test_pack'] = test_pack_divided
# director.params['task'] = ranking_task

# trials = director.action(verbose=1)

# for trial in trials:
#     print('Model Name:', trial['model_params']['name'])
#     print("Hyper Parameter Sampled:" , trial['sampled_params'])
#     print('Loss:', trial['loss'])
#     print()

## Misc

### Load Embedding Matrix

In [16]:
# NOTE: rebinding `model` replaces the DSSM model trained above with a fresh ArcI model.
model = mz.models.ArcIModel()
model.params['task'] = ranking_task
term_index = preprocessor.context['vocab_unit'].state['term_index']
# One embedding row per vocabulary term plus one extra
# (presumably reserved for padding / unknown terms -- TODO confirm).
model.params['embedding_input_dim'] = len(term_index) + 1
# Remaining parameters are filled in automatically (see the "Parameter ... set to" output).
model.guess_and_fill_missing_params()
model.build()
model.compile()

Parameter "name" set to ArcIModel.
Parameter "input_shapes" set to [(30,), (30,)].
Parameter "embedding_trainable" set to True.


In [17]:
# Same term index that sized the embedding layer in the previous cell.
# NOTE(review): `vocab` is not used below in the visible cells -- confirm whether it is still needed.
vocab = preprocessor.context['vocab_unit'].state['term_index']

# Random placeholder weights drawn uniformly from [-0.2, 0.2),
# shaped (embedding_input_dim, embedding_output_dim) to match the model's embedding layer.
embedding_matrix = np.random.uniform(
    low=-0.2,
    high=0.2,
    size=(model.params['embedding_input_dim'], model.params['embedding_output_dim']),
)
model.load_embedding_matrix(embedding_matrix)