In [1]:
# Execute the shared setup notebook in this kernel's namespace. Per the output
# it prints, it provides `train_pack_raw`, `dev_pack_raw`, `test_pack_raw`,
# `ranking_task` and `glove_embedding`, which the cells below rely on.
# NOTE(review): the `mz` alias used below is presumably imported there as well — confirm.
%run init.ipynb

Using TensorFlow backend.


matchzoo version 2.1.0

data loading ...
data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`
`ranking_task` initialized with metrics [normalized_discounted_cumulative_gain@3(0.0), normalized_discounted_cumulative_gain@5(0.0), mean_average_precision(0.0)]
loading embedding ...
Downloading data from http://nlp.stanford.edu/data/glove.6B.zip
embedding loaded as `glove_embedding`


In [2]:
# Text preprocessor shared by the train/dev/test splits.
# The two fixed lengths are what `preprocessor.context['input_shapes']`
# reports after fitting: (10,) for text_left and (100,) for text_right.
preprocessor = mz.preprocessors.BasicPreprocessor(
    fixed_length_left=10,
    fixed_length_right=100,
    remove_stop_words=False,
)

In [3]:
# Fit the preprocessor on the training pack only, then reuse the fitted
# preprocessor for the dev and test packs (dev first, then test).
train_pack_processed = preprocessor.fit_transform(train_pack_raw)
dev_pack_processed, test_pack_processed = (
    preprocessor.transform(raw_pack) for raw_pack in (dev_pack_raw, test_pack_raw)
)

Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2118/2118 [00:00<00:00, 3830.07it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 18841/18841 [00:04<00:00, 4282.12it/s]
Processing text_right with append: 100%|██████████| 18841/18841 [00:00<00:00, 879432.02it/s]
Building FrequencyFilter from a datapack.: 100%|██████████| 18841/18841 [00:00<00:00, 142755.77it/s]
Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 82363.40it/s]
Processing text_left with extend: 100%|██████████| 2118/2118 [00:00<00:00, 600685.37it/s]
Processing text_right with extend: 100%|██████████| 18841/18841 [00:00<00:00, 730399.85it/s]
Building Vocabulary from a datapack.: 100%|██████████| 404432/404432 [00:00<00:00, 2574950.98it/s]
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2118/2118 [00:00<00:00, 8353.67it/s]
Processing text

In [4]:
# Display what the fitted preprocessor exposes; the model cell below reads
# 'input_shapes' and 'vocab_size' from this dict, and the embedding cell
# reads 'vocab_unit'.
preprocessor.context

{'filter_unit': <matchzoo.preprocessors.units.frequency_filter.FrequencyFilter at 0x7f5c7ea5f400>,
 'vocab_unit': <matchzoo.preprocessors.units.vocabulary.Vocabulary at 0x7f5b82b9b6d8>,
 'vocab_size': 16674,
 'embedding_input_dim': 16674,
 'input_shapes': [(10,), (100,)]}

In [9]:
model = mz.models.ArcII()

# Hyper-parameters as ordered (name, value) pairs; they are written into
# `model.params` one by one, in exactly this order.
# NOTE(review): 'embedding_output_dim' must agree with the width of the matrix
# later passed to `model.load_embedding_matrix` — confirm the GloVe dimension
# chosen in init.ipynb.
# NOTE(review): the per-block lists presumably need `num_blocks` entries each — verify.
arcii_settings = (
    ('input_shapes', preprocessor.context['input_shapes']),
    ('task', ranking_task),
    ('embedding_input_dim', preprocessor.context['vocab_size']),
    ('embedding_output_dim', 100),
    ('embedding_trainable', True),
    ('num_blocks', 2),
    ('kernel_1d_count', 32),
    ('kernel_1d_size', 3),
    ('kernel_2d_count', [64, 64]),
    ('kernel_2d_size', [3, 3]),
    ('pool_2d_size', [[3, 3], [3, 3]]),
    ('optimizer', 'adam'),
)
for param_name, param_value in arcii_settings:
    model.params[param_name] = param_value
# Anything not set explicitly above is left for MatchZoo to fill in.
model.guess_and_fill_missing_params()

model.build()
model.compile()

In [10]:
# Print the layer-by-layer summary of the backend model built above
# (layer names, output shapes, parameter counts and connections).
model.backend.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_left (InputLayer)          (None, 10)           0                                            
__________________________________________________________________________________________________
text_right (InputLayer)         (None, 100)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             1667400     text_left[0][0]                  
                                                                 text_right[0][0]                 
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 10, 32)       9632        embedding[0][0]                  
__________

In [11]:
# Build the embedding matrix from the term -> index mapping held by the
# fitted vocabulary unit.
term_index = preprocessor.context['vocab_unit'].state['term_index']
embedding_matrix = glove_embedding.build_matrix(term_index)

In [12]:
# Load the matrix built from `glove_embedding` into the model.
# NOTE(review): the embedding layer was declared trainable
# ('embedding_trainable' = True), so these weights are presumably only a
# starting point for training — confirm.
model.load_embedding_matrix(embedding_matrix)

In [14]:
# Unpack the processed test split into inputs and labels, then build a
# callback that scores the model on it. `batch_size=len(test_y)` makes the
# batch size equal to the number of test labels.
# NOTE(review): this callback is passed to training below, so the per-epoch
# metrics are computed on the *test* split; `dev_pack_processed` is not used
# in the visible cells — confirm this is intended.
test_x, test_y = test_pack_processed[:].unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(
    model,
    x=test_x,
    y=test_y,
    batch_size=len(test_y),
)

In [15]:
# Batch generator over the processed training pack in 'pair' mode.
# NOTE(review): `num_dup` / `num_neg` presumably control duplication of each
# positive and the number of sampled negatives per positive — confirm against
# the MatchZoo DataGenerator documentation.
train_generator = mz.DataGenerator(
    train_pack_processed, mode='pair', num_dup=2, num_neg=1, batch_size=20
)
print('num batches:', len(train_generator))

num batches: 102


In [16]:
# Train for 30 epochs from the pair generator; the `evaluate` callback is
# attached and appears to be the source of the per-epoch "Validation:" lines.
# NOTE(review): `workers=30` with multiprocessing is hard-coded for the
# original machine — consider lowering it on hosts with fewer cores.
history = model.fit_generator(
    train_generator,
    epochs=30,
    callbacks=[evaluate],
    workers=30,
    use_multiprocessing=True,
)

Epoch 1/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5936572551455329 - normalized_discounted_cumulative_gain@5(0.0): 0.6454279442532856 - mean_average_precision(0.0): 0.5983365117067648
Epoch 2/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5790632663839901 - normalized_discounted_cumulative_gain@5(0.0): 0.6456997768354856 - mean_average_precision(0.0): 0.6032073817280738
Epoch 3/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5819878490160546 - normalized_discounted_cumulative_gain@5(0.0): 0.6455971993060833 - mean_average_precision(0.0): 0.5962392151679005
Epoch 4/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5492739287024206 - normalized_discounted_cumulative_gain@5(0.0): 0.6064684958153637 - mean_average_precision(0.0): 0.5575314834832998
Epoch 5/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5793394155322161 - normalized_discounted_cumulative_gain@5(0.0): 0.6372166104056097 - mean_average_precision(

In [None]:
# NOTE(review): disabled call kept for reference. `append_params_to_readme` is
# not defined in the visible cells — presumably it comes from init.ipynb.
# append_params_to_readme(model)