In [1]:
import keras
import pandas as pd
import numpy as np
import matchzoo as mz

Using TensorFlow backend.


## Read Data 

In [2]:
def read_data(path, include_label):
    """Read a tab-separated file with a header row into a list of tuples.

    Every non-blank data row must have exactly seven tab-separated fields.
    Fields 1, 2, 5, 6 and 7 are used as question id, question text,
    document id, document text and label; fields 3 and 4 are ignored.

    Args:
        path: Location of the TSV file. Its first line is treated as a
            header and skipped.
        include_label: When true, the last field is parsed with ``float``
            and appended to each tuple; when false it is dropped.

    Returns:
        A list of ``(qid, did, q, d, label)`` tuples if ``include_label``
        is true, otherwise a list of ``(qid, did, q, d)`` tuples. An empty
        or header-only file yields an empty list.

    Raises:
        ValueError: If a non-blank row does not have exactly seven fields,
            or (with ``include_label``) its label is not a valid float.
    """
    def scan_file():
        # Decode explicitly as UTF-8 rather than the platform default, so the
        # same file parses identically on every OS.
        # NOTE(review): assumes the corpus files are UTF-8 — confirm.
        with open(path, encoding='utf-8') as in_file:
            # The default keeps an empty file from raising StopIteration
            # inside this generator (which surfaces as RuntimeError).
            next(in_file, None)  # skip header
            for raw_line in in_file:
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at EOF) carry no row.
                    continue
                yield line.split('\t')
    if include_label:
        return [(qid, did, q, d, float(label)) for qid, q, _, _, did, d, label in scan_file()]
    return [(qid, did, q, d) for qid, q, _, _, did, d, _ in scan_file()]

# Load the WikiQA train and test splits from their tab-separated files.
# Training rows keep their label; test rows are read without one.
train = read_data('data/WikiQA/WikiQACorpus/WikiQA-train.tsv', include_label=True)
predict  = read_data('data/WikiQA/WikiQACorpus/WikiQA-test.tsv', include_label=False)

## Pack Data

In [3]:
# Pack at most the first 10,000 training rows and the first 1,000 prediction
# rows; anything beyond those cut-offs is not used in this notebook.
train_pack = mz.pack(train[:10000])
predict_pack = mz.pack(predict[:1000])

In [4]:
# NOTE(review): this replaces the `label` column with random 0/1 integers, so
# whatever labels were packed from the training file are discarded — confirm
# this is intended for the demo.
# A dedicated, fixed-seed generator keeps the generated labels identical across
# kernel restarts without touching NumPy's global random state.
train_pack.relation['label'] = np.random.RandomState(0).randint(low=0, high=2, size=len(train_pack.relation))

In [5]:
# Preview the left-hand table: one `text_left` entry per `id_left`.
train_pack.left.head()

Unnamed: 0_level_0,text_left
id_left,Unnamed: 1_level_1
Q1,how are glacier caves formed?
Q2,How are the directions of the velocity and for...
Q5,how did apollo creed die
Q6,how long is the term for federal judges
Q7,how a beretta model 21 pistols magazines works


In [6]:
# Preview the right-hand table: one `text_right` entry per `id_right`.
train_pack.right.head()

Unnamed: 0_level_0,text_right
id_right,Unnamed: 1_level_1
D1-0,A partly submerged glacier cave on Perito More...
D1-1,The ice facade is approximately 60 m high
D1-2,Ice formations in the Titlis glacier cave
D1-3,A glacier cave is a cave formed within the ice...
D1-4,"Glacier caves are often called ice caves , but..."


In [7]:
# Preview the relation table that links `id_left` to `id_right` with a label.
train_pack.relation.head()

Unnamed: 0,id_left,id_right,label
0,Q1,D1-0,1
1,Q1,D1-1,0
2,Q1,D1-2,1
3,Q1,D1-3,0
4,Q1,D1-4,0


In [8]:
# Preview the combined frame: each relation row shown together with both texts.
train_pack.frame[:].head()

Unnamed: 0,id_left,text_left,id_right,text_right,label
0,Q1,how are glacier caves formed?,D1-0,A partly submerged glacier cave on Perito More...,1
1,Q1,how are glacier caves formed?,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1-2,Ice formations in the Titlis glacier cave,1
3,Q1,how are glacier caves formed?,D1-3,A glacier cave is a cave formed within the ice...,0
4,Q1,how are glacier caves formed?,D1-4,"Glacier caves are often called ice caves , but...",0


In [9]:
# Mutates train_pack in place (inplace=True); the frame preview below then
# shows additional `length_left` / `length_right` columns.
train_pack.append_text_length(inplace=True)
train_pack.frame[:].head()

Processing `text_left` with len: 100%|██████████| 993/993 [00:00<00:00, 322264.30it/s]
Processing `text_right` with len: 100%|██████████| 9526/9526 [00:00<00:00, 637575.44it/s]


Unnamed: 0,id_left,text_left,length_left,id_right,text_right,length_right,label
0,Q1,how are glacier caves formed?,29,D1-0,A partly submerged glacier cave on Perito More...,58,1
1,Q1,how are glacier caves formed?,29,D1-1,The ice facade is approximately 60 m high,41,0
2,Q1,how are glacier caves formed?,29,D1-2,Ice formations in the Titlis glacier cave,41,1
3,Q1,how are glacier caves formed?,29,D1-3,A glacier cave is a cave formed within the ice...,61,0
4,Q1,how are glacier caves formed?,29,D1-4,"Glacier caves are often called ice caves , but...",128,0


In [10]:
# Preview the prediction frame; it was read without labels, so the output has
# no `label` column.
predict_pack.frame[:].head()

Unnamed: 0,id_left,text_left,id_right,text_right
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-0,African immigration to the United States refer...
1,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-1,The term African in the scope of this article ...
2,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-2,From the Immigration and Nationality Act of 19...
3,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-3,African immigrants in the United States come f...
4,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0-4,"They include people from different national, l..."


## Preprocess Data

In [11]:
# Fit the DSSM preprocessor on the training pack only; the fitted instance is
# reused below to transform both the training and the prediction pack.
preprocessor = mz.preprocessors.DSSMPreprocessor()
preprocessor.fit(train_pack)

Processing `text_left` with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 993/993 [00:00<00:00, 6156.77it/s]
Processing `text_right` with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|██████████| 9526/9526 [00:02<00:00, 3606.52it/s]
Processing `text_left` with extend: 100%|██████████| 993/993 [00:00<00:00, 443692.75it/s]
Processing `text_right` with extend: 100%|██████████| 9526/9526 [00:00<00:00, 415513.42it/s]
Fitting vocabulary unit.: 100%|██████████| 816312/816312 [00:00<00:00, 2723783.07it/s]


<matchzoo.preprocessors.dssm_preprocessor.DSSMPreprocessor at 0x1260b3710>

In [12]:
# Apply the fitted preprocessor to both packs; results are bound to new names,
# and the original packs are still referenced by earlier variables.
train_pack_processed = preprocessor.transform(train_pack)
predict_pack_processed = preprocessor.transform(predict_pack)

Processing `text_left` with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 993/993 [00:00<00:00, 5163.97it/s]
Processing `text_right` with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 9526/9526 [00:03<00:00, 2690.08it/s]
Processing `text_left` with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 108/108 [00:00<00:00, 4127.80it/s]
Processing `text_right` with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|██████████| 992/992 [00:00<00:00, 2453.42it/s]


## Generate Data

In [13]:
# Wrap the processed training pack in a generator that serves batches of 32.
train_generator = mz.DataGenerator(train_pack_processed, batch_size=32)

In [14]:
# The generator is indexed here by position, by negative position and by slice.
first_batch = train_generator[0]
last_batch = train_generator[-1]
# A slice returns a single (X, y) pair as well; it is unpacked in a later cell.
batch_slice = train_generator[2:5]
# Each batch unpacks into an inputs object X and a label array y.
X, y = first_batch

In [15]:
# Inspect the inputs of the first batch: a dict of arrays keyed by field name.
X

{'id_left': array(['Q545', 'Q569', 'Q494', 'Q772', 'Q1167', 'Q684', 'Q946', 'Q209',
        'Q125', 'Q527', 'Q709', 'Q795', 'Q276', 'Q1189', 'Q918', 'Q1305',
        'Q996', 'Q662', 'Q277', 'Q135', 'Q854', 'Q707', 'Q107', 'Q1061',
        'Q184', 'Q223', 'Q1127', 'Q1331', 'Q30', 'Q1026', 'Q767', 'Q1219'],
       dtype='<U5'), 'text_left': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), 'length_left': array([26, 46, 34, 37, 30, 24, 28, 40, 41, 25, 45, 56, 40, 25, 42, 36, 33,
        44, 58, 37, 31, 29, 46, 38, 74, 37, 29, 23, 41, 19, 38, 19]), 'id_right': array(['D536-1', 'D560-2', 'D487-3', 'D748-1', 'D1121-0', 'D667-9',
        'D914-3', 'D209-1', 'D125-2', 'D519-3', 'D691-1', 'D769-11',
        'D275-14', 'D1139-13', 'D887-3', 'D1249-1', 'D961-15', 'D646-2',
        'D276-1', 'D135-4', '

In [16]:
# Inspect the labels of the first batch (one entry per example in the batch).
y

array([1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1])

In [17]:
# Rebinds X and y to the sliced batches; the shape shows how many examples the
# three-batch slice [2:5] contains in total.
X, y = batch_slice
y.shape

(96,)

## Model Building

In [18]:
# Configure a ranking task. Metrics may be given as string aliases or as
# metric objects; DCG and NDCG are each tracked at cut-offs 1, 3 and 5.
ranking_task = mz.tasks.Ranking()
ranking_task.metrics = (
    ['mae', 'map', 'precision', mz.metrics.Precision(k=3)]
    + [mz.metrics.DiscountedCumulativeGain(k=cutoff) for cutoff in (1, 3, 5)]
    + [mz.metrics.NormalizedDiscountedCumulativeGain(k=cutoff) for cutoff in (1, 3, 5)]
)
# Display the metrics as stored on the task.
ranking_task.metrics

['mae',
 mean_average_precision(0),
 precision@1(0),
 precision@3(0),
 discounted_cumulative_gain@1(0),
 discounted_cumulative_gain@3(0),
 discounted_cumulative_gain@5(0),
 normalized_discounted_cumulative_gain@1(0),
 normalized_discounted_cumulative_gain@3(0),
 normalized_discounted_cumulative_gain@5(0)]

In [19]:
# Create the DSSM model and configure it before building.
model = mz.models.DSSMModel()
# The input shapes come from the context of the fitted preprocessor, so this
# cell depends on the preprocessing cells above having been run.
input_shapes = preprocessor.context['input_shapes']
model.params['input_shapes'] = input_shapes
model.params['task'] = ranking_task
# Remaining parameters are filled in by the library before build/compile.
model.guess_and_fill_missing_params()
model.build()
model.compile()

## Model Training

In [20]:
# Train for 10 epochs on the unpacked training pack, in batches of 32.
model.fit(*train_pack_processed.unpack(), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x129171240>

In [21]:
# Continue training the same model for 10 more epochs, this time drawing
# batches from train_generator with 4 workers and multiprocessing enabled.
model.fit_generator(train_generator, epochs=10, workers=4, use_multiprocessing=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x128ff6c88>

### Training with Upsampled Data Pack

In [23]:
# Reorganize a 20-item slice of the processed training pack pair-wise.
# NOTE(review): presumably num_dup / num_neg control how often each positive
# is duplicated and how many negatives it is paired with — confirm against the
# MatchZoo documentation.
train_pack_upsampled = mz.reorganize_data_pack_pair_wise(train_pack_processed[:20], num_dup=4, num_neg=4)
# Keep training the same model on the reorganized pack, in batches of 64.
model.fit(*train_pack_upsampled.unpack(), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x128e39198>

## Model Predicting 

In [None]:
# Predict on a 10-item slice of the processed prediction pack; only the first
# element returned by unpack() (the model inputs) is passed to the model.
model.predict(predict_pack_processed[:10].unpack()[0])

## Model Evaluating

In [None]:
# NOTE(review): this evaluates on the same processed training pack the model
# was fitted on, so the reported metrics are not a held-out estimate.
model.evaluate(*train_pack_processed.unpack())