In [1]:
import keras
import numpy as np
import matchzoo as mz
import pickle
import simplejson as json
from time import time
import collections
import sys
from matchzoo.preprocessors import DSSMPreprocessor
from matchzoo.processor_units import WordHashingUnit
from matchzoo.data_generator import DynamicDataGenerator
from matchzoo import chain_transform

Using TensorFlow backend.
Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt


In [2]:
class DiskDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves pre-batched training data from disk.

    Every batch is stored in its own JSON file called ``batch<N>.json``; the
    file name is appended directly to ``path``. Each file must provide the
    keys ``id_left``, ``id_right``, ``labels``, ``text_left`` and
    ``text_right``. The two text entries hold one token sequence per sample
    (presumably letter-trigrams -- confirm against the code that writes the
    batch files). Every token sequence is turned into a count vector of
    length ``len(term_index) + 1``.

    Args:
        path: Prefix that the batch file name is appended to. It is
            concatenated as-is, so a directory needs its trailing separator
            (e.g. ``'data/'``).
        term_index: Mapping from token to its column in the count vector.
            Tokens missing from the mapping are written to column 0.
        no_of_batches: Number of batches reported by ``len()``.
        start: Offset added to every batch index before the file name is
            built, so the generator can serve files ``start`` onwards.
    """

    def __init__(self, path, term_index, no_of_batches, start=0):
        self.path = path
        self.term_index = term_index
        self.hashing_unit = WordHashingUnit(term_index)
        self.term_len = len(term_index)
        self.batch_size = 100
        self.no_of_batches = no_of_batches
        self.start = start

    def __len__(self):
        """Return the number of batches this generator serves."""
        return int(self.no_of_batches)

    def _vectorize(self, token_lists, n_rows):
        """Turn the first ``n_rows`` token sequences into count vectors.

        Returns an array of shape ``(n_rows, term_len + 1)`` in which row
        ``i`` holds, for every distinct token of ``token_lists[i]``, its
        occurrence count at the column given by ``term_index``.
        """
        # Start from zeros: columns of terms that do not occur in a sample
        # must be 0, not whatever happened to be in the allocated memory.
        matrix = np.zeros((n_rows, self.term_len + 1))
        for i in range(n_rows):
            counts = collections.Counter(token_lists[i])
            for term, count in counts.items():
                # Unknown terms all share column 0; the count of the last
                # unknown term seen is the one that is kept.
                matrix[i, self.term_index.get(term, 0)] = count
        return matrix

    def __data_generation(self, filename):
        """Load one batch file and return ``(X, labels)``.

        ``X`` is a dict with the keys ``id_left``, ``id_right``,
        ``text_left`` and ``text_right``; the text entries are the count
        matrices built by ``_vectorize``.
        """
        with open(self.path+filename, 'r', encoding='utf-8') as json_file:
            data_blob = json.load(json_file)

        id_left = np.array(data_blob['id_left'])
        id_right = np.array(data_blob['id_right'])
        labels = np.array(data_blob['labels'])

        X = {
            'id_left': id_left,
            'id_right': id_right,
            'text_left': self._vectorize(data_blob['text_left'], len(id_left)),
            'text_right': self._vectorize(data_blob['text_right'], len(id_right))
        }
        return X, labels

    def __getitem__(self, index):
        """Return one batch, or lists of batches for a slice.

        An integer ``index`` yields ``(X, labels)`` for that batch. A slice
        yields ``(list_of_X, list_of_labels)`` with one entry per selected
        batch.
        """
        if isinstance(index, slice):
            # slice.indices() resolves None, zero and negative bounds against
            # the number of batches, exactly like list slicing does.
            X = list()
            Y = list()
            for i in range(*index.indices(len(self))):
                x, y = self[i]
                X.append(x)
                Y.append(y)
            return X, Y

        filename = 'batch'+str(index + self.start)+'.json'
        return self.__data_generation(filename)

    def unpack(self, start=None, stop=None, step=None):
        """Load the selected batches and merge them into one ``(X, y)`` pair.

        Args:
            start, stop, step: Slice bounds over the batch indices; the
                defaults select every batch.

        Returns:
            ``(all_X, all_y)`` where the id arrays and labels of all batches
            are stacked with ``np.hstack`` and the text matrices with
            ``np.vstack``.

        Raises:
            IndexError: If the slice selects no batches.
        """
        X, Y = self[start:stop:step]
        if not X:
            raise IndexError('slice selects no batches to unpack')

        # Stack every field once instead of re-copying the accumulated
        # arrays for each additional batch.
        all_X = {
            'id_left': np.hstack([x['id_left'] for x in X]),
            'id_right': np.hstack([x['id_right'] for x in X]),
            'text_left': np.vstack([x['text_left'] for x in X]),
            'text_right': np.vstack([x['text_right'] for x in X])
        }
        all_y = np.hstack(Y)

        return all_X, all_y

In [3]:
# Load the token -> column mapping used to vectorise the text fields.
# NOTE: unpickling can execute arbitrary code, so only load term index files
# that come from a trusted source.
with open('data/term_index_99.pkl', 'rb') as term_index_file:
    term_index = pickle.load(term_index_file)

# Number of batch<N>.json files the generator will serve per epoch.
no_of_batches = 10

In [4]:
# Number of entries in the term index; the generator's count vectors and the
# model's input shapes are both this value plus one.
len(term_index)

9644

In [5]:
# Serve the batch files found under data/, starting at batch0.json.
gen = DiskDataGenerator(
    path='data/',
    term_index=term_index,
    no_of_batches=no_of_batches,
)

In [6]:
# Ranking task evaluated with the 'mae', 'map' and 'precision' metrics,
# precision at k=3, and DCG / NDCG at k = 1, 3 and 5.
ranking_task = mz.tasks.Ranking()
ranking_task.metrics = (
    ['mae', 'map', 'precision', mz.metrics.Precision(k=3)]
    + [mz.metrics.DiscountedCumulativeGain(k=cutoff) for cutoff in (1, 3, 5)]
    + [mz.metrics.NormalizedDiscountedCumulativeGain(k=cutoff) for cutoff in (1, 3, 5)]
)

# DSSM model with two inputs (left and right text), each a count vector with
# one column per term-index entry plus one extra column.
model = mz.models.DSSMModel()
input_shapes = [(len(term_index) + 1,) for _ in range(2)]
model.params['input_shapes'] = input_shapes
model.params['task'] = ranking_task

# Fill in the remaining hyper-parameters, then build and compile the model.
model.guess_and_fill_missing_params()
model.build()
model.compile()

Example run with `tensorflow-gpu` installed:

In [7]:
# Train for two epochs, reading every batch from disk through the generator.
model.fit_generator(gen, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe27b4413c8>

The cell below was run in a virtualenv with the regular (non-GPU) `tensorflow` package installed:

In [8]:
# Train for four epochs on the same disk-backed generator.
# NOTE(review): if this runs in the same kernel as the previous fit call,
# training presumably continues from the already-fitted weights -- confirm
# that is intended.
model.fit_generator(gen, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f488e4146a0>