In [1]:
# %load train_nmt.py
from nmt import *
from pprint import pprint
from setup import setup
from data_iterator import TextIterator

model_options = setup('fren_bpe')
pprint(model_options)

# add random seed
model_options['trng'] = RandomStreams(19920206)
model_options['n_words_src'] = model_options['voc_sizes'][0]
model_options['n_words'] = model_options['voc_sizes'][1]

# load dictionaries and invert them
worddicts   = [None] * len(model_options['dictionaries'])
worddicts_r = [None] * len(model_options['dictionaries'])
for ii, dd in enumerate(model_options['dictionaries']):
    with open(dd, 'rb') as f:
        worddicts[ii] = pkl.load(f)
    worddicts_r[ii] = dict()
    for kk, vv in worddicts[ii].iteritems():
        worddicts_r[ii][vv] = kk

# reload options
if model_options['reload_'] and os.path.exists(model_options['saveto']):
    print 'Reloading model options'
    with open('%s.pkl' % saveto, 'rb') as f:
        model_options = pkl.load(f)

print 'Loading data'
train = TextIterator(model_options['datasets'], model_options['dictionaries'], model_options['voc_sizes'], 
                     batch_size=model_options['batch_size'], maxlen=model_options['maxlen'])
valid = TextIterator(model_options['valid_datasets'], model_options['dictionaries'], model_options['voc_sizes'], 
                     batch_size=model_options['batch_size'], maxlen=200)

@Timeit
def build_networks(options):
    funcs = dict()

    print 'Building model: E -> F & F -> E model'
    params_ef = init_params(options, 'ef_')
    params_fe = init_params(options, 'fe_')
    print 'Done.'

    # reload parameters
    if options['reload_'] and os.path.exists(options['saveto']):
        print 'Reloading model parameters'
        params_ef = load_params(options['saveto'], params_ef)
        params_fe = load_params(options['saveto'], params_fe)

    tparams_ef = init_tparams(params_ef)
    tparams_fe = init_tparams(params_fe)

    # inputs of the model (x1, y1, x2, y2)
    x1 = tensor.matrix('x1', dtype='int64')
    x1_mask = tensor.matrix('x1_mask', dtype='float32')
    y1 = tensor.matrix('y1', dtype='int64')
    y1_mask = tensor.matrix('y1_mask', dtype='float32')
    x2 = tensor.matrix('x2', dtype='int64')
    x2_mask = tensor.matrix('x2_mask', dtype='float32')
    y2 = tensor.matrix('y2', dtype='int64')
    y2_mask = tensor.matrix('y2_mask', dtype='float32')

    # TM reference index
    tef12 = tensor.matrix('ef12', dtype='int64')
    tef12_mask = tensor.matrix('ef12_mask', dtype='float32')
    tef21 = tensor.matrix('ef21', dtype='int64')
    tef21_mask = tensor.matrix('ef21_mask', dtype='float32')
    tfe12 = tensor.matrix('fe12', dtype='int64')
    tfe12_mask = tensor.matrix('fe12_mask', dtype='float32')
    tfe21 = tensor.matrix('fe21', dtype='int64')
    tfe21_mask = tensor.matrix('fe21_mask', dtype='float32')

    print 'build forward-attention models (4 models simultaneously)'
    ret_ef11 = build_model(tparams_ef, [x1, x1_mask, y1, y1_mask], options, 'ef_', False)  # E->F curr
    ret_fe11 = build_model(tparams_fe, [y1, y1_mask, x1, x1_mask], options, 'fe_', False)  # F->E curr
    ret_ef22 = build_model(tparams_ef, [x2, x2_mask, y2, y2_mask], options, 'ef_', False)  # E->F tm
    ret_fe22 = build_model(tparams_fe, [y2, y2_mask, x2, x2_mask], options, 'fe_', False)  # F->E tm

    print 'build cross-attention models'
    ret_ef12 = build_attender(tparams_ef,
                              [ret_ef11['prev_hids'], ret_ef11['prev_emb'], ret_ef22['ctx'], x2_mask],
                              options, 'ef_')  # E->F curr
    ret_ef21 = build_attender(tparams_ef,
                              [ret_ef22['prev_hids'], ret_ef22['prev_emb'], ret_ef11['ctx'], x1_mask],
                              options, 'ef_')  # E->F tm
    ret_fe12 = build_attender(tparams_fe,
                              [ret_fe11['prev_hids'], ret_fe11['prev_emb'], ret_fe22['ctx'], y2_mask],
                              options, 'fe_')  # F->E curr
    ret_fe21 = build_attender(tparams_fe,
                              [ret_fe22['prev_hids'], ret_fe22['prev_emb'], ret_fe11['ctx'], y1_mask],
                              options, 'fe_')  # F->E tm

    print 'build attentions (forward, cross-propagation)'

    def build_prop(atten_ef, atten_fe):
        atten_ef = atten_ef.dimshuffle(1, 0, 2)
        atten_fe = atten_fe.dimshuffle(1, 0, 2)
        attention = tensor.batched_dot(atten_ef, atten_fe).dimshuffle(1, 0, 2)
        return attention

    att_ef12 = build_prop(ret_ef12['attention'], ret_fe22['attention'])
    att_ef21 = build_prop(ret_ef21['attention'], ret_fe11['attention'])
    att_fe12 = build_prop(ret_fe12['attention'], ret_ef22['attention'])
    att_fe21 = build_prop(ret_fe21['attention'], ret_ef11['attention'])

    print 'build loss function (w/o gate)'

    # we first try the simplest version: use a natural attention-gate.
    # TODO: make it as a Neural Gate
    gate_ef1 = ret_ef11['att_sum'] / (ret_ef11['att_sum'] + ret_ef12['att_sum'])
    gate_ef2 = ret_ef22['att_sum'] / (ret_ef22['att_sum'] + ret_ef21['att_sum'])
    gate_fe1 = ret_fe11['att_sum'] / (ret_fe11['att_sum'] + ret_fe12['att_sum'])
    gate_fe2 = ret_fe22['att_sum'] / (ret_fe22['att_sum'] + ret_fe21['att_sum'])

    # get the loss function
    def compute_prob(probs, y, y_mask):

        # compute the loss for the vocabulary-selection side
        y_flat = y.flatten()
        y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
        probw = probs.flatten()[y_flat_idx]
        probw = probw.reshape([y.shape[0], y.shape[1]]) * y_mask
        return probw

    prob_ef11 = ret_ef11['probs']
    prob_ef22 = ret_ef22['probs']
    prob_fe11 = ret_fe11['probs']
    prob_fe22 = ret_fe22['probs']

    # get cost
    cost_ef1 = (-tensor.log(compute_prob(prob_ef11, y1, y1_mask) * gate_ef1 +
                            compute_prob(att_ef12, tef12, tef12_mask) * (1 - gate_ef1)
                            + 1e-8) * (1 - (1 - y1_mask) * (1 - tef12_mask))).sum(0)
    cost_ef2 = (-tensor.log(compute_prob(prob_ef22, y2, y2_mask) * gate_ef2 +
                            compute_prob(att_ef21, tef21, tef21_mask) * (1 - gate_ef2)
                            + 1e-8) * (1 - (1 - y2_mask) * (1 - tef21_mask))).sum(0)
    cost_fe1 = (-tensor.log(compute_prob(prob_fe11, x1, x1_mask) * gate_fe1 +
                            compute_prob(att_fe12, tfe12, tfe12_mask) * (1 - gate_fe1)
                            + 1e-8) * (1 - (1 - x1_mask) * (1 - tfe12_mask))).sum(0)
    cost_fe2 = (-tensor.log(compute_prob(prob_fe22, x2, x2_mask) * gate_fe2 +
                            compute_prob(att_fe21, tfe21, tfe21_mask) * (1 - gate_fe2)
                            + 1e-8) * (1 - (1 - x2_mask) * (1 - tfe21_mask))).sum(0)

    cost = cost_ef1 + cost_ef2 + cost_fe1 + cost_fe2

    # print 'Building sampler'
    # f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    # before any regularizer
    print 'Building Cost Function...',
    inputs = [x1, x1_mask, y1, y1_mask, x2, x2_mask, y2, y2_mask,
              tef12, tef12_mask, tef21, tef21_mask,
              tfe12, tfe12_mask, tfe21, tfe21_mask]

    # f_cost = theano.function(inputs, cost, profile=profile)
    # print 'Done'

    cost = cost.mean()

    print 'Build Gradient (backward)...',

    tparams = dict(tparams_ef.items() + tparams_fe.items())
    grads   = clip(tensor.grad(cost, wrt=itemlist(tparams)), options['clip_c'])
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building Optimizers...',
    f_cost, f_update = eval(options['optimizer'])(lr, tparams, grads, inputs, cost)

    funcs['cost']   = f_cost
    funcs['update'] = f_update

    print 'Done'
    return funcs

# The call that builds and compiles the networks is currently disabled;
# uncomment it to produce the `funcs` dictionary.
#funcs = build_networks(model_options)

# Progress marker printed once the cell has executed up to this point.
print '..Upto here.'


Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5105)


{'batch_size': 32,
 'beamsize': 5,
 'clip_c': 1.0,
 'd_maxlen': 200,
 'datasets': ['/root/workspace/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.shuf',
              '/root/workspace/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.shuf',
              '/root/workspace/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.shuf',
              '/root/workspace/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.shuf'],
 'decay_c': 0.0,
 'decoder': 'gru_cond',
 'dictionaries': ['/root/workspace/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.pkl',
                  '/root/workspace/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.pkl',
                  '/root/workspace/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.pkl',
                  '/root/workspace/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.pkl'],
 'dim': 1024,
 'dim_word': 512,
 'dispFreq': 10,
 'encoder': 'gru',
 'lrate': 2e-05,
 'maxlen': 80,
 'normalize': False,
 'optimizer': 'adam',
 'overwrite': False,
 'patience': 10,
 'reload_': True,
 'sampleFreq': 100,
 'saveFreq': 100,
 'savet

In [5]:
train.reset()
for x1, y1, x2, y2 in train:
    print x1[0]
    print x2[0]
    print y1[0]
    print y2[0]
    break

[179, 349, 4, 10, 3, 5, 2, 33, 218, 19, 42, 20, 36, 16, 64, 3485, 22, 364, 19, 54, 6, 12, 191, 1456, 112, 1489, 13, 53, 4843, 14, 2118, 15, 14, 3320, 2872, 43, 7, 118, 2337, 17, 80, 42, 35, 76, 857, 6, 1006, 99, 96, 4, 223, 25, 1427, 15, 1489, 13, 53, 6389, 25, 7, 44, 23, 27, 212, 18, 3, 5, 2, 27, 228, 13, 467, 4, 7, 129, 18, 3, 5, 2, 468, 21, 189, 19, 80, 42, 11]
[179, 349, 4, 10, 3, 5, 2, 33, 218, 19, 42, 20, 36, 16, 64, 3485, 22, 364, 19, 54, 6, 12, 191, 1456, 112, 1489, 13, 53, 4843, 14, 2118, 15, 14, 3320, 2872, 43, 7, 118, 2337, 17, 80, 42, 35, 76, 857, 6, 1006, 99, 96, 4, 223, 25, 1427, 15, 1489, 13, 53, 6389, 25, 7, 44, 23, 27, 212, 18, 3, 5, 2, 27, 228, 13, 467, 4, 7, 129, 18, 3, 5, 2, 468, 21, 189, 19, 80, 42, 11]
[218, 472, 10, 24, 226, 4, 51, 33, 13, 41, 12, 37, 3181, 17, 347, 7, 1055, 215, 53, 280, 2624, 4, 2, 847, 2304, 14, 2965, 20, 2, 155, 4, 40, 34, 33, 140, 194, 7, 352, 79, 2406, 1473, 11, 1642, 14, 126, 10, 18, 4032, 10069, 21, 2, 36, 93, 132, 727, 4, 2, 131, 4, 399,