In [1]:
from nmt import *
from pprint import pprint
from setup import setup
from data_iterator import TextIterator, prepare_data, prepare_cross

import argparse

# parser = argparse.ArgumentParser()
# parser.add_argument('-m', type=str, default='fren')
# args = parser.parse_args()

model_options = setup('fren_bpe')
pprint(model_options)

# add random seed
model_options['rng']  = numpy.random.RandomState(seed=19920206)
model_options['trng'] = RandomStreams(model_options['rng'].randint(0, 2**32-1))
model_options['n_words_src'] = model_options['voc_sizes'][0]
model_options['n_words'] = model_options['voc_sizes'][1]


# load dictionaries and invert them
worddicts   = [None] * len(model_options['dictionaries'])
worddicts_r = [None] * len(model_options['dictionaries'])
for ii, dd in enumerate(model_options['dictionaries']):
    with open(dd, 'rb') as f:
        worddicts[ii] = pkl.load(f)
    worddicts_r[ii] = dict()
    for kk, vv in worddicts[ii].iteritems():
        worddicts_r[ii][vv] = kk

# reload options
if model_options['reload_'] and os.path.exists(model_options['saveto']):
    print 'Reloading model options'
    with open('%s.pkl' % model_options['saveto'], 'rb') as f:
        model_options = pkl.load(f)

@Timeit
def build_networks(options):
    funcs = dict()

    print 'Building model: E -> F & F -> E model'
    params_ef = init_params(options, 'ef_')
    params_fe = init_params(options, 'fe_')
    print 'Done.'

    # reload parameters
    if options['reload_'] and os.path.exists(options['saveto']):
        print 'Reloading model parameters'
        params_ef = load_params(options['saveto'], params_ef)
        params_fe = load_params(options['saveto'], params_fe)

    tparams_ef = init_tparams(params_ef)
    tparams_fe = init_tparams(params_fe)

    # inputs of the model (x1, y1, x2, y2)
    x1 = tensor.matrix('x1', dtype='int64')
    x1_mask = tensor.matrix('x1_mask', dtype='float32')
    y1 = tensor.matrix('y1', dtype='int64')
    y1_mask = tensor.matrix('y1_mask', dtype='float32')
    x2 = tensor.matrix('x2', dtype='int64')
    x2_mask = tensor.matrix('x2_mask', dtype='float32')
    y2 = tensor.matrix('y2', dtype='int64')
    y2_mask = tensor.matrix('y2_mask', dtype='float32')

    # TM reference index
    tef12 = tensor.matrix('ef12', dtype='int64')
    tef12_mask = tensor.matrix('ef12_mask', dtype='float32')
    tef21 = tensor.matrix('ef21', dtype='int64')
    tef21_mask = tensor.matrix('ef21_mask', dtype='float32')
    tfe12 = tensor.matrix('fe12', dtype='int64')
    tfe12_mask = tensor.matrix('fe12_mask', dtype='float32')
    tfe21 = tensor.matrix('fe21', dtype='int64')
    tfe21_mask = tensor.matrix('fe21_mask', dtype='float32')

    print 'build forward-attention models (4 models simultaneously)...'
    ret_ef11 = build_model(tparams_ef, [x1, x1_mask, y1, y1_mask], options, 'ef_', False, True)   # E->F curr
    ret_fe11 = build_model(tparams_fe, [y1, y1_mask, x1, x1_mask], options, 'fe_', False, False)  # F->E curr
    ret_ef22 = build_model(tparams_ef, [x2, x2_mask, y2, y2_mask], options, 'ef_', False, True)   # E->F tm
    ret_fe22 = build_model(tparams_fe, [y2, y2_mask, x2, x2_mask], options, 'fe_', False, False)  # F->E tm

    print 'build cross-attention models'
    ret_ef12 = build_attender(tparams_ef,
                              [ret_ef11['prev_hids'], ret_ef11['prev_emb'], ret_ef22['ctx'], x2_mask],
                              options, 'ef_')  # E->F curr
    ret_ef21 = build_attender(tparams_ef,
                              [ret_ef22['prev_hids'], ret_ef22['prev_emb'], ret_ef11['ctx'], x1_mask],
                              options, 'ef_')  # E->F tm
    ret_fe12 = build_attender(tparams_fe,
                              [ret_fe11['prev_hids'], ret_fe11['prev_emb'], ret_fe22['ctx'], y2_mask],
                              options, 'fe_')  # F->E curr
    ret_fe21 = build_attender(tparams_fe,
                              [ret_fe22['prev_hids'], ret_fe22['prev_emb'], ret_fe11['ctx'], y1_mask],
                              options, 'fe_')  # F->E tm

    print 'build attentions (forward, cross-propagation)'

    def build_prop(atten_ef, atten_fe):
        atten_ef = atten_ef.dimshuffle(1, 0, 2)
        atten_fe = atten_fe.dimshuffle(1, 0, 2)
        attention = tensor.batched_dot(atten_ef, atten_fe).dimshuffle(1, 0, 2)
        return attention

    att_ef12 = build_prop(ret_ef12['attention'], ret_fe22['attention'])
    att_ef21 = build_prop(ret_ef21['attention'], ret_fe11['attention'])
    att_fe12 = build_prop(ret_fe12['attention'], ret_ef22['attention'])
    att_fe21 = build_prop(ret_fe21['attention'], ret_ef11['attention'])

    print 'build gates!'
    params_gate  = OrderedDict()
    params_gate  = get_layer('bi')[0](options, params_gate, nin=2 * options['dim'])
    tparams_gate = init_tparams(params_gate)

    # a neural gate which is the relatedness of two attentions.
    def build_gate(ctx1, ctx2):
        return get_layer('bi')[1](tparams_gate, ctx1, ctx2)

    gate_ef1 = 1 - build_gate(ret_ef11['ctxs'], ret_ef12['ctxs'])
    gate_ef2 = 1 - build_gate(ret_ef22['ctxs'], ret_ef21['ctxs'])
    gate_fe1 = 1 - build_gate(ret_fe11['ctxs'], ret_fe12['ctxs'])
    gate_fe2 = 1 - build_gate(ret_fe22['ctxs'], ret_fe21['ctxs'])

    print 'Building Gate functions, ...',
    f_gate = theano.function([ret_ef11['ctxs'], ret_ef12['ctxs']],
                              gate_ef1, profile=profile)
    print 'Done.'

    # gate_ef1 = ret_ef11['att_sum'] / (ret_ef11['att_sum'] + ret_ef12['att_sum'])
    # gate_ef2 = ret_ef22['att_sum'] / (ret_ef22['att_sum'] + ret_ef21['att_sum'])
    # gate_fe1 = ret_fe11['att_sum'] / (ret_fe11['att_sum'] + ret_fe12['att_sum'])
    # gate_fe2 = ret_fe22['att_sum'] / (ret_fe22['att_sum'] + ret_fe21['att_sum'])

    print 'build loss function (w/o gate)'

    # get the loss function
    def compute_prob(probs, y, y_mask):

        # compute the loss for the vocabulary-selection side
        y_flat  = y.flatten()
        n_words = probs.shape[-1]
        y_flat_idx = tensor.arange(y_flat.shape[0]) * n_words + y_flat
        probw = probs.flatten()[y_flat_idx]
        probw = probw.reshape([y.shape[0], y.shape[1]]) * y_mask
        return probw

    prob_ef11 = ret_ef11['probs']
    prob_ef22 = ret_ef22['probs']
    prob_fe11 = ret_fe11['probs']
    prob_fe22 = ret_fe22['probs']

    # get cost
    cost_ef1 = (-tensor.log(compute_prob(prob_ef11, y1, y1_mask) * gate_ef1 +
                            compute_prob(att_ef12, tef12, tef12_mask) * (1 - gate_ef1)
                            + 1e-8) * (1 - (1 - y1_mask) * (1 - tef12_mask))).sum(0)
    cost_ef2 = (-tensor.log(compute_prob(prob_ef22, y2, y2_mask) * gate_ef2 +
                            compute_prob(att_ef21, tef21, tef21_mask) * (1 - gate_ef2)
                            + 1e-8) * (1 - (1 - y2_mask) * (1 - tef21_mask))).sum(0)
    cost_fe1 = (-tensor.log(compute_prob(prob_fe11, x1, x1_mask) * gate_fe1 +
                            compute_prob(att_fe12, tfe12, tfe12_mask) * (1 - gate_fe1)
                            + 1e-8) * (1 - (1 - x1_mask) * (1 - tfe12_mask))).sum(0)
    cost_fe2 = (-tensor.log(compute_prob(prob_fe22, x2, x2_mask) * gate_fe2 +
                            compute_prob(att_fe21, tfe21, tfe21_mask) * (1 - gate_fe2)
                            + 1e-8) * (1 - (1 - x2_mask) * (1 - tfe21_mask))).sum(0)

    cost = cost_ef1 + cost_ef2 + cost_fe1 + cost_fe2

    print 'build sampler (one-step)'
    f_init_ef, f_next_ef = build_sampler(tparams_ef, options, options['trng'], 'ef_')
    f_init_fe, f_next_fe = build_sampler(tparams_fe, options, options['trng'], 'fe_')

    print 'build attender (one-step)'
    f_attend_ef = build_attender(tparams_ef, None, options, 'ef_', one_step=True)  # E->F curr
    f_attend_fe = build_attender(tparams_fe, None, options, 'fe_', one_step=True)

    # before any regularizer
    print 'build Cost Function...',
    inputs = [x1, x1_mask, y1, y1_mask, x2, x2_mask, y2, y2_mask,
              tef12, tef12_mask, tef21, tef21_mask,
              tfe12, tfe12_mask, tfe21, tfe21_mask]
    f_valid = theano.function(inputs, cost, profile=profile)

    print 'build Gradient (backward)...',
    cost    = cost.mean()
    tparams = dict(tparams_ef.items() + tparams_fe.items() + tparams_gate.items())
    grads   = clip(tensor.grad(cost, wrt=itemlist(tparams)), options['clip_c'])
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building Optimizers...',
    f_cost, f_update = eval(options['optimizer'])(lr, tparams, grads, inputs, cost)

    print 'Done'

    # put everything into function lists
    funcs['valid']  = f_valid
    funcs['cost']   = f_cost
    funcs['update'] = f_update

    funcs['init_ef'] = f_init_ef
    funcs['init_fe'] = f_init_fe
    funcs['next_ef'] = f_next_ef
    funcs['next_fe'] = f_next_fe

    funcs['att_ef']  = f_attend_ef
    funcs['att_fe']  = f_attend_fe

    funcs['crit_ef'] = ret_ef11['f_critic']
    funcs['crit_fe'] = ret_ef22['f_critic']

    funcs['gate']    = f_gate

    print 'Build Networks... done!'
    return funcs, tparams

funcs, tparams = build_networks(model_options)

print '..Upto here.'

Using gpu device 3: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5005)


{'batch_size': 32,
 'beamsize': 5,
 'clip_c': 1.0,
 'd_maxlen': 200,
 'datasets': ['/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.shuf',
              '/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.shuf',
              '/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.shuf',
              '/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.shuf'],
 'decay_c': 0.0,
 'decoder': 'gru_cond',
 'dictionaries': ['/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.pkl',
                  '/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.pkl',
                  '/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.fr.tok.bpe.pkl',
                  '/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT/.dataset/fren.bpe/train.en.tok.bpe.pkl'],
 'dim': 1024,
 'dim_

In [33]:
# generate sample, either with stochastic sampling or beam search. Note that,
# this function iteratively calls f_init and f_next functions.
def get_sample(tparams,
               funcs,
               x1, x2, y2,
               options,
               rng=None,
               m=0,
               k=1,         # beam-size
               maxlen=200,
               stochastic=True,
               argmax=False):

    # modes
    modes   = ['ef', 'fe']

    # masks
    x1_mask = numpy.ones_like(x1, dtype='float32')
    x2_mask = numpy.ones_like(x2, dtype='float32')
    y2_mask = numpy.ones_like(y2, dtype='float32')

    # k is the beam size we have
    if k > 1:
        assert not stochastic, 'Beam search does not support stochastic sampling'

    sample = []
    sample_score = []
    if stochastic:
        sample_score = 0

    live_k = 1
    dead_k = 0

    hyp_samples = [[]] * live_k
    hyp_scores = numpy.zeros(live_k).astype('float32')
    hyp_states = []

    # get initial state of decoder rnn and encoder context for x1
    ret = funcs['init_'+modes[m]](x1)
    next_state, ctx0 = ret[0], ret[1]   # init-state, contexts
    next_w = -1 * numpy.ones((1,)).astype('int64')  # bos indicator

    # get translation memory encoder context
    _, mctx0 = funcs['init_'+modes[m]](x2)

    # get attention propagation for translation memory
    atts, _ = funcs['crit_'+modes[1-m]](y2, y2_mask, x2, x2_mask)
    atts = numpy.squeeze(atts)

    for ii in xrange(maxlen):
        ctx  = numpy.tile(ctx0,  [live_k, 1])
        mctx = numpy.tile(mctx0, [live_k, 1])

        # --copy mode
        ret  = funcs['att_'+modes[m]](next_state, next_w, mctx)
        mctxs, matt = ret[0], ret[1]    # matt: batchsize x len_x2        
        copy_p = numpy.dot(matt, atts)  # batchsize x len_y2

        # --generate mode
        ret = funcs['next_'+modes[m]](next_w, ctx, next_state)
        next_p, next_w, next_state, ctxs = ret[0], ret[1], ret[2], ret[3]

        # compute gate
        gates = funcs['gate'](ctxs[None, :, :], mctxs[None, :, :])[0]  # batchsize

        # real probabilities
        next_p *= (1 - gates[:, None])
        copy_p *= gates[:, None]

        def _merge():
            temp_p = copy.copy(numpy.concatenate([next_p, copy_p], axis=1))
            lmax = next_p.shape[1]
            for i in range(next_p.shape[0]):
                for j in range(copy_p.shape[1]):
                    if y2[j] != 1:
                        temp_p[i, y2[j]] += copy_p[i, j]
                        temp_p[i, lmax + j] = 0.
            temp_p -= 1e-8
            return temp_p

        merge_p = _merge()
        
        if stochastic:
            if argmax:
                nw = merge_p[0].argmax()
                next_w[0] = nw
            else:
                nw = rng.multinomial(1, pvals=merge_p[0]).argmax()
                
            sample.append(nw)
            sample_score -= numpy.log(merge_p[0, nw])
            if nw == 0:
                break
        else:
            cand_scores = hyp_scores[:, None] - numpy.log(next_p)
            cand_flat = cand_scores.flatten()
            ranks_flat = cand_flat.argsort()[:(k-dead_k)]

            voc_size = next_p.shape[1]
            trans_indices = ranks_flat / voc_size
            word_indices = ranks_flat % voc_size
            costs = cand_flat[ranks_flat]

            new_hyp_samples = []
            new_hyp_scores = numpy.zeros(k-dead_k).astype('float32')
            new_hyp_states = []

            for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)):
                new_hyp_samples.append(hyp_samples[ti]+[wi])
                new_hyp_scores[idx] = copy.copy(costs[idx])
                new_hyp_states.append(copy.copy(next_state[ti]))

            # check the finished samples
            new_live_k = 0
            hyp_samples = []
            hyp_scores = []
            hyp_states = []

            for idx in xrange(len(new_hyp_samples)):
                if new_hyp_samples[idx][-1] == 0:
                    sample.append(new_hyp_samples[idx])
                    sample_score.append(new_hyp_scores[idx])
                    dead_k += 1
                else:
                    new_live_k += 1
                    hyp_samples.append(new_hyp_samples[idx])
                    hyp_scores.append(new_hyp_scores[idx])
                    hyp_states.append(new_hyp_states[idx])
            hyp_scores = numpy.array(hyp_scores)
            live_k = new_live_k

            if new_live_k < 1:
                break
            if dead_k >= k:
                break

            next_w = numpy.array([w[-1] for w in hyp_samples])
            next_state = numpy.array(hyp_states)

    if not stochastic:
        # dump every remaining one
        if live_k > 0:
            for idx in xrange(live_k):
                sample.append(hyp_samples[idx])
                sample_score.append(hyp_scores[idx])

    return sample, sample_score



In [2]:
print 'Loading data'
train = TextIterator(model_options['datasets'], model_options['dictionaries'], model_options['voc_sizes'], 
                     batch_size=16, maxlen=model_options['maxlen'])
valid = TextIterator(model_options['valid_datasets'], model_options['dictionaries'], model_options['voc_sizes'], 
                     batch_size=model_options['batch_size'], maxlen=200)
print 'done.'

Loading data
done.


In [38]:
# load pretrained NMT models
home = '/misc/kcgscratch1/ChoGroup/thoma_exp/memory/TMNMT'
model_ef = home + '/.model/baseline_fren.bpe.npz'
model_fe = home + '/.model/baseline_fren.bpe.npz'


params = unzip(tparams)
#load_params2(model_ef, )

In [42]:
# load parameters
def load_params2(path, params, mode=''):
    pp = numpy.load(path)
    for kk, vv in params.iteritems():
        if kk[:3] == mode:
            if kk[3:] not in pp:
                warnings.warn('%s is not in the archive' % kk)
                continue
            print 'load ...', kk
            params[kk] = pp[kk[3:]]

    return params

params = load_params2(model_ef, params, mode='ef_')
params = load_params2(model_fe, params, mode='fe_')

load ... ef_ff_state_W
load ... ef_decoder_W_comb_att
load ... ef_Wemb_dec
load ... ef_decoder_U_att
load ... ef_encoder_r_U
load ... ef_encoder_r_W
load ... ef_ff_logit_b
load ... ef_encoder_bx
load ... ef_encoder_r_Ux
load ... ef_decoder_bx
load ... ef_decoder_b_nl
load ... ef_decoder_Ux
load ... ef_ff_state_b
load ... ef_encoder_Ux
load ... ef_encoder_r_Wx
load ... ef_decoder_bx_nl
load ... ef_ff_logit_W
load ... ef_encoder_r_b
load ... ef_decoder_Wx
load ... ef_ff_logit_lstm_W
load ... ef_ff_logit_prev_b
load ... ef_ff_logit_ctx_b
load ... ef_decoder_Wcx
load ... ef_decoder_b
load ... ef_encoder_U
load ... ef_decoder_Wc
load ... ef_encoder_W
load ... ef_decoder_b_att
load ... ef_decoder_Wc_att
load ... ef_decoder_U_nl
load ... ef_decoder_U
load ... ef_decoder_W
load ... ef_encoder_r_bx
load ... ef_decoder_c_tt
load ... ef_encoder_b
load ... ef_ff_logit_lstm_b
load ... ef_decoder_Ux_nl
load ... ef_Wemb
load ... ef_encoder_Wx
load ... ef_ff_logit_ctx_W
load ... ef_ff_logit_prev_W
loa

In [52]:
for k, (sx1, sy1, sx2, sy2) in enumerate(train):
        x1, x1_mask = prepare_data(sx1, model_options['maxlen'], model_options['voc_sizes'][0])
        y1, y1_mask = prepare_data(sy1, model_options['maxlen'], model_options['voc_sizes'][1])
        x2, x2_mask = prepare_data(sx2, model_options['maxlen'], model_options['voc_sizes'][2])
        y2, y2_mask = prepare_data(sy2, model_options['maxlen'], model_options['voc_sizes'][3])

        tx12, tx12_mask = prepare_cross(sx1, sx2, x1.shape[0])
        tx21, tx21_mask = prepare_cross(sx2, sx1, x2.shape[0])
        ty12, ty12_mask = prepare_cross(sy1, sy2, y1.shape[0])
        ty21, ty21_mask = prepare_cross(sy1, sy2, y2.shape[0])

        print 'x1:{}, x2:{}, y1:{}, y2:{}'.format(x1.shape, x2.shape, y1.shape, y2.shape)
        
        break

x1:(55, 16), x2:(55, 16), y1:(43, 16), y2:(43, 16)


In [43]:
zipp(params, tparams)

In [53]:
jj = 0
sample, score = get_sample(tparams, funcs,
                           x1[:, jj][:, None],
                           x2[:, jj][:, None],
                           y2[:, jj][:, None],
                           model_options,
                           rng=model_options['rng'],
                           m=1,
                           k=1,
                           maxlen=200,
                           stochastic=True,
                           argmax=False)

In [54]:
print 'Source ', jj, ': ',
for vv in x1[:, jj]:
    if vv == 0:
        break
    if vv in worddicts_r[0]:
        print worddicts_r[0][vv],
    else:
        print 'UNK',
print
print 'Truth ', jj, ' : ',
for vv in y1[:, jj]:
    if vv == 0:
        break
    if vv in worddicts_r[1]:
        print worddicts_r[1][vv],
    else:
        print 'UNK',
print
print 'Sample ', jj, ': ',
ss=sample
for vv in ss:
    if vv == 0:
        break
    if vv in worddicts_r[1]:
        print worddicts_r[1][vv],
    else:
        print 'UNK',
print

Source  0 :  3 . Un bénéficiaire diffé@@ ré qui en fait la demande reçoit du responsable de la gestion du régime complémentaire de pension , des informations sur ses droits à pension dor@@ m@@ ants et sur tout changement des règles régissant le régime complémentaire de pension qui les concerne .
Truth  0  :  3 . A deferred beneficiary who so requests shall receive from the person responsible for managing the supplementary pension scheme information on d@@ orm@@ ant pension rights and on all changes to the rules governing the supplementary pension scheme concerning them .
Sample  0 :  3 . A deferred person who has that controller shall have the him of the extensive pension scheme and and on Rec@@ pension rights and the any changes to the rules of the additional pension scheme .
