In [53]:
import os
import sys
import numpy as np
import theano.tensor as T
import codecs
import theano.sandbox.cuda
theano.sandbox.cuda.use("gpu2")
from collections import Counter
import math
import copy

In [3]:
# Add the UltraDeep checkout to the import path (presumably where the
# `network`, `layer` and `learning_method` modules imported by this notebook
# live -- verify). The membership test keeps the cell idempotent: re-running
# it no longer appends a duplicate entry every time.
ultradeep_path = '/usr0/home/glample/Research/perso/UltraDeep/'
if ultradeep_path not in sys.path:
    sys.path.append(ultradeep_path)

In [5]:
from network import LSTM
from layer import HiddenLayer, EmbeddingLayer
from learning_method import LearningMethod

In [6]:
# Locations of the two corpus files read by the loading cell: one English
# file and one French file. The training loop pairs sentence i of one with
# sentence i of the other, so the files are treated as line-aligned.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
path_to_en = '/usr0/home/glample/Research/IncrementalMT/news-commentary-v8.fr-en.clean.en'
path_to_fr = '/usr0/home/glample/Research/IncrementalMT/news-commentary-v8.fr-en.clean.fr'

In [9]:
def _read_tokenized_corpus(path):
    """Read a UTF-8 text file and return one whitespace-split token list per line.

    The file is opened in a `with` block so the handle is closed as soon as
    the corpus has been read, instead of being left to the garbage collector.
    """
    with codecs.open(path, 'r', encoding='utf-8') as corpus_file:
        return [line.strip().split() for line in corpus_file]


# One list of unicode tokens per sentence, in file order.
english_sentences = _read_tokenized_corpus(path_to_en)
french_sentences = _read_tokenized_corpus(path_to_fr)

In [10]:
# The training loop pairs sentence i of one language with sentence i of the
# other, so both corpora must contain the same number of sentences.
assert len(english_sentences) == len(french_sentences)

In [11]:
# Contiguous 95% / 2% / 3% train / dev / test split. Both corpora have the
# same length (asserted when they are loaded), so one pair of boundaries
# serves both languages and keeps the sentence pairs aligned.
n_sentences = len(english_sentences)
train_end = int(0.95 * n_sentences)
dev_end = int(0.97 * n_sentences)

train_sentences_eng = english_sentences[:train_end]
dev_sentences_eng = english_sentences[train_end:dev_end]
# Named `*_eng` like the other English splits; get_test_predictions() reads
# `test_sentences_eng`, which previously did not exist (NameError).
test_sentences_eng = english_sentences[dev_end:]
test_sentences_en = test_sentences_eng  # backward-compatible alias for the old name

train_sentences_fr = french_sentences[:train_end]
dev_sentences_fr = french_sentences[train_end:dev_end]
test_sentences_fr = french_sentences[dev_end:]

In [14]:
# Source-side vocabulary: every distinct token of the whole English corpus
# (train, dev and test alike), plus the two sentence-boundary markers.
source_words = set()
for sentence in english_sentences:
    source_words.update(sentence)
source_words.update(['<s>', '</s>'])

In [15]:
# Target-side vocabulary: every distinct token of the whole French corpus
# (train, dev and test alike), plus the two sentence-boundary markers.
target_words = set()
for sentence in french_sentences:
    target_words.update(sentence)
target_words.update(['<s>', '</s>'])

In [18]:
# Word <-> integer id tables. Each id is the word's position when iterating
# the vocabulary set; the forward table is derived by inverting the reverse
# one, so the two directions always agree.
source_ind2word = dict(enumerate(source_words))
source_word2ind = dict((word, ind) for ind, word in source_ind2word.items())
target_ind2word = dict(enumerate(target_words))
target_word2ind = dict((word, ind) for ind, word in target_ind2word.items())

In [19]:
# Symbolic placeholders: three integer vectors and one scalar.
# NOTE(review): none of these four variables is referenced again in the
# visible notebook -- the graph cell creates its own src_sentence /
# tgt_sentence / tgt_gold -- so this cell looks like leftover scratch work.
src_inp = T.ivector()
tgt_inp = T.ivector()
tgt_op = T.ivector()
index = T.scalar()

In [21]:
# Early, smaller version of the model (64-d embeddings, 128-d source LSTMs).
# NOTE(review): apart from `tgt_lstm`, which the model cell rebinds to a new
# LSTM, none of these objects is referenced again in the visible notebook;
# the trained model is built from the layers of the model cell instead.
src_embedding_layer = EmbeddingLayer(input_dim=len(source_word2ind), output_dim=64)
tgt_embedding_layer = EmbeddingLayer(input_dim=len(target_word2ind), output_dim=64)
src_lstm_forward = LSTM(input_dim=src_embedding_layer.output_dim, hidden_dim=128, with_batch=False)
src_lstm_backward = LSTM(input_dim=src_embedding_layer.output_dim, hidden_dim=128, with_batch=False)
tgt_lstm = LSTM(input_dim=tgt_embedding_layer.output_dim, hidden_dim=2 * src_lstm_forward.hidden_dim, with_batch=False)
tgt_projection_layer = HiddenLayer(input_dim=tgt_lstm.hidden_dim * 2, output_dim=len(target_word2ind))

In [22]:
# Dummy length-5 int32 vectors.
# NOTE(review): np.random.rand draws from [0, 1), so the int32 cast truncates
# every element to 0 -- these are all-zero vectors, not random word ids. They
# are not referenced again in the visible notebook.
src_inp_t = np.random.rand(5,).astype(np.int32)
tgt_inp_t = np.random.rand(5,).astype(np.int32)
tgt_op_t = np.random.rand(5,).astype(np.int32)

In [24]:
#
# Model
#
# Hyper-parameters and layer objects of the translation model. `params`
# collects the parameters that are later handed to the optimizer.
src_emb_dim      = 256  # source word embedding dimension
tgt_emb_dim      = 256  # target word embedding dimension
src_lstm_hid_dim = 512  # source LSTMs hidden dimension
tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
# NOTE(review): proj_dim and dropout are not referenced anywhere else in the
# visible notebook; proj_layer1 below outputs n_tgt units, not proj_dim.
proj_dim         = 104  # size of the first projection layer
dropout          = 0.5  # dropout rate

n_src = len(source_word2ind)  # number of words in the source language
n_tgt = len(target_word2ind)  # number of words in the target language

# Parameters
params = []

# Source words + target words embeddings layer
src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup') # lookup table for source words
tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup') # lookup table for target words
params += src_lookup.params + tgt_lookup.params

# LSTMs
# Two source LSTMs (the graph cell feeds one the sentence and the other the
# reversed sentence) and one target LSTM. This rebinds `tgt_lstm`.
src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_for', with_batch=False)
src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_rev', with_batch=False)
# NOTE(review): the declared input width is 2 * tgt_emb_dim, but the graph
# cell links tgt_lstm to the plain target embeddings (the concatenation with
# the projected source context is commented out there), which looks like
# tgt_emb_dim wide -- confirm the intended input size.
tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name='tgt_lstm', with_batch=False)
# The last entry of tgt_lstm.params is left out of training -- presumably the
# learned initial state, since the graph cell overwrites tgt_lstm.h_0; verify
# against the ordering of LSTM.params.
params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

# Projection layers
# proj_layer1 maps the concatenated features to a softmax over the n_tgt
# target words. proj_layer2 is only used by commented-out code in the graph
# cell, and its parameters are deliberately left out of `params`.
proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name='proj_layer1', activation='softmax')
proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
params += proj_layer1.params # + proj_layer2.params

In [None]:
# Builds the symbolic translation graph and, after each stage, evaluates it
# on tiny hand-written id sequences to print the resulting shape.
# NOTE(review): is_train_t and is_train are not used by anything else in the
# visible notebook.
is_train_t = 1

# Toy inputs used only for the shape checks below.
src_sentence_t = [3, 4, 2]
tgt_sentence_t = [1, 8, 0, 8, 2]
tgt_gold_t = [1, 3, 2, 2, 1]
# Weight of the hidden-state regularization term added to the cost.
beta = 500


# Train status
is_train = T.iscalar('is_train')
# Input sentence
src_sentence = T.ivector()
# Current output translation
tgt_sentence = T.ivector()
# Gold translation
tgt_gold = T.ivector()

# Look up the embeddings of the source and target word ids.
src_sentence_emb = src_lookup.link(src_sentence)
tgt_sentence_emb = tgt_lookup.link(tgt_sentence)
print 'src_sentence_emb', src_sentence_emb.eval({src_sentence: src_sentence_t}).shape
print 'tgt_sentence_emb', tgt_sentence_emb.eval({tgt_sentence: tgt_sentence_t}).shape

# Run one LSTM over the source sentence and one over its reversal.
src_lstm_for.link(src_sentence_emb)
src_lstm_rev.link(src_sentence_emb[::-1, :])

print 'src_lstm_for.h', src_lstm_for.h.eval({src_sentence: src_sentence_t}).shape
print 'src_lstm_rev.h', src_lstm_rev.h.eval({src_sentence: src_sentence_t}).shape

# Re-reverse the backward states so row t of both halves refers to the same
# source position, then join the two along the feature axis.
src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)
print 'src_context', src_context.eval({src_sentence: src_sentence_t}).shape

# Initialise the target LSTM from the context of the last source position.
# NOTE(review): at that position the backward half has only consumed one
# input row (the last one) -- confirm this is the intended summary vector.
tgt_lstm.h_0 = src_context[-1]
#repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0), tgt_sentence_emb.shape[0], axis=0)
#repeated_src_context = proj_layer2.link(repeated_src_context)
#print 'repeated src_context', repeated_src_context.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
#tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context), axis=1)
print 'tgt sentence emb', tgt_sentence_emb.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
tgt_lstm.link(tgt_sentence_emb)
print 'tgt_lstm.h', tgt_lstm.h.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

# Attention-like mixing: score every target state against every source
# context row (first dot), then use the raw, un-normalised scores to take a
# weighted sum of the source context rows (second dot).
transition = tgt_lstm.h.dot(src_context.transpose())
transition = transition.dot(src_context)
print 'transition', transition.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
# print 'transition_matrix', transition_matrix.eval({src_sentence: src_sentence_t}).shape
# print 'transition_matrix.dot(tgt_lstm.output)', src_context.transpose().dot(src_context.dot(tgt_lstm.output)).eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
# print 'transition_matrix.dot(tgt_lstm.output)', tgt_lstm.h.dot(transition_matrix).eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

# Features for the output layer: mixed source context next to the target state.
transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)
print 'transition_last', transition_last.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

# Softmax projection over the target vocabulary, one row per target position.
prediction = proj_layer1.link(transition_last)
print 'prediction', prediction.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

# Mean cross-entropy against the gold ids, plus a penalty on how much the
# squared target hidden activations change between consecutive time steps.
# NOTE(review): the penalty is computed element-wise on squared activations;
# the cited paper appears to penalise differences of per-step L2 norms
# instead -- confirm which form is intended. With a length-1 target there are
# no consecutive pairs, so the mean is taken over an empty tensor.
cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2) # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf

print 'cost', cost.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t, tgt_gold: tgt_gold_t})

In [26]:
# Parameter updates that minimise `cost` over `params` with the 'adam'
# method; clip=5.0 is presumably a gradient-clipping threshold -- verify
# against LearningMethod.
optimizer = LearningMethod(clip=5.0)
updates = optimizer.get_updates('adam', cost, params)

In [27]:
# Compiled training step: takes source ids, target input ids and gold target
# ids, returns the cost and applies the optimizer updates as a side effect.
f_train = theano.function(
    [src_sentence, tgt_sentence, tgt_gold],
    cost,
    updates=updates,
)

In [28]:
# Compiled inference function: takes source ids and the target prefix
# produced so far, returns one row of softmax outputs per prefix position.
f_eval = theano.function(
    [src_sentence, tgt_sentence],
    prediction,
)

In [None]:
def get_validation_predictions():
    validation_predictions = []    
    for ind, sent in enumerate(dev_sentences_eng[:100]):
        
        if ind % 300 == 0:
            print ind, len(dev_sentences_eng)
        src_words = np.array([source_word2ind[x] for x in sent]).astype(np.int32)
        current_outputs = [target_word2ind['<s>']]

        while True:
            next_word = f_eval(src_words, current_outputs).argmax(axis=1)[-1]
            current_outputs.append(next_word)
            #print [target_ind2word[x] for x in current_outputs]
            if next_word == target_word2ind['</s>'] or len(current_outputs) >= 15:
                validation_predictions.append([target_ind2word[x] for x in current_outputs])
                break
    return validation_predictions

In [39]:
def get_test_predictions():
    test_predictions = []    
    for ind, sent in enumerate(test_sentences_eng):
        
        if ind % 300 == 0:
            print ind, len(test_sentences_eng)
        src_words = np.array([source_word2ind[x] for x in sent]).astype(np.int32)
        current_outputs = [target_word2ind['<s>']]

        while True:
            next_word = f_eval(src_words, current_outputs).argmax(axis=1)[-1]
            current_outputs.append(next_word)
            #print [target_ind2word[x] for x in current_outputs]
            if next_word == target_word2ind['</s>'] or len(current_outputs) >= 15:
                test_predictions.append([target_ind2word[x] for x in current_outputs])
                break
    return test_predictions

In [61]:
# Show one decoded validation hypothesis as a sentence.
# NOTE(review): `valid_preds` is only assigned inside the training loop, which
# appears further down, so this cell depends on kernel state from an earlier
# run; with the default 100-sentence decoding cap, index 900 would be out of range.
print ' '.join(valid_preds[900])

<s> , les politiques ne pas des des politiques pour des , peut des idéales


In [60]:
# English source tokens of development sentence 900, displayed for comparison
# with the decoded hypothesis of the same index.
dev_sentences_eng[900]

[u'though',
 u'it',
 u'is',
 u'ultimately',
 u'the',
 u'Egyptian',
 u'people',
 u'who',
 u'will',
 u'decide',
 u'the',
 u'country',
 u'\u2019',
 u's',
 u'fate',
 u',',
 u'and',
 u'whether',
 u'it',
 u'can',
 u'finally',
 u'take',
 u'decisive',
 u'steps',
 u'towards',
 u'more',
 u'inclusive',
 u'political',
 u'institutions',
 u',',
 u'this',
 u'does',
 u'not',
 u'mean',
 u'that',
 u'outsiders',
 u'can',
 u'do',
 u'nothing',
 u'.']

In [112]:
# Best-so-far trackers updated by the training loop. The score starts at
# negative infinity so that any BLEU value counts as an improvement; unlike
# -sys.maxint this sentinel does not depend on a Python-2-only attribute.
best_valid_preds = None
best_valid_score = float('-inf')
best_test_preds = None

In [None]:
f = open('blue_valid_log.txt', 'w')
all_costs = []
batch_size = 50
n_epochs = 100
for i in xrange(n_epochs):
    print 'Starting epoch %i' % i
    indices = range(len(train_sentences_eng))
    np.random.shuffle(indices)
    train_src_batch = [train_sentences_eng[ind] for ind in indices]
    train_tgt_batch = [train_sentences_fr[ind] for ind in indices]
    assert len(train_src_batch) == len(train_tgt_batch)
    costs = []
    for j in xrange(len(train_src_batch)):
        #s_sent, s_length, t_inp, t_op, mask = get_batch(train_src_batch[j:j + batch_size], train_tgt_batch[j:j+batch_size])
        new_cost = f_train(
            np.array([source_word2ind['<s>']] + [source_word2ind[x] for x in train_src_batch[j]] + [source_word2ind['</s>']]).astype(np.int32),
            np.array([target_word2ind['<s>']] + [target_word2ind[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
            np.array([target_word2ind[x] for x in train_tgt_batch[j]][1:] + [target_word2ind['</s>']]).astype(np.int32),
        )
        all_costs.append((j, new_cost))
        costs.append(new_cost)
        if j % 300 == 0:
            print j, np.mean(costs)
            costs = []
        if np.isnan(new_cost):
            print 'NaN detected.'
            break
        if j % 10000 == 0 and j != 0:
            valid_preds = get_validation_predictions()
            print '==================================================================='
            print 'Epoch %i BLEU on Validation : %s ' % (i, get_validation_bleu(valid_preds))
            print '==================================================================='
            if float(get_validation_bleu(valid_preds)) >= best_valid_score:
                best_valid_score = float(get_validation_bleu(valid_preds))
                best_valid_preds = copy.deepcopy(valid_preds)
                #best_test_preds = get_test_predictions()
                print 'Found new best validation score %f ' % (best_valid_score)
            f.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, get_validation_bleu(valid_preds)))

    if np.isnan(new_cost):
        print 'NaN detected.'
        break
    valid_preds = get_validation_predictions()
    print '==================================================================='
    print 'Epoch %i BLEU on Validation : %s ' % (i, get_validation_bleu(valid_preds))
    print '==================================================================='
f.close()

Starting epoch 0
0 3.18894
300 3.31635
600 3.31026
900 3.24681
1200 3.25304
1500 3.31975
1800 3.31634
2100 3.26617
2400 3.40801
2700 3.32076
3000 3.41637
3300 3.20238
3600 3.45898
3900 3.29132
4200 3.35515
4500 3.44128
4800 3.29924
5100 3.34561
5400 3.3237
5700 3.33885
6000 3.34944
6300 3.25173
6600 3.35672
6900 3.20916
7200 3.39585
7500 3.41353
7800 3.4373
8100 3.39742
8400 3.35773
8700 3.44127
9000 3.40045
9300 3.29823
9600 3.42785
9900 3.38059
0 3108
300 3108
600 3108
900 3108
1200 3108
1500 3108
1800 3108
2100 3108
2400 3108
2700 3108
3000 3108
Epoch 0 BLEU on Validation : 0.00 
Found new best validation score 0.000000 
10200 3.36053
10500 3.40325
10800 3.39586
11100 3.30332
11400 3.33042
11700 3.29486
12000 3.39685
12300 3.44464
12600 3.42495
12900 3.36286
13200 3.34129
13500 3.51194
13800 3.41034
14100 3.32053
14400 3.40615
14700 3.41174
15000 3.38293
15300 3.54414
15600 3.47826
15900 3.48069
16200 3.42324
16500 3.33668
16800 3.40387
17100 3.41698
17400 3.53604
17700 3.50198
1800

KeyboardInterrupt: 

In [114]:
# Display the best validation BLEU recorded by the training loop so far.
best_valid_score

0.0

In [109]:
# Teacher-forced sanity check on the training pair the loop stopped at
# (index j): feed the gold target prefix and keep the argmax word id that
# the model predicts at every position.
src_ids = [source_word2ind['<s>']] + [source_word2ind[x] for x in train_src_batch[j]] + [source_word2ind['</s>']]
tgt_ids = [target_word2ind['<s>']] + [target_word2ind[x] for x in train_tgt_batch[j]][:-1]
res = f_eval(
    np.array(src_ids).astype(np.int32),
    np.array(tgt_ids).astype(np.int32),
).argmax(axis=1)

In [110]:
print ' '.join([target_ind2word[x] for x in res])

’ l&apos; ’ alternative n ’ est pas sûr . . </s>


In [111]:
yy = np.array([target_word2ind['<s>']] + [target_word2ind[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32)
print ' '.join([target_ind2word[x] for x in yy])

<s> naturellement , l ’ alternative n ’ est pas plus sûre


In [42]:
def bleu_stats(hypothesis, reference):
    """Collect the sufficient statistics of BLEU for one sentence pair.

    Args:
        hypothesis: list of tokens produced by the system.
        reference: list of tokens of the reference translation.

    Returns:
        A list of 10 integers:
        [len(hypothesis), len(reference),
         matched 1-grams, total hypothesis 1-grams, ...,
         matched 4-grams, total hypothesis 4-grams].
        Matches are clipped: an n-gram counts at most as often as it occurs
        in the reference. Sentences shorter than n contribute 0 to both
        counts for that n.
    """
    stats = [len(hypothesis), len(reference)]
    for n in range(1, 5):
        # range() over a non-positive bound is empty, so short sentences
        # simply yield empty n-gram multisets.
        hyp_ngrams = Counter(tuple(hypothesis[i:i + n]) for i in range(len(hypothesis) + 1 - n))
        ref_ngrams = Counter(tuple(reference[i:i + n]) for i in range(len(reference) + 1 - n))
        # Multiset intersection keeps the minimum count of each n-gram.
        stats.append(sum((hyp_ngrams & ref_ngrams).values()))
        stats.append(max(len(hypothesis) + 1 - n, 0))
    return stats

In [43]:
def bleu(stats):
    """Compute BLEU (in [0, 1]) from the statistics returned by bleu_stats.

    Args:
        stats: sequence of 10 numbers -- hypothesis length, reference length,
            then (matched, total) n-gram counts for n = 1..4. Sums of
            per-sentence statistics give corpus-level BLEU.

    Returns:
        0 if any statistic is zero (which also rules out a division by zero
        and log(0) below); otherwise the geometric mean of the four n-gram
        precisions multiplied by the brevity penalty.
    """
    # any() works on lists and numpy arrays alike and does not rely on
    # filter() returning a list.
    if any(x == 0 for x in stats):
        return 0
    (c, r) = stats[:2]
    log_bleu_prec = sum(math.log(float(x) / y) for x, y in zip(stats[2::2], stats[3::2])) / 4.
    # Brevity penalty in log space: 0 when the hypothesis is at least as long
    # as the reference, 1 - r/c (negative) otherwise.
    return math.exp(min(0, 1 - float(r) / c) + log_bleu_prec)

In [46]:
def get_validation_bleu(hypotheses):
    """Corpus-level BLEU of `hypotheses` against the French development set.

    Args:
        hypotheses: list of token lists; hypothesis i is scored against
            dev_sentences_fr[i]. If fewer hypotheses than references are
            given, only the leading references are used.

    Returns:
        The BLEU score scaled to 0-100, formatted as a string with two decimals.
    """
    # The decoders emit '<s>' / '</s>' around each hypothesis, but the
    # references are raw corpus tokens without them. Left in, the markers can
    # never match and deflate every n-gram precision, so drop them first.
    boundary_markers = ('<s>', '</s>')
    stats = np.zeros(10)
    for hyp, ref in zip(hypotheses, dev_sentences_fr):
        hyp_words = [word for word in hyp if word not in boundary_markers]
        stats += np.array(bleu_stats(hyp_words, ref))
    return "%.2f" % (100 * bleu(stats))