In [1]:
import theano
import theano.tensor as T
import numpy as np
import scipy.io as sio
import sys
import codecs
from collections import Counter
import math
import theano.sandbox.cuda
theano.sandbox.cuda.use("gpu0")
import copy

Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, CuDNN 4007)


In [2]:
# Extend the module search path so the imports in the next cell can be resolved
# (presumably `recurrent`, `layer` and `optimizers` live in this checkout — confirm).
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
sys.path.append('/usr1/home/ssandeep/IncrementalMT/SanDeepLearn/')

In [3]:
from recurrent import LSTM, FastLSTM
from layer import FullyConnectedLayer, EmbeddingLayer
from optimizers import Optimizer

  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [4]:
# Locations of the parallel corpus files, relative to the working directory.
# Train and dev have a source (.src) and a target (.tgt) side; test has source only.
path_to_train_src, path_to_train_tgt = 'data/train.src', 'data/train.tgt'
path_to_dev_src, path_to_dev_tgt = 'data/dev.src', 'data/dev.tgt'
path_to_test_src = 'data/test.src'

In [5]:
def _read_tokenized(path):
    """Read a UTF-8 text file into a list of whitespace-split token lists.

    One inner list is produced per line of the file. The file is opened in a
    ``with`` block so the handle is closed deterministically instead of being
    left to the garbage collector.
    """
    with codecs.open(path, 'r', encoding='utf8') as handle:
        return [line.strip().split() for line in handle]


# Load every corpus split as a list of sentences, each a list of unicode tokens.
train_src = _read_tokenized(path_to_train_src)
train_tgt = _read_tokenized(path_to_train_tgt)
dev_src = _read_tokenized(path_to_dev_src)
dev_tgt = _read_tokenized(path_to_dev_tgt)
test_src = _read_tokenized(path_to_test_src)

In [6]:
# Every distinct token that occurs on the source side of the training data.
src_vocab = set()
for sentence in train_src:
    src_vocab.update(sentence)

# Token <-> integer id lookup tables. Ids are assigned in the set's iteration
# order, so id 0 belongs to a real word (there is no reserved padding/unknown id).
src_word2ind = {word: ind for ind, word in enumerate(src_vocab)}
src_ind2word = {ind: word for ind, word in enumerate(src_vocab)}

In [7]:
# Every distinct token that occurs on the target side of the training data.
target_vocab = set()
for sentence in train_tgt:
    target_vocab.update(sentence)

# Token <-> integer id lookup tables. Ids are assigned in the set's iteration
# order, so id 0 belongs to a real word (there is no reserved padding/unknown id).
target_word2ind = {word: ind for ind, word in enumerate(target_vocab)}
target_ind2word = {ind: word for ind, word in enumerate(target_vocab)}

In [8]:
# Symbolic int32 vectors for a single sentence pair:
#   src_inp - source word ids
#   tgt_inp - target word ids fed to the decoder
#   tgt_op  - target word ids the decoder should predict
src_inp, tgt_inp, tgt_op = T.ivectors(3)
# Symbolic scalar; not referenced by any other cell shown in this notebook.
index = T.scalar()
# Disabled placeholders for a batched variant:
#src_lengths = T.ivector()
#tgt_mask = T.fmatrix()

In [9]:
# Length-5 int32 dummy inputs used only to evaluate intermediate graph shapes.
# np.random.rand samples from [0, 1), so the int32 cast truncates every entry
# to 0: each array is five zeros (the random draw is still performed).
src_inp_t, tgt_inp_t, tgt_op_t = (
    np.random.rand(5,).astype(np.int32) for _ in range(3)
)

In [41]:
#
# Model
#
# Hyper-parameters and layer objects for the encoder-decoder network. The graph
# that wires these layers together is built in a later cell.
src_emb_dim      = 256  # source word embedding dimension
tgt_emb_dim      = 256  # target word embedding dimension
src_lstm_op_dim = 512  # source LSTMs hidden dimension
# The decoder state must be as wide as the concatenated forward+backward encoder
# state: it is initialised from an encoder row and dotted with the encoder output.
tgt_lstm_op_dim = 2 * src_lstm_op_dim  # target LSTM hidden dimension
beta = 500 # Regularization coefficient


n_src = len(src_word2ind)  # number of words in the source language
n_tgt = len(target_word2ind)  # number of words in the target language

# Embedding Lookup Tables
src_embedding_layer = EmbeddingLayer(input_dim=n_src, output_dim=src_emb_dim, name='src_embedding')
tgt_embedding_layer = EmbeddingLayer(input_dim=n_tgt, output_dim=tgt_emb_dim, name='tgt_embedding')

# Encoder BiLSTM and Decoder LSTM
src_lstm_forward = LSTM(input_dim=src_emb_dim, output_dim=src_lstm_op_dim)
# The backward encoder reads the (reversed) SOURCE embeddings, so its input size
# is src_emb_dim. It was previously tgt_emb_dim, which only worked because both
# embedding sizes are currently 256.
src_lstm_backward = LSTM(input_dim=src_emb_dim, output_dim=src_lstm_op_dim)
tgt_lstm = LSTM(input_dim=tgt_emb_dim, output_dim=tgt_lstm_op_dim)

# Projection layers
# proj_layer1 maps [attention vector ; decoder state] to a softmax over target words.
proj_layer1 = FullyConnectedLayer(input_dim=tgt_lstm_op_dim + 2 * src_lstm_op_dim, output_dim=n_tgt, activation='softmax')
# proj_layer2 is constructed but excluded from `params` below; its only use in
# the graph-building cell is commented out.
proj_layer2 = FullyConnectedLayer(input_dim=2 * src_lstm_op_dim, output_dim=tgt_emb_dim, activation='tanh')

# Trainable parameters handed to the optimizer.
# NOTE(review): tgt_lstm.params[:-1] drops the decoder's last parameter —
# presumably its learned initial state, since h_0 is replaced by the encoder
# state when the graph is built; confirm against the LSTM implementation.
params = src_embedding_layer.params + tgt_embedding_layer.params + src_lstm_forward.params + src_lstm_backward.params + tgt_lstm.params[:-1] + proj_layer1.params # + proj_layer2.params

In [39]:
# Sanity check: print the configured layer dimensions.
# NOTE(review): tgt_lstm.output_dim is printed twice; one of them may have been
# meant to be tgt_lstm.input_dim — confirm.
print src_lstm_forward.input_dim, src_lstm_forward.output_dim, tgt_lstm.output_dim, tgt_lstm.output_dim 

256 512 1024 1024


In [42]:
# Build the symbolic forward pass and training cost for ONE sentence pair.
# After each stage the partial graph is evaluated on the dummy inputs and its
# shape printed, as a shape sanity check.

#Get embedding matrices
src_emb_inp = src_embedding_layer.fprop(src_inp)
print 'source embedding', src_emb_inp.eval({src_inp:src_inp_t}).shape
tgt_emb_inp = tgt_embedding_layer.fprop(tgt_inp)
print 'target embedding', tgt_emb_inp.eval({tgt_inp:tgt_inp_t}).shape

# Get BiLSTM representations
# fprop's return value is discarded; the per-step states are read from `.h`
# afterwards (fprop apparently stores them there — confirm in the LSTM class).
src_lstm_forward.fprop(src_emb_inp)
# The backward LSTM reads the source in reverse order ...
src_lstm_backward.fprop(src_emb_inp[::-1, :])
# ... and its states are reversed again so row t of both halves refers to source position t.
encoder_representation = T.concatenate((src_lstm_forward.h, src_lstm_backward.h[::-1, :]), axis=1)
print 'src lstm forward', src_lstm_forward.h.eval({src_inp:src_inp_t}).shape
print 'src lstm backward', src_lstm_backward.h.eval({src_inp:src_inp_t}).shape
print 'bilstm', encoder_representation.eval({src_inp:src_inp_t}).shape

# Get Target LSTM representation & Attention Vectors
# The decoder's initial state is set to the last row of the encoder output
# (must be assigned before fprop so the decoder graph picks it up).
tgt_lstm.h_0 = encoder_representation[-1]
tgt_lstm.fprop(tgt_emb_inp)
# Disabled alternative: feed a projected source context to the decoder at every step.
#repeated_src_context = T.repeat(encoder_representation[-1].dimshuffle('x', 0), tgt_emb_inp.shape[0], axis=0)
#repeated_src_context = proj_layer2.fprop(repeated_src_context)
#print 'repeated src_context', repeated_src_context.eval({src_inp: src_inp_t, tgt_inp: tgt_inp_t}).shape
#tgt_sentence_emb = T.concatenate((tgt_emb_inp, repeated_src_context), axis=1)

# Attention
# Scores: dot product of every decoder state with every encoder state.
# NOTE(review): the scores are used as weights directly, with no softmax or
# other normalisation — confirm this is intended.
attention = tgt_lstm.h.dot(encoder_representation.transpose())
# Context vectors: score-weighted sum of the encoder states, one row per decoder step.
attention = attention.dot(encoder_representation)
print 'tgt lstm', tgt_lstm.h.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t}).shape
print 'attention vectors', attention.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t}).shape

# Concatenate the attention vectors to the Target LSTM output before predicting the next word
target_representation = T.concatenate([attention, tgt_lstm.h], axis=1)
print 'taget representaiton', target_representation.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t}).shape

# Predict the output sequence of words
# One softmax distribution over the target vocabulary per decoder step.
proj_output_rep = proj_layer1.fprop(target_representation)
print 'proj rep', proj_output_rep.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t}).shape

# Compute cost
# Mean cross-entropy between the predicted distributions and the integer targets in tgt_op.
cost = T.nnet.categorical_crossentropy(proj_output_rep, tgt_op).mean()
# Penalise changes in the element-wise squared decoder activations between
# consecutive time steps, weighted by beta.
# NOTE(review): this is an element-wise difference of squares, not a difference
# of per-step vector norms — check against the cited paper.
# NOTE(review): with a single decoder step both slices are empty, so the mean is
# taken over zero elements (likely NaN) — confirm how one-step targets behave.
cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2) # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
print 'cost', cost.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t, tgt_op:tgt_op_t})

source embedding (5, 256)
target embedding (5, 256)
src lstm forward (5, 512)
src lstm backward (5, 512)
bilstm (5, 1024)
tgt lstm (5, 1024)
attention vectors (5, 1024)
taget representaiton (5, 2048)
proj rep (5, 3121)
cost 8.5069694519


In [44]:
# Adam parameter updates for `cost` with respect to `params`, produced by the
# project's Optimizer (constructed with clip=5.0).
updates = Optimizer(clip=5.0).adam(cost=cost, params=params)

In [47]:
# Compiled training step: takes (source ids, decoder input ids, expected output
# ids) for one sentence pair, applies the optimizer updates and returns the cost.
f_train = theano.function([src_inp, tgt_inp, tgt_op], cost, updates=updates)

In [49]:
# Compiled inference function: takes (source ids, target-prefix ids) and returns
# the softmax output of proj_layer1, one row per target-prefix position.
f_eval = theano.function([src_inp, tgt_inp], proj_output_rep)

In [50]:
def get_batch(src_sents, tgt_sents, valid=False):
    """Convert tokenised sentence pairs into zero-padded int32 id matrices.

    Parameters
    ----------
    src_sents : list of list of str
        Source sentences; every token must be a key of ``src_word2ind``.
    tgt_sents : list of list of str
        Target sentences, parallel to ``src_sents``; every token must be a key
        of ``target_word2ind``. Each sentence is assumed to be non-empty
        (otherwise the padded rows would have inconsistent lengths).
    valid : bool
        When true, only the source side is converted and the three
        target-side entries of the result are ``None``. (Previously this path
        always failed on unbound target-length variables.)

    Returns
    -------
    tuple of five elements
        ``(src_ids, src_lengths, tgt_inputs, tgt_outputs, tgt_mask)``:

        * ``src_ids``     - int32 array, one row per sentence, padded with 0.
        * ``src_lengths`` - int32 array of unpadded source lengths.
        * ``tgt_inputs``  - int32 array of each target minus its last token, padded with 0.
        * ``tgt_outputs`` - int32 array of each target minus its first token, padded with 0.
        * ``tgt_mask``    - list of lists, 1 for real target positions and 0 for padding.

        Note that 0 is used as padding although it is also a regular word id;
        use ``src_lengths`` / ``tgt_mask`` to tell the two apart.

    Raises
    ------
    AssertionError
        If the two sides contain a different number of sentences.
    ValueError
        If the batch is empty.
    KeyError
        If a token is missing from the corresponding vocabulary.
    """
    assert len(src_sents) == len(tgt_sents)
    if len(src_sents) == 0:
        # max() over an empty list would raise a far less helpful ValueError.
        raise ValueError('get_batch() requires at least one sentence pair')

    src_lengths = [len(sent) for sent in src_sents]
    src_max_len = max(src_lengths)
    src_ids = np.array(
        [[src_word2ind[x] for x in sent] + ([0] * (src_max_len - len(sent))) for sent in src_sents]
    ).astype(np.int32)
    src_lengths_arr = np.array(src_lengths).astype(np.int32)

    if valid:
        # Validation mode: no target-side tensors are built.
        return (src_ids, src_lengths_arr, None, None, None)

    tgt_lengths = [len(sent) for sent in tgt_sents]
    tgt_max_len = max(tgt_lengths)
    # Inputs and outputs are the same sentence shifted by one token, so every
    # row has tgt_max_len - 1 columns after padding.
    tgt_inputs = np.array(
        [[target_word2ind[x] for x in sent[:-1]] + ([0] * (tgt_max_len - len(sent))) for sent in tgt_sents]
    ).astype(np.int32)
    tgt_outputs = np.array(
        [[target_word2ind[x] for x in sent[1:]] + ([0] * (tgt_max_len - len(sent))) for sent in tgt_sents]
    ).astype(np.int32)
    tgt_mask = [([1] * (l - 1)) + ([0] * (tgt_max_len - l)) for l in tgt_lengths]
    return (src_ids, src_lengths_arr, tgt_inputs, tgt_outputs, tgt_mask)

In [51]:
def get_validation_predictions(max_len=15, report_every=300):
    """Greedily decode every sentence of ``dev_src`` with ``f_eval``.

    Decoding starts from ``'<s>'``. At each step the whole prefix is passed to
    ``f_eval`` again and the arg-max word of the last row is appended. A
    sentence is finished when ``'</s>'`` is produced or when the output
    (including ``'<s>'``) has reached ``max_len`` tokens; at least one word is
    always generated.

    Parameters
    ----------
    max_len : int
        Cap on the number of output tokens per sentence, counting ``'<s>'``.
        Defaults to the previously hard-coded 15.
    report_every : int
        Print ``"<index> <total>"`` every this many sentences; a falsy value
        disables progress output. Defaults to the previously hard-coded 300.

    Returns
    -------
    list of list of str
        One token list per dev sentence, starting with ``'<s>'``.

    Raises
    ------
    KeyError
        If a source token is not in ``src_word2ind`` or the sentence markers
        are missing from ``target_word2ind``.
    """
    # Loop-invariant lookups, hoisted out of the per-token loop.
    start_id = target_word2ind['<s>']
    stop_id = target_word2ind['</s>']
    total = len(dev_src)

    validation_predictions = []
    for ind, sent in enumerate(dev_src):
        if report_every and ind % report_every == 0:
            # Single-argument call form: prints the same text under Python 2's print statement.
            print('%s %s' % (ind, total))
        src_words = np.array([src_word2ind[x] for x in sent]).astype(np.int32)
        current_outputs = [start_id]

        while True:
            # Only the distribution for the newest position is needed.
            next_word = int(f_eval(src_words, current_outputs).argmax(axis=1)[-1])
            current_outputs.append(next_word)
            if next_word == stop_id or len(current_outputs) >= max_len:
                break
        validation_predictions.append([target_ind2word[x] for x in current_outputs])
    return validation_predictions

In [52]:
def get_test_predictions(max_len=15, report_every=300):
    """Greedily decode every sentence of ``test_src`` with ``f_eval``.

    Decoding starts from ``'<s>'``. At each step the whole prefix is passed to
    ``f_eval`` again and the arg-max word of the last row is appended. A
    sentence is finished when ``'</s>'`` is produced or when the output
    (including ``'<s>'``) has reached ``max_len`` tokens; at least one word is
    always generated.

    Parameters
    ----------
    max_len : int
        Cap on the number of output tokens per sentence, counting ``'<s>'``.
        Defaults to the previously hard-coded 15.
    report_every : int
        Print ``"<index> <total>"`` every this many sentences; a falsy value
        disables progress output. Defaults to the previously hard-coded 300.

    Returns
    -------
    list of list of str
        One token list per test sentence, starting with ``'<s>'``.

    Raises
    ------
    KeyError
        If a source token is not in ``src_word2ind`` or the sentence markers
        are missing from ``target_word2ind``.
    """
    # Loop-invariant lookups, hoisted out of the per-token loop.
    start_id = target_word2ind['<s>']
    stop_id = target_word2ind['</s>']
    total = len(test_src)

    test_predictions = []
    for ind, sent in enumerate(test_src):
        if report_every and ind % report_every == 0:
            # Single-argument call form: prints the same text under Python 2's print statement.
            print('%s %s' % (ind, total))
        src_words = np.array([src_word2ind[x] for x in sent]).astype(np.int32)
        current_outputs = [start_id]

        while True:
            # Only the distribution for the newest position is needed.
            next_word = int(f_eval(src_words, current_outputs).argmax(axis=1)[-1])
            current_outputs.append(next_word)
            if next_word == stop_id or len(current_outputs) >= max_len:
                break
        test_predictions.append([target_ind2word[x] for x in current_outputs])
    return test_predictions

In [53]:
# Greedy-decode the whole test set; this issues one f_eval call per generated token.
test_preds = get_test_predictions()

0 506


KeyboardInterrupt: 

In [35]:
# Greedy-decode the whole dev set; this issues one f_eval call per generated token.
validation_preds = get_validation_predictions()

0 1006
300 1006
600 1006
900 1006


In [36]:
# Corpus-level BLEU of the dev-set predictions decoded in the previous cell.
# This used to score `valid_preds`, a name only assigned inside the training
# loop, so it either failed on a fresh kernel or scored stale predictions.
get_validation_bleu(validation_preds)

'27.43'

In [None]:
# Training driver.
#
# Each epoch shuffles the training pairs and performs one f_train update per
# sentence pair (no mini-batching). Every 10000 updates the dev set is decoded
# and scored with BLEU; when the score matches or beats the best seen so far,
# the dev predictions and a fresh decode of the test set are kept. Checkpoint
# scores are appended to a log file. Training stops on the first NaN cost.
all_costs = []          # (index within epoch, cost) for every update performed
batch_size = 50         # unused here: only referenced by the disabled get_batch call
n_epochs = 100
best_valid_preds = None
best_valid_score = -sys.maxint
best_test_preds = None

# `with` guarantees the log is closed even if training is interrupted or fails.
with open('blue_valid_log.txt', 'w') as f:
    for i in xrange(n_epochs):
        print('Starting epoch %i' % i)
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []          # costs since the last progress line
        diverged = False
        for j in xrange(len(train_src_batch)):
            #s_sent, s_length, t_inp, t_op, mask = get_batch(train_src_batch[j:j + batch_size], train_tgt_batch[j:j+batch_size])
            tgt_ids = [target_word2ind[x] for x in train_tgt_batch[j]]
            new_cost = f_train(
                np.array([src_word2ind[x] for x in train_src_batch[j]]).astype(np.int32),
                np.array(tgt_ids[:-1]).astype(np.int32),   # decoder input: all but the last token
                np.array(tgt_ids[1:]).astype(np.int32),    # expected output: all but the first token
            )
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                # Mean cost over the updates since the previous progress line.
                print('%s %s' % (j, np.mean(costs)))
                costs = []
            if np.isnan(new_cost):
                print('NaN detected.')
                diverged = True
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_validation_predictions()
                # Score once per checkpoint: every call re-reads the reference file.
                valid_bleu = get_validation_bleu(valid_preds)
                print('===================================================================')
                print('Epoch %i BLEU on Validation : %s ' % (i, valid_bleu))
                print('===================================================================')
                if float(valid_bleu) >= best_valid_score:
                    best_valid_score = float(valid_bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = copy.deepcopy(get_test_predictions())
                    print('Found new best validation score %f ' % (best_valid_score))
                f.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, valid_bleu))
                # Flush so the log survives a later interrupt of this long-running cell.
                f.flush()

        if diverged:
            print('NaN detected.')
            break
        # End-of-epoch score is only reported; it does not update the best-so-far state.
        valid_preds = get_validation_predictions()
        print('===================================================================')
        print('Epoch %i BLEU on Validation : %s ' % (i, get_validation_bleu(valid_preds)))
        print('===================================================================')

Starting epoch 0
0 0.319494
300

In [63]:
# Write the test-set translations kept at the best validation checkpoint, one
# space-joined sentence per line. The tokens are unicode strings (the corpora
# are read with a utf8 decoder), so the file is opened with an explicit UTF-8
# encoder; a plain open() would attempt an implicit ASCII encode on write.
# `with` closes the file even if a write fails.
with codecs.open('output.txt', 'w', encoding='utf8') as f:
    for line in best_test_preds:
        f.write(' '.join(line) + '\n')

In [98]:
# Corpus-level BLEU (returned as a formatted string) of the predictions currently held in valid_preds.
get_validation_bleu(valid_preds)

'28.21'

In [55]:
def bleu_stats(hypothesis, reference):
    """Collect the sufficient statistics for BLEU on one sentence pair.

    Returns a list of ten integers: the hypothesis length, the reference
    length, and then for each n-gram order 1..4 the clipped number of
    hypothesis n-grams that also occur in the reference followed by the total
    number of hypothesis n-grams of that order.
    """
    hyp_len = len(hypothesis)
    ref_len = len(reference)
    stats = [hyp_len, ref_len]
    for order in range(1, 5):
        hyp_ngrams = Counter(
            [tuple(hypothesis[start:start + order]) for start in range(hyp_len + 1 - order)]
        )
        ref_ngrams = Counter(
            [tuple(reference[start:start + order]) for start in range(ref_len + 1 - order)]
        )
        # Counter intersection keeps the minimum count per n-gram (clipped matches).
        overlap = sum((hyp_ngrams & ref_ngrams).values())
        stats.extend([max(overlap, 0), max(hyp_len + 1 - order, 0)])
    return stats

In [56]:
def bleu(stats):
    """Compute BLEU from accumulated statistics.

    Parameters
    ----------
    stats : sequence of 10 numbers
        Hypothesis length, reference length, then (matches, total) pairs for
        n-gram orders 1..4, in the layout produced by ``bleu_stats``.

    Returns
    -------
    0 if any statistic is zero, otherwise the BLEU score as a float: the
    geometric mean of the four n-gram precisions times the brevity penalty
    ``exp(min(0, 1 - r / c))``.
    """
    # any() stops at the first zero and does not depend on filter() returning a list.
    if any(x == 0 for x in stats):
        return 0
    (c, r) = stats[:2]
    log_bleu_prec = sum(
        math.log(float(matched) / total)
        for matched, total in zip(stats[2::2], stats[3::2])
    ) / 4.
    return math.exp(min(0, 1 - float(r) / c) + log_bleu_prec)

In [57]:
def get_validation_bleu(hypotheses):
    """Score ``hypotheses`` against the dev-set references with corpus-level BLEU.

    The references are read from ``path_to_dev_tgt`` on every call, decoded as
    UTF-8 and whitespace-tokenised the same way the training corpora are
    loaded. Decoding matters: the hypotheses are unicode tokens, and comparing
    them with undecoded byte strings makes every n-gram containing a non-ASCII
    character count as a mismatch.

    Parameters
    ----------
    hypotheses : list of list of str
        One token list per dev sentence, in file order. If the number of
        hypotheses and references differ, the extra items are ignored.

    Returns
    -------
    str
        BLEU scaled to 0-100 and formatted with two decimals, e.g. ``'27.43'``.
    """
    stats = np.zeros(10)
    # `with` closes the reference file instead of leaving the handle to the garbage collector.
    with codecs.open(path_to_dev_tgt, 'r', encoding='utf8') as ref_file:
        ref_lines = [line.strip().split() for line in ref_file]
    for hyp, ref in zip(hypotheses, ref_lines):
        stats += np.array(bleu_stats(hyp, ref))
    return "%.2f" % (100*bleu(stats))