In [1]:
import theano
import theano.tensor as T
import numpy as np
import scipy.io as sio
import sys
import codecs
from collections import Counter
import math

In [2]:
sys.path.append('/Users/sandeepsubramanian/CMU/UltraDeep/')

In [3]:
from network import LSTM, FastLSTM
from layer import HiddenLayer, EmbeddingLayer
from learning_method import LearningMethod

In [4]:
path_to_train_src = 'data/train.src'
path_to_train_tgt = 'data/train.tgt'
path_to_dev_src = 'data/dev.src'
path_to_dev_tgt = 'data/dev.tgt'
path_to_test_src = 'data/test.src'

In [5]:
train_src = [line.strip().split() for line in codecs.open(path_to_train_src, 'r', encoding='utf8')]
train_tgt = [line.strip().split() for line in codecs.open(path_to_train_tgt, 'r', encoding='utf8')]
dev_src = [line.strip().split() for line in codecs.open(path_to_dev_src, 'r', encoding='utf8')]
dev_tgt = [line.strip().split() for line in codecs.open(path_to_dev_tgt, 'r', encoding='utf8')]

In [6]:
src_vocab = set()
for line in train_src:
    for word in line:
        src_vocab.add(word)

src_word2ind = {}
src_ind2word = {}

for ind, word in enumerate(src_vocab):
    src_word2ind[word] = ind
    src_ind2word[ind] = word

In [7]:
target_vocab = set()
for line in train_tgt:
    for word in line:
        target_vocab.add(word)

target_word2ind = {}
target_ind2word = {}

for ind, word in enumerate(target_vocab):
    target_word2ind[word] = ind
    target_ind2word[ind] = word

In [8]:
src_inp = T.ivector()
tgt_inp = T.ivector()
tgt_op = T.ivector()
#src_lengths = T.ivector()
#tgt_mask = T.fmatrix()

In [22]:
src_embedding_layer = EmbeddingLayer(input_dim=len(src_word2ind), output_dim=64)
tgt_embedding_layer = EmbeddingLayer(input_dim=len(target_word2ind), output_dim=64)
src_lstm_forward = LSTM(input_dim=src_embedding_layer.output_dim, hidden_dim=128, with_batch=False)
src_lstm_backward = LSTM(input_dim=src_embedding_layer.output_dim, hidden_dim=128, with_batch=False)
tgt_lstm = LSTM(input_dim=tgt_embedding_layer.output_dim, hidden_dim=2 * src_lstm_forward.hidden_dim, with_batch=False)
tgt_projection_layer = HiddenLayer(input_dim=tgt_lstm.hidden_dim * 2, output_dim=len(target_word2ind))




In [23]:
src_inp_t = np.random.rand(5,).astype(np.int32)
tgt_inp_t = np.random.rand(5,).astype(np.int32)
tgt_op_t = np.random.rand(5,).astype(np.int32)
#src_lengths_t = np.random.randint(0, 5, size=(10,)).astype(np.int32)
#tgt_mask_t = np.float32(np.random.rand(10, 5).astype(np.float32) > 0.5)

In [24]:
print src_lstm_forward.input_dim, src_lstm_forward.hidden_dim, tgt_lstm.input_dim, tgt_lstm.hidden_dim 

64 128 64 256


In [25]:
# Get embedding matrices
src_emb_inp = src_embedding_layer.link(src_inp)
print 'source embedding', src_emb_inp.eval({src_inp:src_inp_t}).shape
tgt_emb_inp = tgt_embedding_layer.link(tgt_inp)
print 'target embedding', tgt_emb_inp.eval({tgt_inp:tgt_inp_t}).shape

# Get BiLSTM representations
src_lstm_forward.link(src_emb_inp)
src_lstm_backward.link(src_emb_inp[::-1, :])
encoder_representation = T.concatenate((src_lstm_forward.h, src_lstm_backward.h[::-1, :]), axis=1)
print 'src lstm forward', src_lstm_forward.h.eval({src_inp:src_inp_t}).shape
print 'src lstm backward', src_lstm_backward.h.eval({src_inp:src_inp_t}).shape
print 'bilstm', encoder_representation.eval({src_inp:src_inp_t}).shape

# Get Target LSTM representation & Attention Vectors
tgt_lstm.h_0 = encoder_representation[-1]
tgt_lstm.link(tgt_emb_inp)
attention = tgt_lstm.h.dot(encoder_representation.transpose())
attention = attention.dot(encoder_representation)
print 'tgt lstm', tgt_lstm.h.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t}).shape
print 'attention vectors', attention.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t}).shape

# Concatenate the attention vectors to the Target LSTM output before predicting the next word
target_representation = T.concatenate([attention, tgt_lstm.h], axis=1)

# Predict each 
proj_output_rep = T.nnet.softmax(tgt_projection_layer.link(target_representation))
print 'proj rep', proj_output_rep.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t}).shape

source embedding (5, 64)
target embedding (5, 64)
src lstm forward (5, 128)
src lstm backward (5, 128)
bilstm (5, 256)
tgt lstm (5, 256)
attention vectors (5, 256)
proj rep (5, 3121)


In [26]:
'''
cost = - (T.log(proj_output_rep[
    T.arange(tgt_inp.shape[0]).dimshuffle(0, 'x').repeat(tgt_inp.shape[1], axis=1).flatten(),
    T.arange(tgt_inp.shape[1]).dimshuffle('x', 0).repeat(tgt_inp.shape[0], axis=0).flatten(),
    tgt_op.flatten()
]) * tgt_mask.flatten()).sum() / T.neq(tgt_mask, 0).sum()
print cost.eval({tgt_inp:tgt_inp_t, tgt_mask:tgt_mask_t, tgt_op:tgt_op_t, src_inp:src_inp_t, src_lengths:src_lengths_t})
'''

"\ncost = - (T.log(proj_output_rep[\n    T.arange(tgt_inp.shape[0]).dimshuffle(0, 'x').repeat(tgt_inp.shape[1], axis=1).flatten(),\n    T.arange(tgt_inp.shape[1]).dimshuffle('x', 0).repeat(tgt_inp.shape[0], axis=0).flatten(),\n    tgt_op.flatten()\n]) * tgt_mask.flatten()).sum() / T.neq(tgt_mask, 0).sum()\nprint cost.eval({tgt_inp:tgt_inp_t, tgt_mask:tgt_mask_t, tgt_op:tgt_op_t, src_inp:src_inp_t, src_lengths:src_lengths_t})\n"

In [27]:
cost = T.nnet.categorical_crossentropy(proj_output_rep, tgt_op).mean()
print cost.eval({tgt_inp:tgt_inp_t, src_inp:src_inp_t, tgt_op:tgt_op_t})

8.04578588964


In [29]:
params = src_embedding_layer.params + tgt_embedding_layer.params + src_lstm_forward.params + src_lstm_backward.params + tgt_lstm.params[:-1] + tgt_projection_layer.params

In [32]:
updates=LearningMethod(clip=5.0).get_updates('adam', cost, params)

In [33]:
f_train = theano.function(
    inputs=[src_inp, tgt_inp, tgt_op],
    outputs=cost,
    updates=updates
)

In [34]:
f_eval = theano.function(
    inputs=[src_inp, tgt_inp],
    outputs=proj_output_rep,
)

In [37]:
def get_batch(src_sents, tgt_sents, valid=False):
    assert len(src_sents) == len(tgt_sents)
    src_lengths = [len(sent) for sent in src_sents]
    src_max_len = max(src_lengths)
    if valid == False:
        tgt_lengths = [len(sent) for sent in tgt_sents]
        tgt_max_len = max(tgt_lengths)
    return (
        np.array([[src_word2ind[x] for x in sent] + ([0] * (src_max_len - len(sent))) for sent in src_sents]).astype(np.int32),
        np.array(src_lengths).astype(np.int32),
        np.array([[target_word2ind[x] for x in sent[:-1]] + ([0] * (tgt_max_len - len(sent))) for sent in tgt_sents]).astype(np.int32),
        np.array([[target_word2ind[x] for x in sent[1:]] + ([0] * (tgt_max_len - len(sent))) for sent in tgt_sents]).astype(np.int32),
        [([1] * (l - 1)) + ([0] * (tgt_max_len - l)) for l in tgt_lengths]
    )

In [38]:
def get_validation_predictions():
    validation_predictions = []    
    for ind, sent in enumerate(dev_src):
        
        if ind % 300 == 0:
            print ind, len(dev_src)
        src_words = np.array([src_word2ind[x] for x in sent]).astype(np.int32)
        current_outputs = [src_word2ind['<s>']]

        while True:
            next_word = f_eval(src_words, current_outputs).argmax(axis=1)[-1]
            current_outputs.append(next_word)
            if next_word == src_word2ind['</s>'] or len(current_outputs) >= 15:
                validation_predictions.append([target_ind2word[x] for x in current_outputs])
                break
    return validation_predictions

In [None]:
all_costs = []
batch_size = 50
n_epochs = 100
for i in xrange(n_epochs):
    print 'Starting epoch %i' % i
    indices = range(len(train_src))
    np.random.shuffle(indices)
    train_src_batch = [train_src[ind] for ind in indices]
    train_tgt_batch = [train_tgt[ind] for ind in indices]
    assert len(train_src_batch) == len(train_tgt_batch)
    costs = []
    for j in xrange(len(train_src_batch)):
        #s_sent, s_length, t_inp, t_op, mask = get_batch(train_src_batch[j:j + batch_size], train_tgt_batch[j:j+batch_size])
        new_cost = f_train(
            np.array([src_word2ind[x] for x in train_src_batch[j]]).astype(np.int32),
            np.array([target_word2ind[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
            np.array([target_word2ind[x] for x in train_tgt_batch[j]][1:]).astype(np.int32),
        )
        all_costs.append((j, new_cost))
        costs.append(new_cost)
        if j % 300 == 0:
            print j, np.mean(costs)
            costs = []
        if np.isnan(new_cost):
            print 'NaN detected.'
            break
        if j % 10000 == 0:
            valid_preds = get_validation_predictions()
            print '==================================================================='
            print 'Epoch %i BLEU on Validation : %s ' % (i, get_validation_bleu(valid_preds))
            print '==================================================================='

    if np.isnan(new_cost):
        print 'NaN detected.'
        break
    valid_preds = get_validation_predictions()
    print '==================================================================='
    print 'Epoch %i BLEU on Validation : %s ' % (i, get_validation_bleu(valid_preds))
    print '==================================================================='

Starting epoch 0
0 8.04610935803
0 1006
300 1006
600

In [101]:
' '.join(valid_preds[1])

u"currency it 's a <unk> <unk> . </s> . </s> <unk> <unk> ? </s> ."

In [102]:
' '.join(dev_tgt[1])

u'<s> it is august fifteenth . </s>'

In [238]:
test_batch_size = 100

for i in xrange(0, len(dev_src), test_batch_size):
    src_lengths = np.array([len(x) for x in dev_src[i:i+test_batch_size]]).astype(np.int32)
    src_maxlen = np.max(src_lengths)
    src_inps = np.array([ [src_word2ind[x] for x in dev_src[j]] + ([0] * (src_maxlen - len(dev_src[j]))) for j in xrange(i, i + test_batch_size)]).astype(np.int32)
    current_outputs = [[target_word2ind['<s>']] for _ in xrange(len(features))]
    final_outputs = [None] * len(features)

    mapping = {j: j for j in xrange(len(features))}

    while len(current_outputs) > 0:
        to_delete = []
        next_words = f_eval(src_inps, src_lengths, current_outputs)[:, -1, :].argmax(axis=1)
        assert len(mapping) == len(next_words) == len(current_outputs)
        for j in xrange(len(next_words)):
            current_outputs[j].append(next_words[j])
            if next_words[j] == target_word2ind['</s>'] or len(current_outputs[j]) >= 20:
                final_outputs[mapping[j]] = current_outputs[j]
                to_delete.append(j)
        for j in sorted(to_delete)[::-1]:
            del features[j]
            del current_outputs[j]
            del mapping[j]
        new_index = 0
        for k in sorted(mapping.keys()):
            if k > new_index:
                mapping[new_index] = mapping[k]
                del mapping[k]
            new_index += 1

    assert all(final_outputs)
    break

In [59]:
xx = f_eval(np.array([src_word2ind[x] for x in train_src_batch[0]]).astype(np.int32), np.array([target_word2ind[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32)).argmax(axis=1)

In [60]:
[target_ind2word[_] for _ in xx]

[u'can', u"'s", u'<unk>', u'<unk>', u'time', u'for', u'</s>']

In [40]:
def bleu_stats(hypothesis, reference):
    stats = []
    stats.append(len(hypothesis))
    stats.append(len(reference))
    for n in xrange(1,5):
        s_ngrams = Counter([tuple(hypothesis[i:i+n]) for i in xrange(len(hypothesis)+1-n)])
        r_ngrams = Counter([tuple(reference[i:i+n]) for i in xrange(len(reference)+1-n)])
        stats.append(max([sum((s_ngrams & r_ngrams).values()), 0]))
        stats.append(max([len(hypothesis)+1-n, 0]))
    return stats

In [41]:
def bleu(stats):
    if len(filter(lambda x: x==0, stats)) > 0:
        return 0
    (c, r) = stats[:2]
    log_bleu_prec = sum([math.log(float(x)/y) for x,y in zip(stats[2::2],stats[3::2])]) / 4.
    return math.exp(min([0, 1-float(r)/c]) + log_bleu_prec)

In [42]:
def get_validation_bleu(hypotheses):
    stats = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    ref_lines = [line.strip().split() for line in open(path_to_dev_tgt, 'r')]
    for hyp, ref in zip(hypotheses, ref_lines):
        stats += np.array(bleu_stats(hyp, ref))
    return "%.2f" % (100*bleu(stats))