In [1]:
import os
import sys
import numpy as np
import theano.tensor as T
import codecs
import theano.sandbox.cuda
theano.sandbox.cuda.use("gpu2")

Using gpu device 2: GeForce GTX TITAN X (CNMeM is disabled, CuDNN 4004)


In [2]:
sys.path.append('/usr0/home/glample/Research/IncrementalMT/SanDeepLearn/')

In [3]:
from recurrent import LSTM
from layer import FullyConnectedLayer, EmbeddingLayer
from optimizers import Optimizer

In [4]:
path_to_oracle = '/usr0/home/glample/Research/IncrementalMT/oracle.tsv'

In [5]:
lines = [line.strip().split(' ||| ') for line in codecs.open(path_to_oracle, 'r', encoding='utf8')]

In [74]:
sentences = []
curr_sentence_pointer = 0
for ind, line in enumerate(lines):
    if len(line) == 1 and line[0] == '':
        sentences.append(lines[curr_sentence_pointer:ind])
        curr_sentence_pointer = ind + 1

In [76]:
source_words = set()
for sentence in sentences:
    source_sentence = sentence[0][0]
    for word in source_sentence.split():
        source_words.add(word)
source_words.add('<s>')
source_words.add('</s>')
source_words.add('$NONE$')
source_words.add('<ss>')
source_words.add('</se>')
source_words.add('<fcs>')
source_words.add('</fcs>')

In [77]:
target_words = set()
for sentence in sentences:
    target_sentence = sentence[1][0]
    for word in target_sentence.split():
        target_words.add(word)
target_words.add('<s>')
target_words.add('</s>')
target_words.add('$NONE$')

In [78]:
for sentence in sentences:
    for state in sentence[2:]:
        assert len(state) == 5

In [100]:
source_word2ind = {word:ind for ind, word in enumerate(source_words)}
source_ind2word = {ind:word for ind, word in enumerate(source_words)}
target_word2ind = {word:ind for ind, word in enumerate(target_words)}
target_ind2word = {ind:word for ind, word in enumerate(target_words)}

In [101]:
'''
source_word2ind['<s>'] = len(source_word2ind)
source_word2ind['</s>'] = len(source_word2ind) + 1
source_word2ind['$NONE$'] = len(source_word2ind) + 2
source_word2ind['<ss>'] = len(source_word2ind) + 3
source_word2ind['</se>'] = len(source_word2ind) + 4
source_word2ind['<fcs>'] = len(source_word2ind) + 5
source_word2ind['</fce>'] = len(source_word2ind) + 6

target_word2ind['<s>'] = len(target_word2ind)
target_word2ind['</s>'] = len(target_word2ind) + 1
target_word2ind['$NONE$'] = len(target_word2ind) + 2

source_ind2word[len(source_word2ind)] = '<s>'
source_ind2word[len(source_word2ind) + 1] = '</s>'
source_ind2word[len(source_word2ind) + 2] = '$NONE$'
source_ind2word[len(source_word2ind) + 3] = '<ss>'
source_ind2word[len(source_word2ind) + 4] = '</se>'
source_ind2word[len(source_word2ind) + 5] = '<fcs>'
source_ind2word[len(source_word2ind) + 6] = '</fce>'

target_ind2word[len(target_word2ind)] = '<s>' 
target_ind2word[len(target_word2ind) + 1] = '</s>' 
target_ind2word[len(target_word2ind) + 2] = '$NONE$'
'''

"\nsource_word2ind['<s>'] = len(source_word2ind)\nsource_word2ind['</s>'] = len(source_word2ind) + 1\nsource_word2ind['$NONE$'] = len(source_word2ind) + 2\nsource_word2ind['<ss>'] = len(source_word2ind) + 3\nsource_word2ind['</se>'] = len(source_word2ind) + 4\nsource_word2ind['<fcs>'] = len(source_word2ind) + 5\nsource_word2ind['</fce>'] = len(source_word2ind) + 6\n\ntarget_word2ind['<s>'] = len(target_word2ind)\ntarget_word2ind['</s>'] = len(target_word2ind) + 1\ntarget_word2ind['$NONE$'] = len(target_word2ind) + 2\n\nsource_ind2word[len(source_word2ind)] = '<s>'\nsource_ind2word[len(source_word2ind) + 1] = '</s>'\nsource_ind2word[len(source_word2ind) + 2] = '$NONE$'\nsource_ind2word[len(source_word2ind) + 3] = '<ss>'\nsource_ind2word[len(source_word2ind) + 4] = '</se>'\nsource_ind2word[len(source_word2ind) + 5] = '<fcs>'\nsource_ind2word[len(source_word2ind) + 6] = '</fce>'\n\ntarget_ind2word[len(target_word2ind)] = '<s>' \ntarget_ind2word[len(target_word2ind) + 1] = '</s>' \ntarget_

In [102]:
stack_input = T.ivector()
forward_context_input = T.ivector()
target_input = T.ivector()
action_prediction = T.scalar()
target_output = T.ivector()

In [103]:
# Generate synthetic data to test dimensions
syn_stack_input = np.random.randint(low=0, high=len(source_word2ind), size=(5,)).astype(np.int32)
#syn_stack_input = np.array([61699, 49379, 61701]).astype(np.int32)
syn_forward_context_input = np.random.randint(low=0, high=len(source_word2ind), size=(6,)).astype(np.int32)
syn_action_prediction = np.random.randint(low=0, high=2, size=(1,)).astype(np.float32)[0]
syn_target_input = np.random.randint(low=0, high=len(target_word2ind), size=(7,)).astype(np.int32)
syn_target_output = np.random.randint(low=0, high=len(target_word2ind), size=(7,)).astype(np.int32)

In [104]:
# Neural Inventory

# Model Parameters
src_emb_dim      = 256  # source word embedding dimension
tgt_emb_dim      = 256  # target word embedding dimension
src_lstm_hid_dim = 512  # source LSTMs hidden dimension
tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
beta = 500 # RNN regularization parameter

# Embedding Layers
source_embedding = EmbeddingLayer(input_dim=len(source_word2ind) + 1, output_dim=src_emb_dim)
target_embedding = EmbeddingLayer(input_dim=len(target_word2ind) + 1, output_dim=tgt_emb_dim)

# Stack BiLSTM
stack_source_lstm_forward = LSTM(input_dim=src_emb_dim, output_dim=src_lstm_hid_dim, name='source_stack_lstm_forward')
stack_source_lstm_backward = LSTM(input_dim=src_emb_dim, output_dim=src_lstm_hid_dim, name='source_stack_lstm_backward')

# Forward Context BiLSTM
forward_context_lstm_forward = LSTM(input_dim=src_emb_dim, output_dim=src_lstm_hid_dim, name='source_forward_context_lstm')
forward_context_lstm_backward = LSTM(input_dim=src_emb_dim, output_dim=src_lstm_hid_dim, name='source_forward_context_lstm')

# Target Language LSTM Decoder
target_lstm = LSTM(input_dim = 2 * tgt_emb_dim, output_dim=tgt_lstm_hid_dim, name='target_lstm')

# Action prediction Layer
action_prediction_weights = FullyConnectedLayer(input_dim=2 * stack_source_lstm_forward.output_dim + 2 * forward_context_lstm_forward.output_dim, output_dim=1)

# Target Language Word Prediction Layer
target_word_decoding_weights = FullyConnectedLayer(input_dim=tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, output_dim=len(target_word2ind), activation='softmax')

# Layer to project the Encoder representation to the same size as h_0 of the decoder LSTM
projection_weights = FullyConnectedLayer(input_dim = 2 * src_lstm_hid_dim, output_dim = tgt_emb_dim, activation='tanh')

# Set model parameters
action_params = source_embedding.params + stack_source_lstm_forward.params + stack_source_lstm_backward.params + forward_context_lstm_forward.params + forward_context_lstm_backward.params + action_prediction_weights.params
seq_seq_params = source_embedding.params + stack_source_lstm_forward.params + stack_source_lstm_backward.params + target_embedding.params + target_lstm.params[:-1] + projection_weights.params + target_word_decoding_weights.params

In [106]:
# =====================================================
# The computational graph for this method
# =====================================================

# Get the embedding matrices seq_len x embdding_dim
stack_embedding_matrix = source_embedding.fprop(stack_input)
print 'stack_embedding_matrix', stack_embedding_matrix.eval({stack_input:syn_stack_input}).shape
forward_context_embedding_matrix = source_embedding.fprop(forward_context_input)

# ===========================================================
# Get LSTM representations of the stack, forward context
# ===========================================================

# Stack Representation
stack_source_lstm_forward.fprop(stack_embedding_matrix)
stack_source_lstm_backward.fprop(stack_embedding_matrix[::-1, :])
stack_lstm_representation = T.concatenate((stack_source_lstm_forward.h, stack_source_lstm_backward.h[::-1,:]), axis=1)
print 'stack_lstm_representation', stack_lstm_representation.eval({stack_input:syn_stack_input}).shape

# Forward Context Representation
forward_context_lstm_forward.fprop(forward_context_embedding_matrix)
forward_context_lstm_backward.fprop(forward_context_embedding_matrix[::-1, :])
forward_context_representation = T.concatenate((forward_context_lstm_forward.h, forward_context_lstm_backward.h[::-1,:]), axis=1)
print 'forward_context_lstm_representation', forward_context_representation.eval({forward_context_input:syn_forward_context_input}).shape

# Concatenate representations and make a prediction about what action to take
concatenated_representation = T.concatenate((stack_lstm_representation[-1], forward_context_representation[-1]))
print 'concatenated_representation', concatenated_representation.eval({stack_input:syn_stack_input, forward_context_input:syn_forward_context_input}).shape
prediction = action_prediction_weights.fprop(concatenated_representation).mean() # .mean() is a hack to make it a scalar (since its only a single value, it shouldn't matter)
print 'predicton', prediction.eval({stack_input:syn_stack_input, forward_context_input:syn_forward_context_input})

# Compute squared-error loss between predicted action and gold action
action_prediction_loss = ((action_prediction - prediction) ** 2).mean()
print 'action_prediction_loss', action_prediction_loss.eval({stack_input:syn_stack_input, forward_context_input:syn_forward_context_input, action_prediction:syn_action_prediction})

# Get target input embeddings
target_embeddings = target_embedding.fprop(target_input)
print 'target_embeddings', target_embeddings.eval({target_input:syn_target_input}).shape

target_lstm.h_0 = stack_lstm_representation[-1]
print 'target_lstm_h0', target_lstm.h_0.eval({stack_input:syn_stack_input}).shape
repeated_src_context = T.repeat(stack_lstm_representation[-1].dimshuffle('x', 0), target_embeddings.shape[0], axis=0)
print 'repeated_src_context', repeated_src_context.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape
repeated_src_context = projection_weights.fprop(repeated_src_context)
print 'repeated_src_context', repeated_src_context.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape
target_embeddings = T.concatenate((target_embeddings, repeated_src_context), axis=1)
print 'target_embeddings', target_embeddings.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape
target_lstm.fprop(target_embeddings)
print 'target_lstm', target_lstm.h.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape

transition = target_lstm.h.dot(stack_lstm_representation.transpose())
transition = transition.dot(stack_lstm_representation)
print 'transition', transition.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape

transition_last = T.concatenate([transition, target_lstm.h], axis=1)
decoded_words = target_word_decoding_weights.fprop(transition_last)
print 'decoded_words', decoded_words.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape
decoding_loss = T.nnet.categorical_crossentropy(decoded_words, target_output).mean()
decoding_loss += beta * T.mean((target_lstm.h[:-1] ** 2 - target_lstm.h[1:] ** 2) ** 2) # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf

print 'decoding_loss', decoding_loss.eval({stack_input:syn_stack_input, target_input:syn_target_input, target_output:syn_target_output})

'''
# Appened source representation to input at every step
repeated_src_context = T.repeat(stack_lstm_representation[-1].dimshuffle('x', 0), target_embeddings.shape[0], axis=0)
#repeated_src_context = proj_layer2.link(repeated_src_context)
target_embeddings = T.concatenate((target_embeddings, repeated_src_context), axis=1)

# Get the decoded sentence by connecting Encoder & Decoder
connection = projection_weights.link(stack_lstm_representation[-1])
print 'connection', connection.eval({stack_input:syn_stack_input}).shape
target_lstm.h_0 = connection
target_lstm.link(target_embeddings)
print 'target_lstm', target_lstm.h.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape

transition = target_lstm.h.dot(stack_lstm_representation.transpose())
transition = transition.dot(stack_lstm_representation)
print 'transition', transition.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape

# Decode words
decoded_words = target_word_decoding_weights.link(target_lstm.h)
print 'decoded_words', decoded_words.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape

# Compute seq-seq loss
decoding_loss = T.nnet.categorical_crossentropy(decoded_words, target_output).mean()
'''

stack_embedding_matrix [[ 0.32862642 -0.50916559  0.2742027  ..., -0.89670622 -0.53259957
  -0.68092984]
 [ 0.97198701 -0.53723347 -0.80460501 ..., -0.81999165 -0.1520461  -0.139009  ]
 [-0.32630369 -0.38549203  0.49874866 ..., -0.73710901 -0.41236058
  -0.32732522]
 [-0.10473968 -0.67169613  0.55053246 ...,  0.66723001 -0.58391631
  -0.68664545]
 [-0.42100394  0.77031434 -0.56798548 ..., -0.0388076   0.36613047
   0.29591563]]
stack_lstm_representation (5, 1024)
forward_context_lstm_representation (6, 1024)
concatenated_representation (2048,)
predicton 0.499748587608
action_prediction_loss 0.249748647213
target_embeddings (7, 256)
target_lstm_h0 (1024,)
repeated_src_context (7, 1024)
repeated_src_context (7, 256)
target_embeddings (7, 512)
target_lstm (7, 1024)
transition (7, 1024)
decoded_words (7, 71180)
decoding_loss 11.2579612732


"\n# Appened source representation to input at every step\nrepeated_src_context = T.repeat(stack_lstm_representation[-1].dimshuffle('x', 0), target_embeddings.shape[0], axis=0)\n#repeated_src_context = proj_layer2.link(repeated_src_context)\ntarget_embeddings = T.concatenate((target_embeddings, repeated_src_context), axis=1)\n\n# Get the decoded sentence by connecting Encoder & Decoder\nconnection = projection_weights.link(stack_lstm_representation[-1])\nprint 'connection', connection.eval({stack_input:syn_stack_input}).shape\ntarget_lstm.h_0 = connection\ntarget_lstm.link(target_embeddings)\nprint 'target_lstm', target_lstm.h.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape\n\ntransition = target_lstm.h.dot(stack_lstm_representation.transpose())\ntransition = transition.dot(stack_lstm_representation)\nprint 'transition', transition.eval({stack_input:syn_stack_input, target_input:syn_target_input}).shape\n\n# Decode words\ndecoded_words = target_word_decoding_we

In [107]:
'''
Update parameters using ADAM
'''
updates_action = Optimizer(clip=5.0).adam(action_prediction_loss, action_params)

In [108]:
'''
Update parameters using ADAM
'''
updates_seq_seq = Optimizer(clip=5.0).adam(decoding_loss, seq_seq_params)

In [109]:
f_train_action = theano.function(
    inputs=[stack_input, forward_context_input, action_prediction],
    outputs=action_prediction_loss,
    updates=updates_action
)

In [123]:
f_train_seq_seq = theano.function(
    inputs=[stack_input, target_input, target_output],
    outputs=decoding_loss,
    updates=updates_seq_seq
)

In [111]:
f_get_action = theano.function(
    inputs=[stack_input, forward_context_input],
    outputs=prediction,
)

In [122]:
f_eval_seq_seq = theano.function(
    inputs=[stack_input, target_input],
    outputs=decoded_words,
)

In [None]:
for ind, sentence in enumerate(sentences):
    action_losses = []
    seq_seq_losses = []
    for state in sentence[2:]:
        word = state[0]
        action = state[1]
        stack_state = state[2].strip().split()
        forward_context = state[3].strip().split()
        translation = state[4].strip().split()
        stack_words = [source_word2ind['<ss>']] + [source_word2ind[word] for word in stack_state] + [source_word2ind['</se>']]
        forward_context_words = [source_word2ind['<fcs>']] + [source_word2ind[word] for word in forward_context][::-1] + [source_word2ind['</fcs>']]
        action = 1.0 if action == 'T' else 0.0
        translation_words = [target_word2ind['<s>']] + [target_word2ind[word] for word in translation] + [target_word2ind['</s>']]
        action_loss = f_train_action(stack_words, forward_context_words, action)
        seq_seq_loss = f_train_seq_seq(stack_words, translation_words[:-1], translation_words[1:])
        action_losses.append(action_loss)
        seq_seq_losses.append(seq_seq_loss)
    print 'Sentence %d out of %d action loss %f ' % (ind, len(sentences), np.mean(action_losses))
    print 'Sentence %d out of %d seq-seq loss %f ' % (ind, len(sentences), np.mean(seq_seq_losses))

Sentence 0 out of 155362 action loss 0.105259 
Sentence 0 out of 155362 seq-seq loss 9.160011 
Sentence 1 out of 155362 action loss 0.238035 

In [115]:
f_get_action(stack_words, forward_context_words)

array(0.7847856879234314, dtype=float32)

In [117]:
stack_state

[u'the']

In [119]:
forward_context

[u'European',
 u'Parliament',
 u',',
 u'but',
 u'only',
 u'29',
 u'%',
 u'trust',
 u'the',
 u'French',
 u'parliament',
 u';',
 u'while',
 u'the',
 u'corresponding',
 u'figures',
 u'for',
 u'Germany',
 u'are',
 u'51',
 u'%',
 u'and',
 u'23',
 u'%',
 u'.']

In [57]:
forward_context_words

[61703,
 636,
 33199,
 28762,
 26617,
 19920,
 42227,
 61024,
 30863,
 52182,
 10935,
 32756,
 46977,
 27599,
 23117,
 774,
 59432,
 41971,
 23990,
 61705]

In [151]:
print translation.strip().split()

AttributeError: 'list' object has no attribute 'strip'

In [25]:
translation_words[1:]

[71181, 71179]

3