In [36]:
import theano
import theano.tensor as T
import numpy as np
import scipy.io as sio
import sys

In [37]:
sys.path.append('SanDeepLearn/')

In [38]:
import recurrent
from layer import EmbeddingLayer
from recurrent import RNN, LSTM
from utils import get_weights, get_bias
from optimizers import Optimizer
reload(recurrent)

<module 'recurrent' from 'SanDeepLearn/recurrent.pyc'>

In [39]:
path_to_data = 'Data/dev-test-train.de-en'

In [40]:
lines = [line.strip().split('|||') for line in open(path_to_data, 'r')]

In [41]:
german_lines, english_lines = [x[0].strip() for x in lines], [x[1].strip() for x in lines]

In [42]:
german_vocab = {}
inv_german_vocab = {}
vocab_ind = 0
for line in german_lines:
    line = line.split()
    for word in line:
        if word not in german_vocab:
            german_vocab[word] = vocab_ind
            inv_german_vocab[vocab_ind] = word
            vocab_ind += 1
german_vocab['<UNK>'] = vocab_ind
inv_german_vocab[vocab_ind] = '<UNK>'
german_vocab['<STOP>'] = vocab_ind + 1
inv_german_vocab[vocab_ind + 1] = '<STOP>'

In [43]:
english_vocab = {}
inv_english_vocab = {}
vocab_ind = 0
for line in english_lines:
    line = line.split()
    for word in line:
        if word not in english_vocab:
            english_vocab[word] = vocab_ind
            inv_english_vocab[vocab_ind] = word
            vocab_ind += 1
english_vocab['<UNK>'] = vocab_ind

In [71]:
enc_inp = T.ivector()
dec_inp = T.ivector()
expected_output = T.imatrix()
lr = 0.001

In [72]:
english_embedding = EmbeddingLayer(len(english_vocab), 100) # Randomly initialized word embeddings for english

In [73]:
german_embedding = EmbeddingLayer(len(german_vocab), 100) # Randomly initialized word embeddings for german

In [74]:
encoder_rnn = LSTM(100, 100, return_type='last') # Encoder RNN that returns the final activation for a sequence of english words (no start or stop)
decoder_rnn = LSTM(100, 100, return_type='all') # Decoder RNN that returns the activation at each step while decoding (includes a <STOP>)

In [75]:
'''
Initializes the weights that are needed to project the decoder activation at each step onto the vocabulary size
'''
input_dim = decoder_rnn.output_dim 
output_dim = len(german_vocab)
low = -4 * np.sqrt(6. / (input_dim + output_dim))
high = 4 * np.sqrt(6. / (input_dim + output_dim))
softmax_project_weights = get_weights(low=low, high=high, shape=(input_dim, output_dim), name='decoder_softmax_projection_weights')
softmax_project_bias = get_bias(output_dim, name='decoder_softmax_projection_bias')

In [76]:
source_embeddings = english_embedding.fprop(enc_inp[::-1]) # Transform sequence of indices into a matrix for the source sentence

In [77]:
encoder_rnn_output = encoder_rnn.fprop(source_embeddings) # Propogate the embedding matrix through the encoder RNN

In [78]:
target_embeddings = T.vertical_stack((encoder_rnn_output.reshape((1, -1))), german_embedding.fprop(dec_inp[:-1])) # Creates the embedding matrix to propogate through the decoder RNN need to reshape the encoder RNN activation into a matrix for Theano vertical stack :/

In [79]:
decoder_outputs = decoder_rnn.fprop(target_embeddings) # Propogate the target embedding matrix through the decoder RNN

In [80]:
decoder_predictions = T.nnet.softmax(T.dot(decoder_outputs, softmax_project_weights) + softmax_project_bias) # Project activations onto the target vocabury space

In [81]:
decoder_sentence = T.argmax(decoder_predictions, axis=1) # Get the sentence that corresponds to this projection by greedily picking max at each decoding step

In [82]:
loss = T.nnet.categorical_crossentropy(decoder_predictions, expected_output).mean() # Set the loss function to be the cross entropy between softmax output and expected output (expected output is one step ahead of the decoding at every step)

In [83]:
params = [softmax_project_weights, softmax_project_bias] + encoder_rnn.params + decoder_rnn.params + english_embedding.params + german_embedding.params # Set the model parameters

In [84]:
'''
Update parameters using SGD
'''
updates = Optimizer().sgd(
                loss,
                params,
                lr=lr
        )

In [85]:
'''
def lm_helper(decoder_input):
    decoder_rnn_embedding = decoder_rnn.fprop(decoder_input)
    decoder_rnn_prediction = T.argmax(T.nnet.softmax(T.dot(decoder_rnn_embedding, softmax_project_weights))).flatten()
    return T.concatenate((decoder_input, german_embedding.fprop(decoder_rnn_prediction)), axis=0), theano.scan_module.until(T.eq(decoder_rnn_prediction[0], german_vocab['<STOP>']))
'''

"\ndef lm_helper(decoder_input):\n    decoder_rnn_embedding = decoder_rnn.fprop(decoder_input)\n    decoder_rnn_prediction = T.argmax(T.nnet.softmax(T.dot(decoder_rnn_embedding, softmax_project_weights))).flatten()\n    return T.concatenate((decoder_input, german_embedding.fprop(decoder_rnn_prediction)), axis=0), theano.scan_module.until(T.eq(decoder_rnn_prediction[0], german_vocab['<STOP>']))\n"

In [86]:
'''
results, _ = theano.scan(
    fn=lm_helper,
    outputs_info=[decoder_input], 
    n_steps=100
)
'''

'\nresults, _ = theano.scan(\n    fn=lm_helper,\n    outputs_info=[decoder_input], \n    n_steps=100\n)\n'

In [87]:
#results_2, _ = theano.scan(
#    fn=lambda x: (x ** 2, theano.scan_module.until(T.eq(x, 10))),
#    outputs_info=[2],
#    n_steps=100
#)

In [88]:
#f_temp = theano.function(
#    inputs=[],
#    outputs=results_2
#)

In [89]:
# Gets the encoder representation of the sentence from the computational graph
f_encoder = theano.function(
    inputs=[enc_inp],
    outputs=encoder_rnn_output
)

In [90]:
# Gets the decoder softmax output 
f_decoder = theano.function(
    inputs=[enc_inp, dec_inp],
    outputs=decoder_predictions
)

In [91]:
# Trains the sequence-sequence model
f_train = theano.function(
    inputs=[enc_inp, dec_inp, expected_output],
    outputs=loss,
    updates=updates
)

In [92]:
# Gets the indices of the words in the decoded sentence
f_decoded_sentence = theano.function(
    inputs=[enc_inp, dec_inp],
    outputs = decoder_sentence
)

In [93]:
#english_sentence = english_lines[0].split()
#test_english_sentence = [english_vocab[word] for word in english_sentence]

In [94]:
#german_sentence = german_lines[0].split()
#german_sentence.append('<STOP>')
#test_german_sentence = [german_vocab[word] for word in german_sentence]
#true_german_sentence = np.vstack([np.zeros(len(german_vocab)) for word in german_sentence]).astype(np.int32)
#for ind, word in enumerate(german_sentence):
#    true_german_sentence[ind][german_vocab[word]] = 1

In [95]:
#f_decoder(test_english_sentence, test_german_sentence).shape

In [96]:
#print ' '.join([inv_german_vocab[word] for word in f_decoded_sentence(test_english_sentence, test_german_sentence)])
#f_train(test_english_sentence, test_german_sentence, true_german_sentence)
#print ' '.join([inv_german_vocab[word] for word in f_decoded_sentence(test_english_sentence, test_german_sentence)])

In [99]:
for i in range(10):
    for ind, (english_sentence, german_sentence) in enumerate(zip(english_lines, german_lines)):
        english_sentence = [english_vocab[word] for word in english_sentence.split()]
        german_sentence = [german_vocab[word] for word in german_sentence.split()]
        german_sentence.append(german_vocab['<STOP>'])
        true_german_sentence = np.vstack([np.zeros(len(german_vocab)) for word in german_sentence]).astype(np.int32)
        for ind, word in enumerate(german_sentence):
            true_german_sentence[ind][word] = 1
        loss = f_train(english_sentence, german_sentence, true_german_sentence)
        if ind % 10 == 0:
            decoded_sentence = [inv_german_vocab[word] for word in f_decoded_sentence(english_sentence, german_sentence)]
            print '========================================================================================='
            print 'Decoded sentence : %s ' % (' '.join(decoded_sentence))
            print 'Actual sentence : %s ' % (' '.join([inv_german_vocab[word] for word in german_sentence]))
            print 'Epoch : %d loss : %f' % (i, loss)
            print '========================================================================================='

Decoded sentence : Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme 
Actual sentence : ( Das Parlament erhebt sich zu einer Schweigeminute . ) <STOP> 
Epoch : 0 loss : nan
Decoded sentence : Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wiederaufnahme Wie

KeyboardInterrupt: 

In [100]:
inv_german_vocab[84952]

KeyError: 84952

In [28]:
encoder_representation = f_encoder(test_english_sentence)

In [29]:
decoder_prediciton = f_decoder(test_english_sentence, test_german_sentence)

In [30]:
decoder_prediciton.shape

(4, 84953)

In [79]:
print [inv_german_vocab[x] for x in decoder_prediciton], german_sentence

['-konsum', 'Migrantenfamilien', 'Darbietungen', 'Sch\xc3\xa4rfung'] ['Wiederaufnahme', 'der', 'Sitzungsperiode']


In [32]:
inv_german_vocab[104734]

KeyError: 104734

In [89]:
inv_german_vocab[decoder_prediciton_2[0]]

'Aktionspl\xc3\xa4ne'

In [90]:
inv_german_vocab[decoder_prediciton[0]]

'OCP-LP'

In [67]:
decoder_rnn_prediction

Flatten{1}.0

In [110]:
a = np.ones(3)

In [112]:
b = np.ones((3,3))

In [115]:
np.vstack((a, b))

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [36]:
T.repeat(T.vector(), 2).ndim

1

In [39]:
np.repeat([1,2], 2, axis=-1)

array([1, 1, 2, 2])

In [41]:
np.tile([1,2], (2,1))

array([[1, 2],
       [1, 2]])

In [45]:
a = np.ones(3)

In [47]:
np.reshape(a, (1,3))

array([[ 1.,  1.,  1.]])

In [50]:
x = np.random.rand(12)

In [53]:
x.reshape(1, -1).shape

(1, 12)