In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import torch.nn as nn

from copy import deepcopy

In [2]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)
np.random.seed(0)

In [3]:
bob_train = open('bobsue-data/bobsue.seq2seq.train.tsv').read().lower().split('\n')[:-1]
bob_test = open('bobsue-data/bobsue.seq2seq.test.tsv').read().lower().split('\n')[:-1]
bob_dev = open('bobsue-data/bobsue.seq2seq.dev.tsv').read().lower().split('\n')[:-1]
voc = list(set(open('bobsue-data/bobsue.voc.txt').read().lower().split('\n')[:-1]))

In [4]:
# Set the training set
x_train = []; y_train = []
for p in bob_train:
    pair = p.split('\t')
    x_train.append(pair[0]); y_train.append(pair[1])

In [5]:
print("The size of the vocabulary dictionary is: {}".format(len(voc)))

The size of the vocabulary dictionary is: 1442


# Try the randomized initialization

In [6]:
# Impelent the uniform distribution assignment
torch.manual_seed(2)
voc_w2n = {}
num_rep = torch.FloatTensor(len(voc), 200).uniform_(-2, 2)
for i in range(len(num_rep)):
    voc_w2n[voc[i]] = num_rep[i]

x_train_num = []; y_train_num = []
for i in range(len(x_train)):
    x_sen = [voc_w2n[word] for word in x_train[i].split()]
    y_sen = [voc_w2n[word] for word in y_train[i].split()]
    x_train_num.append(x_sen); y_train_num.append(y_sen)

Comment: Here I tried different range of uniform distributions to initialize the random word embedding. I thought when the wider the range, the more distinguishable between different words, and the computation can avoid some numerical issues computing very small derivatives. However, this does not seem to help much with the termical outcomes. And I finally ended up using the (-2, 2) range.

In [7]:
class MyLSTM(nn.Module):
  
    def __init__(self, n_in, n_out):
        super(MyLSTM, self).__init__()
        self.n_in = n_in
        self.n_out = n_out
        self.fc_enco = nn.Linear(n_in+n_out, n_out)
        self.ic_enco = nn.Linear(n_in+n_out, n_out)
        self.oc_enco = nn.Linear(n_in+n_out, n_out)
        self.gc_enco = nn.Linear(n_in+n_out, n_out)
        self.fc_deco = nn.Linear(n_in+n_out, n_out)
        self.ic_deco = nn.Linear(n_in+n_out, n_out)
        self.oc_deco = nn.Linear(n_in+n_out, n_out)
        self.gc_deco = nn.Linear(n_in+n_out, n_out)

    def forward(self, c_old, h_old, x, mode):
        tensor = torch.cat([x, h_old])
        if mode=='encode':
            f = torch.sigmoid(self.fc_enco(tensor))
            i = torch.sigmoid(self.ic_enco(tensor))
            o = torch.sigmoid(self.oc_enco(tensor))
            g = torch.tanh(self.gc_enco(tensor))
            c = f*c_old + i*g
            h = o*torch.tanh(c)
        if mode=='decode':
            f = torch.sigmoid(self.fc_deco(tensor))
            i = torch.sigmoid(self.ic_deco(tensor))
            o = torch.sigmoid(self.oc_deco(tensor))
            g = torch.tanh(self.gc_deco(tensor))
            c = f*c_old + i*g
            h = o*torch.tanh(c)
        return c, h

In [8]:
sen_length = [len(sen) for sen in y_train_num]
print("The maximal length of all sentences is: {}".format(max(sen_length)))

The maximal length of all sentences is: 21


In [9]:
# I used the L2-Norm to compute the distance between words.
def dist(h, y):
    return torch.mean((h-y)**2)

def compute_loss(h_lst, y_sen):
    l_pred = len(h_lst); l_true = len(y_sen)
    dim = len(y_sen[0])
    if l_pred>=l_true:
        # Here I multiplied at the end by (l_pred/l_true)**2, which is a penalty for failure to detect sentence end.
        loss = sum([dist(h_lst[i], y_sen[i]) for i in range(l_true)])/l_true*((l_pred/l_true)**2)
    else:
        # When the predicted sentences are short than the ground true sentence, I made up the missing words by [0, ..., 0]
        h_lst = h_lst + [2*torch.ones(y_sen[0].shape)]*(l_true-l_pred)
        loss = sum([dist(h_lst[i], y_sen[i]) for i in range(l_true)])/l_true
    return loss


def find_closest(num_rep, h):
    dists = ((num_rep-h)**2).sum(1)
    return dists.min(0)[1]

I chose to use the L2-norm loss as the softmax loss is way too computationally intensive for my computer.

In [56]:
# Train the model with the training data set
hparams = {
    'learning_rate': 5,
    'epochs': 10,
}

model_lst = []
model = MyLSTM(200, 200)
opt = optim.SGD(model.parameters(), lr=hparams['learning_rate'], momentum=0.8, weight_decay=0.6)
epoch_losses = []
torch.manual_seed(1234)
np.random.seed(25400)
for i in range(hparams['epochs']):
    losses = []
    for j in np.random.choice(len(x_train_num), 1000):
        x_sen = x_train_num[j]
        y_sen = y_train_num[j][1:]
        c_old = torch.zeros(200)
        h_old = torch.zeros(200)
        for m in range(len(x_sen)):
            x = x_sen[m]
            c_old, h_old = model(c_old, h_old, x, 'encode')
        x = voc_w2n['<s>']; word = '<s>'
        h_lst = []; count = 0
        while count<22 and word!='</s>':
            c_old, h_old = model(c_old, h_old, x, 'decode')
            h_lst.append(h_old)
            idx = find_closest(num_rep, h_old)
            x = num_rep[idx]; word = voc[idx]
            count +=1
        loss = compute_loss(h_lst, y_sen)
        if j%200==0: opt.param_groups[0]['lr']=50 #make big jumps occasionally
        opt.zero_grad()
        loss.backward()
        opt.step()
        if j%200==0: opt.param_groups[0]['lr']=hparams['learning_rate']
        losses.append(loss)
    loss_mean = sum(losses)/len(losses)
    epoch_losses.append(loss_mean)
    model_lst.append(deepcopy(model))
    print('Epoch {:4} | Loss {:.3f}'.format(i, loss_mean))
    print("-----------------------------------------------")

Epoch    0 | Loss 7.786
-----------------------------------------------
Epoch    1 | Loss 7.913
-----------------------------------------------
Epoch    2 | Loss 7.729
-----------------------------------------------
Epoch    3 | Loss 8.024
-----------------------------------------------
Epoch    4 | Loss 8.082
-----------------------------------------------
Epoch    5 | Loss 8.018
-----------------------------------------------
Epoch    6 | Loss 7.725
-----------------------------------------------
Epoch    7 | Loss 7.937
-----------------------------------------------
Epoch    8 | Loss 7.655
-----------------------------------------------
Epoch    9 | Loss 8.027
-----------------------------------------------


I tried to implement more epochs, but it seems the RAM is running out of memory. But clearly, this does not work, the loss is not decreasing when averaging each epoch. (I inspected the first few initial losses associated with the first few sentence pairs, they are decreasing but stops going down very soon).

### Some commentes on tuning parameters:

Sorry I did not take down all the results of attempts tuning up the parameters. But I would like to discuss about how the performance varied with different hyperparametric settings:
* I tried various numbers of epochs. I guess it might be better to feed the model with a large enough training set size.
* One of the problems I ran into is that the algorithm will quick converge to some sequence of words. Then I was told that by inflating the learning parameter we that add more momentum to the gradient descent algorithm. However, my final output does not work at all, so I guess we better not keep the learning rate constantly smalle or large. The optimal strategy might be makine big jumps occasionally, so that we neither get stuck at local minimum nor miss the global minimum all the time. I then tried this strategy, as shown in my codes, and the situations seems to have get better, but still needs more attempts reach success.

## Predict the sentence

In [51]:
# Set the testing set as random generated representations
x_test = []; y_test = []
for p in bob_test:
    pair = p.split('\t')
    x_test.append(pair[0]); y_test.append(pair[1])

x_test_num = []; y_test_num = []
for i in range(len(x_test)):
    x_sen = [voc_w2n[word] for word in x_test[i].split()]
    y_sen = [voc_w2n[word] for word in y_test[i].split()]
    x_test_num.append(x_sen); y_test_num.append(y_sen)

In [66]:
def compute_accuracy(word_lst, y_sen):
    return np.mean([z[0]==z[1] for z in zip(word_lst, y_sen)])

model_trained = model_lst[6]
# Predict
accuracy_lst = []; pred_sen = []
for i in range(len(x_test)):
    x_sen = x_test_num[i]
    y_sen = y_test_num[i][1:]
    c_old = torch.zeros(200)
    h_old = torch.zeros(200)
    for m in range(len(x_sen)):
        x = x_sen[m]
        c_old, h_old = model_trained(c_old, h_old, x, 'encode')
    x = voc_w2n['<s>']; word = '<s>'
    word_lst = [word]; h_lst = []; count = 0
    while count<22 and word!='</s>':
        c_old, h_old = model_trained(c_old, h_old, x, 'decode')
        h_lst.append(h_old)
        idx = find_closest(num_rep, h_old)
        x = num_rep[idx]
        word = voc[idx]; word_lst.append(word)
        count +=1
    accuracy = compute_accuracy(word_lst, y_test[i].split())
    accuracy_lst.append(accuracy); pred_sen.append(" ".join(word_lst))

In [67]:
print("The predicted sentences")
pred_sen[-10:]

The predicted sentences


['<s> they line swimming hand line swimming seeing line swimming hand line swimming tape line swimming hand line swimming tape line swimming hand',
 '<s> they line swimming hand line swimming seeing line swimming hand line swimming tape line swimming hand line swimming tape line swimming hand',
 '<s> they line swimming hand line swimming seeing line swimming hand line swimming tape line swimming hand line swimming tape line swimming hand',
 '<s> they line swimming hand line swimming seeing line swimming hand line swimming tape line swimming hand line swimming tape line swimming hand',
 '<s> they line swimming hand line swimming seeing line swimming hand line swimming tape line swimming hand line swimming tape line swimming hand',
 '<s> they line swimming hand line swimming seeing line swimming hand line swimming tape line swimming hand line swimming tape line swimming hand',
 '<s> they line swimming hand line swimming seeing line swimming hand line swimming tape line swimming hand line

In [68]:
print("The ground true sentences")
y_test[-10:]

The ground true sentences


["<s> bob 's dad had his own basketball card when he was a professional . </s>",
 "<s> the picture didn 't look right . </s>",
 '<s> she was tired and decided to sleep . </s>',
 '<s> she bought bob a brand new bike ! </s>',
 '<s> bob really loved his job ! </s>',
 '<s> his grades got better . </s>',
 '<s> he loved it so much ! </s>',
 '<s> she then asked if other people thought this , they said no . </s>',
 '<s> the same day his wife sue bought a new chair for the living room . </s>',
 '<s> he wanted to start dating . </s>']

In [69]:
print("The average accuracy rate is: {}".format(np.mean(accuracy_lst)))

The average accuracy rate is: 0.09336658136611697


Clearly the LSTM fails to predict any valid sentence, and the weak accuracy rate stems from the first $<s>$ tag.

# Try the precomputed initialization

In [10]:
voc_dict = {}
with open('bobsue-data/glove.6B.200d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        vect = np.array(line[1:]).astype(np.float)
        voc_dict[word] = vect

In [11]:
voc_w2n_precom = {}
num_rep_precom =[]
for word in voc:
    try:
        v = voc_dict[word].tolist()
        voc_w2n_precom[word] = torch.FloatTensor(v)
        num_rep_precom.append(v)
    except: # some of the words in our vocabulary are not documented in the glove dictionary
        v = np.random.uniform(1,-1,200)
        voc_w2n_precom[word] = torch.FloatTensor(v)
        num_rep_precom.append(v)

In [12]:
num_rep_precom = torch.FloatTensor(num_rep_precom)

In [13]:
# Get the new representation of the training set when the pre-computed dictionary.
x_train_num_precom = []; y_train_num_precom = []
for i in range(len(x_train)):
    x_sen = [voc_w2n_precom[word] for word in x_train[i].split()]
    y_sen = [voc_w2n_precom[word] for word in y_train[i].split()]
    x_train_num_precom.append(x_sen); y_train_num_precom.append(y_sen)

In [28]:
# Train the model with the training data set
hparams = {
    'learning_rate': 0.1,
    'epochs': 30,
}

model_lst_precom = []
model = MyLSTM(200, 200)
opt = optim.SGD(model.parameters(), lr=hparams['learning_rate'], momentum=0.8, weight_decay=0.01)
epoch_losses = []
torch.manual_seed(1234)
np.random.seed(1234)
for i in range(hparams['epochs']):
    losses = []
    for j in np.random.choice(len(x_train_num), 1000):
        x_sen = x_train_num_precom[j]
        y_sen = y_train_num_precom[j][1:]
        c_old = torch.zeros(200)
        h_old = torch.zeros(200)
        for m in range(len(x_sen)):
            x = x_sen[m]
            c_old, h_old = model(c_old, h_old, x, 'encode')
        x = voc_w2n_precom['<s>']; word = '<s>'
        h_lst = []; count = 0
        while count<22 and word!='</s>':
            c_old, h_old = model(c_old, h_old, x, 'decode')
            h_lst.append(h_old)
            idx = find_closest(num_rep_precom, h_old)
            x = num_rep_precom[idx]; word = voc[idx]
            count +=1
        loss = compute_loss(h_lst, y_sen)*100
        if j%100==0: opt.param_groups[0]['lr']=50
        opt.zero_grad()
        loss.backward(retain_graph=True)
        opt.step()
        if j%100==0: opt.param_groups[0]['lr']=hparams['learning_rate']
        losses.append(loss)
    loss_mean = sum(losses)/len(losses)
    epoch_losses.append(loss_mean)
    model_lst_precom.append(deepcopy(model))
    print('Epoch {:4} | Loss {:.3f}'.format(i, loss_mean))
    print("-----------------------------------------------")

Epoch    0 | Loss 192.772
-----------------------------------------------
Epoch    1 | Loss 223.981
-----------------------------------------------
Epoch    2 | Loss 133.779
-----------------------------------------------
Epoch    3 | Loss 219.012
-----------------------------------------------
Epoch    4 | Loss 221.925
-----------------------------------------------


KeyboardInterrupt: 

I interrupted the training as the loss is picking up.

In [29]:
x_test_num_precom = []; y_test_num_precom = []
for i in range(len(x_test)):
    x_sen = [voc_w2n_precom[word] for word in x_test[i].split()]
    y_sen = [voc_w2n_precom[word] for word in y_test[i].split()]
    x_test_num_precom.append(x_sen); y_test_num_precom.append(y_sen)

In [45]:
model_trained_precom = model_lst_precom[2]
# Predict
losses = []; pred_sen = []
for i in range(len(x_test)):
    x_sen = x_test_num_precom[i]
    y_sen = y_test_num_precom[i][1:]
    c_old = torch.zeros(200)
    h_old = torch.zeros(200)
    for m in range(len(x_sen)):
        x = x_sen[m]
        c_old, h_old = model_trained_precom(c_old, h_old, x, 'encode')
    x = voc_w2n['<s>']; word = '<s>'
    word_lst = [word]; h_lst = []; count = 0
    while count<22 and word!='</s>':
        c_old, h_old = model_trained_precom(c_old, h_old, x, 'decode')
        h_lst.append(h_old)
        idx = find_closest(num_rep_precom, h_old)
        x = num_rep_precom[idx]
        word = voc[idx]; word_lst.append(word)
        count +=1
    loss = compute_loss(h_lst, y_sen)*100
    losses.append(loss); pred_sen.append(" ".join(word_lst))

In [46]:
print("The predicted sentences")
pred_sen[-10:]

The predicted sentences


['<s> unfortunately addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition',
 '<s> unfortunately addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition',
 '<s> unfortunately addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition',
 '<s> unfortunately addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition',
 '<s> unfortunately addition addition addition addition addition addition addition addition addition addition addition addition addition addition addition a

In [32]:
print("The ground true sentences")
y_test[-10:]

The ground true sentences


["<s> bob 's dad had his own basketball card when he was a professional . </s>",
 "<s> the picture didn 't look right . </s>",
 '<s> she was tired and decided to sleep . </s>',
 '<s> she bought bob a brand new bike ! </s>',
 '<s> bob really loved his job ! </s>',
 '<s> his grades got better . </s>',
 '<s> he loved it so much ! </s>',
 '<s> she then asked if other people thought this , they said no . </s>',
 '<s> the same day his wife sue bought a new chair for the living room . </s>',
 '<s> he wanted to start dating . </s>']

Clearly, using the pre-computed word embedding fails again.

# Some last comments:

All my model settings have failed no matter how I tune the learning rate and the number of epochs. And here are some other possible solutions I feel might work but did not try yet:
* Change the archeture of MyLSTM class. Now I am chaining the encoder and decoder together, which forces the two sets of weights will be updated at the same learning rate when using the pyTorch optimizer. A better strategy might be to learn these parameters at different rates, as I checked the model weights and found the weight magnitudes at the encoder were significantly smaller than the decoder.
* Another approach I have tried but did not have time to finish is to set the word embedding to be associated with a (1442,) vector, where the i'th word in the voc.txt takes the value of 1 at its associated vector. The benefits of doing so include that we can apply the softmax loss more easily, and each word outputs are easy to understand. The following is the model I experimented with this word representation, and with the softmax loss function. The training loss is declining but it is likely this approach will take forever for me to finish.

<img src="https://github.com/KenChenCompEcon/hw08/blob/master/training_loss.png?raw=true">