In [1]:
import numpy as np
import string
import re
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from collections import defaultdict
import torch.utils.data
import time
import math
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

NameError: name 'defaultdict' is not defined

In [None]:
# Global parameters, grouped by the stage of the notebook that reads them.

# -- data --------------------------------------------------------------------
en_file = "wa/dev.en"        # English side of the parallel corpus
fr_file = "wa/dev.fr"        # French side of the parallel corpus
vocab_size_limit = 10000     # keep at most this many of the most frequent words
max_sentence_len = 50        # sentences are truncated to this many tokens

# -- model -------------------------------------------------------------------
embedding_dims = 128
hidden_dim = 100

# -- optimisation ------------------------------------------------------------
batch_size = 37
num_epochs = 30
learning_rate = 0.01

In [None]:
# Preprocess the corpus: read both sides of the parallel data, strip every
# character that is not an ASCII letter or a space, lower-case the tokens and
# mask stop words with "<unk>".
allow = string.ascii_letters + " "


def _read_word_list(path):
    """Return the whitespace-separated tokens stored in the file at ``path``."""
    with open(path) as handle:
        return handle.read().split()


def _load_corpus(path, stop_words):
    """Read one side of the corpus.

    Args:
        path: text file with one sentence per line.
        stop_words: iterable of lower-case words to mask.

    Returns:
        A ``(corpus, sentences)`` pair of equally long lists. ``corpus[i]`` is
        the cleaned, lower-cased token list of line ``i``; ``sentences[i]`` is
        the same list with every stop word replaced by ``"<unk>"``.
    """
    disallowed = re.compile('[^%s]' % allow)
    # A set gives O(1) membership tests; the stop-word files are loaded as
    # lists, which would be scanned linearly for every single token.
    stop_set = set(stop_words)

    corpus = []
    sentences = []
    with open(path) as handle:
        for line in handle:
            # The newline is not in `allow`, so it is removed here as well.
            words = [token.lower() for token in disallowed.sub('', line).split()]
            corpus.append(words)
            sentences.append(["<unk>" if token in stop_set else token for token in words])
    return corpus, sentences


# stopwords files are from nltk https://www.nltk.org/data.html
en_stop_words = _read_word_list("wa/stopwords.en")
fr_stop_words = _read_word_list("wa/stopwords.fr")

# Both languages go through exactly the same pipeline.
en_corpus, en_sentences = _load_corpus(en_file, en_stop_words)
fr_corpus, fr_sentences = _load_corpus(fr_file, fr_stop_words)

In [None]:
# Build the vocabulary of each language: count how often every non-stop-word
# token occurs, then keep only the `vocab_size_limit` most frequent ones.


def _count_words(corpus, stop_words):
    """Return a mapping word -> number of occurrences, ignoring stop words."""
    counts = defaultdict(int)
    for sentence in corpus:
        for token in sentence:
            if token in stop_words:
                continue
            counts[token] += 1
    return counts


def _most_frequent(counts, limit):
    """Return the set of (at most) ``limit`` words with the highest counts."""
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return {token for token, _ in ranked[:limit]}


en_vocab = _count_words(en_corpus, en_stop_words)
fr_vocab = _count_words(fr_corpus, fr_stop_words)

# number of distinct words before the size limit is applied
print(len(en_vocab))
print(len(fr_vocab))

en_vocab = _most_frequent(en_vocab, vocab_size_limit)
fr_vocab = _most_frequent(fr_vocab, vocab_size_limit)

# special tokens: '<unk>' stands in for masked words, '<pad>' fills up
# sentences that are shorter than the longest one
for vocab in (en_vocab, fr_vocab):
    vocab.add('<unk>')
for vocab in (en_vocab, fr_vocab):
    vocab.add('<pad>')



In [None]:
# Create word2index / index2word and convert the sentences to index lists.

# Sets of strings iterate in an order that changes from one interpreter run to
# the next (hash randomisation), so sort first to get reproducible indices.
en_words = sorted(en_vocab)
fr_words = sorted(fr_vocab)

# word to index and vice versa
en_word2idx = {w: idx for (idx, w) in enumerate(en_words)}
en_idx2word = dict(enumerate(en_words))

fr_word2idx = {w: idx for (idx, w) in enumerate(fr_words)}
fr_idx2word = dict(enumerate(fr_words))

en_vocab_size = len(en_vocab)
fr_vocab_size = len(fr_vocab)  # was len(en_vocab): copy-paste slip

# Only stop words were replaced by '<unk>' during preprocessing; words dropped
# by the vocabulary size limit are still present in the sentences, so fall
# back to the '<unk>' index instead of raising KeyError for them.
en_unk = en_word2idx['<unk>']
fr_unk = fr_word2idx['<unk>']

# Change the words in the sentence by the vocabulary index
# (sentences are truncated to max_sentence_len tokens)
en_sentence_indexs = []
fr_sentence_indexs = []
for en, fr in zip(en_sentences, fr_sentences):
    en_sentence_indexs.append([en_word2idx.get(word, en_unk) for word in en[:max_sentence_len]])
    fr_sentence_indexs.append([fr_word2idx.get(word, fr_unk) for word in fr[:max_sentence_len]])


In [None]:

# Index of the padding token in each vocabulary
en_pad = en_word2idx['<pad>']
fr_pad = fr_word2idx['<pad>']

# Length of every (already truncated) sentence, as LongTensors on the GPU
seq_lengths = torch.tensor(list(map(len, en_sentence_indexs)), dtype=torch.long).cuda()
fr_seq_lengths = torch.tensor(list(map(len, fr_sentence_indexs)), dtype=torch.long).cuda()

# pad sequences
def pad_sequences(sentence_indexs, seq_lengths, pad, device=None):
    """Pack variable-length index lists into one right-padded LongTensor.

    Args:
        sentence_indexs: list of lists of word indices.
        seq_lengths: 1-D integer tensor; ``seq_lengths[i]`` must equal
            ``len(sentence_indexs[i])``.
        pad: index written to every position past the end of a sentence.
        device: device of the result. Defaults to the device of
            ``seq_lengths`` (previously the result was always moved to CUDA,
            which made the function unusable on CPU-only machines).

    Returns:
        LongTensor of shape ``(len(sentence_indexs), max(seq_lengths))``;
        the second dimension is 0 when there are no sentences.
    """
    if device is None:
        device = seq_lengths.device

    # Bring the lengths to the host once instead of synchronising with the
    # device for every sentence inside the loop.
    lengths = seq_lengths.tolist()
    max_len = max(lengths) if lengths else 0

    seq_tensor = torch.full((len(sentence_indexs), max_len), pad,
                            dtype=torch.long, device=device)
    for idx, (seq, seqlen) in enumerate(zip(sentence_indexs, lengths)):
        if seqlen:  # nothing to copy for an empty sentence
            seq_tensor[idx, :seqlen] = torch.tensor(seq, dtype=torch.long, device=device)
    return seq_tensor

# Padded index matrices for the English and French sides of the corpus
seq_tensor = pad_sequences(
    sentence_indexs=en_sentence_indexs,
    seq_lengths=seq_lengths,
    pad=en_pad,
)
fr_seq_tensor = pad_sequences(
    sentence_indexs=fr_sentence_indexs,
    seq_lengths=fr_seq_lengths,
    pad=fr_pad,
)

def sort_batch(en, fr, lengths ,fr_lengths):
    """Reorder a batch so that the source lengths are in descending order.

    NOTE: an identical ``sort_batch`` is defined again in a later cell and
    replaces this one when the notebook is run top to bottom.

    Args:
        en: source tensor, indexed along its first dimension.
        fr: target tensor, indexed along its first dimension.
        lengths: 1-D tensor of source lengths; determines the new order.
        fr_lengths: 1-D tensor of target lengths.

    Returns:
        ``(en, fr, lengths, fr_lengths)`` with the same permutation applied
        to all four tensors.
    """
    sorted_lengths, order = torch.sort(lengths, dim=0, descending=True)
    return en[order], fr[order], sorted_lengths, fr_lengths[order]


In [None]:
class PaddedTensorDataset(torch.utils.data.Dataset):
    """Dataset that serves aligned rows of four tensors sharing their first dimension.

    Item ``i`` is the tuple
    ``(data_tensor[i], target_tensor[i], length_tensor[i], length_target[i])``.
    """

    def __init__(self, data_tensor, target_tensor, length_tensor, length_target):
        # All four tensors must hold the same number of rows.
        expected_rows = data_tensor.size(0)
        assert all(
            other.size(0) == expected_rows
            for other in (target_tensor, length_tensor, length_target)
        )
        self.data_tensor, self.target_tensor = data_tensor, target_tensor
        self.length_tensor, self.length_target = length_tensor, length_target

    def __len__(self):
        """Number of rows in the dataset."""
        return self.data_tensor.size(0)

    def __getitem__(self, index):
        """Return row ``index`` of every tensor as a 4-tuple."""
        columns = (self.data_tensor, self.target_tensor, self.length_tensor, self.length_target)
        return tuple(column[index] for column in columns)

In [None]:
# Sorts the tensors of a PaddedTensorDataset by source length.
# The name is historical: it was first applied to every batch separately,
# although sorting the whole dataset once is the better option.
def sort_batch(en, fr, lengths ,fr_lengths):
    """Apply the descending-length ordering of ``lengths`` to all four tensors.

    Returns ``(en, fr, lengths, fr_lengths)``, each permuted so that
    ``lengths`` is in descending order.
    """
    by_length = lengths.sort(dim=0, descending=True)
    seq_lengths, perm_idx = by_length[0], by_length[1]
    seq_tensor, targ_tensor, fr_len = (t[perm_idx] for t in (en, fr, fr_lengths))
    return seq_tensor, targ_tensor, seq_lengths, fr_len

In [None]:
# Wrap the padded tensors in a Dataset and batch them with a DataLoader.
# Note that `dataset` is the DataLoader, i.e. iterating over it yields batches.
padded_pairs = PaddedTensorDataset(seq_tensor, fr_seq_tensor, seq_lengths, fr_seq_lengths)
dataset = torch.utils.data.DataLoader(padded_pairs, batch_size=batch_size)

In [None]:
# Embed align model, not really needed for this assignment
# But some code could prove useful?

class embed_align(nn.Module):
    """BiLSTM encoder producing a Gaussian latent per token plus two word predictors.

    Args:
        vocab_size: size of the source vocabulary (embedding table and ``N1`` output).
        embed_size: dimensionality of the word embeddings.
        hidden_size: LSTM / latent size. Defaults to the module-level
            ``hidden_dim`` so existing two-argument calls keep working.
        target_vocab_size: output size of ``N2``. Defaults to ``vocab_size``
            (the previous behaviour).
    """

    def __init__(self, vocab_size, embed_size, hidden_size=None, target_vocab_size=None):
        super(embed_align, self).__init__()
        if hidden_size is None:
            hidden_size = hidden_dim
        if target_vocab_size is None:
            target_vocab_size = vocab_size

        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.M1 = nn.Linear(hidden_size, hidden_size)   # -> mean of the latent
        self.M2 = nn.Linear(hidden_size, hidden_size)   # -> pre-softplus scale of the latent
        self.N1 = nn.Linear(hidden_size, vocab_size)
        self.N2 = nn.Linear(hidden_size, target_vocab_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.softplus = nn.Softplus()

    def forward(self, x, lengths):
        """Encode a padded batch.

        Args:
            x: LongTensor of word indices, ``(batch, seq)``, rows ordered by
                descending length (``pack_padded_sequence`` is called with its
                default sorted-input requirement).
            lengths: true length of every row of ``x``.

        Returns:
            ``(z, e, f, u, s)``: the sampled latent, the ``N1`` and ``N2``
            logits, and the mean / scale of the latent. All keep the full
            padded sequence length of ``x`` in their second dimension.
        """
        embeddings = self.embeddings(x)

        # Recent PyTorch versions require the lengths to live on the CPU.
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()

        packed_input = pack_padded_sequence(embeddings, lengths, batch_first=True)
        packed_output, _ = self.lstm(packed_input)
        # total_length keeps the output as long as the padded input. Without it
        # the output is trimmed to the longest sentence of the batch and no
        # longer lines up with the padded targets.
        lstm_out, _ = pad_packed_sequence(packed_output, batch_first=True,
                                          total_length=x.size(1))

        # Sum the forward and backward halves of the BiLSTM output.
        hidden = lstm_out.view(lstm_out.size(0), lstm_out.size(1), 2, -1).sum(2)

        u = self.M1(hidden)
        s = self.softplus(self.M2(hidden))

        # rsample() draws with the reparameterisation trick, so gradients of
        # the word-prediction losses reach M1/M2 and the encoder; sample()
        # would cut the graph at z.
        z = torch.distributions.normal.Normal(u, s).rsample()

        e = self.N1(z)
        f = self.N2(z)

        return (z, e, f, u, s)
        
         

In [None]:
# Set up the model, the two cross-entropy criteria and the optimiser.
ea = embed_align(en_vocab_size, embedding_dims)
ea = ea.cuda()  # move the parameters to the GPU before the optimiser sees them

# mean-reduced and sum-reduced variants of the same criterion
loss_function_avg = nn.CrossEntropyLoss().cuda()
loss_function_sum = nn.CrossEntropyLoss(size_average=False).cuda()

optimizer = torch.optim.Adam(params=ea.parameters(), lr=learning_rate)

In [None]:
# not useful for this project, just here to give context
def get_loss(e1 ,en1,  f1 , fr1 , u1, s1):
    """Loss to minimise for one batch: cross-entropy terms plus KL divergence.

    All arguments are iterated in parallel along their first (batch) dimension.

    Args:
        e1: source-word logits, one ``(positions, vocab)`` block per sentence.
        en1: source word indices, one ``(positions,)`` row per sentence.
        f1: target-word logits, one ``(positions, vocab)`` block per sentence.
        fr1: target word indices, one row per sentence.
        u1: means of the latent Gaussians.
        s1: scales (standard deviations) of the latent Gaussians.

    Returns:
        Scalar tensor with the summed loss of the batch (``0`` for an empty batch).

    NOTE(review): padding positions of both languages are included in the
    loss because no lengths are passed in.
    """
    loss = []
    for e, en, f, fr, u, s in zip(e1, en1, f1, fr1, u1, s1):
        # source side: summed cross-entropy over all positions
        e_loss = loss_function_sum(e, en)

        # target side: for every target word, the cross-entropy of predicting
        # it from each source position, averaged over those positions
        e_len = len(e)
        f_losses = [
            # 1-D target with one entry per source position; the former shape
            # (e_len, 0) produced an empty tensor
            loss_function_avg(f, en.new_full((e_len,), int(fr_value)))
            for fr_value in fr
        ]
        f_loss = sum(f_losses)

        # KL(q(z|x) || N(0, I)). `s` is the scale handed to Normal(u, s) when
        # z was sampled, so the same parameterisation is used here. The prior
        # mean is zero (it used to be ones). Works for 1-D and 2-D u / s alike.
        posterior = torch.distributions.normal.Normal(u, s)
        prior = torch.distributions.normal.Normal(torch.zeros_like(u), torch.ones_like(s))
        kl_loss = torch.distributions.kl.kl_divergence(posterior, prior).sum()

        # Cross-entropy already is a negative log-likelihood, so all three
        # terms are added; negating them made the optimiser maximise them.
        loss.append(e_loss + f_loss + kl_loss)
    return sum(loss)

In [None]:
# Training loop, not necessarily useful for this project.

epoch_losses = []  # mean batch loss of every epoch, as plain floats

iterations = math.ceil(len(en_sentences) / batch_size)


for epoch in range(0, num_epochs):
    start = time.time()
    epoch_loss = 0.0
    # enumerate() supplies the batch counter for the progress line; it used to
    # be reset to 0 for every batch and never incremented.
    for batch, (en_batch, fr_batch, lengths, fr_lengths) in enumerate(dataset, start=1):

        # the LSTM packing needs each batch ordered by descending source length
        en, fr, lengths, fr_lengths = sort_batch(en_batch, fr_batch, lengths, fr_lengths)
        batch_start = time.time()

        z, e, f, u, s = ea(en, lengths)
        loss = get_loss(e, en, f, fr, u, s)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() yields a Python float. Accumulating the tensor itself keeps
        # the autograd graph of every batch alive until the end of the epoch.
        epoch_loss += loss.item()

        batch_time = time.time() - batch_start
        print('\r[Epoch {:03d}/{:03d}] Batch {:06d}/{:06d} [{:.1f}/s] '.format(epoch+1, num_epochs, batch, iterations, batch_time), end='')

    epoch_loss /= iterations
    #score = evaluator.lst(net.embeddings.weight.data)
    print('Time: {:.1f}s Loss: {:.3f} LST: {:.6f}'.format(time.time() - start, epoch_loss, 0))

    epoch_losses.append(epoch_loss)
        