In [5]:
import string
# read in data, remove punctuation and make all lower case
def read_words(filename):
    words = []
    translator = str.maketrans('', '', string.punctuation)
    scount = 0
    longest = 0
    with open(filename, encoding="utf8") as f:
        for s in f:
            scount += 1
            clean_s = s.translate(translator).lower()
            words.append(clean_s.split())
            # keep track of longest sentence
            if len(s) > longest:
                 longest = len(s)
    return words, longest

In [6]:
# create dict that returns the index of onehot encoding for a word (and other way around)
# also create a frequency dict + set size, usuable for negative sampling
import numpy as np

def get_onehot_dicts(corpus):
    # create one set of all unique words
    flat_corpus = [w for s in corpus for w in s]
    corpus_set = set(flat_corpus)
    w_to_i = {}
    i_to_w = {}
    w_freq = []
    num_words = len(corpus_set)
    for i, w in enumerate(corpus_set):
        # all indices + 1 to use zero padding later
        w_to_i[w] = i + 1
        i_to_w[i + 1] = w
        freq = flat_corpus.count(w)**0.75
        w_freq.append([i, freq])
    return w_to_i, i_to_w, np.array(w_freq), num_words+1

In [7]:
# get top words
# def get_onehot_dicts_sampling(corpus):
#     flat_corpus = [w for s in corpus for w in s]
#     corpus_set = set(flat_corpus)
#     w_to_i = {}
#     i_to_w = {}
#     w_freq = []

#     for i, w in enumerate(corpus_set):
#         freq = flat_corpus.count(w)**0.75
#         w_freq.append([i, freq])
    
#     sorted_frequencies = w_freq.sort(key=lambda x: x[1])
#     smaller_corpus = [property_a[i] for i in good_indices]

In [8]:
#corpus = read_words('wa/test.en')
l1_corpus, longest1 = read_words('wa/test.en')
l2_corpus, longest2 = read_words('wa/test.fr')
print('corpus created')
import time
start_time = time.time()
w_to_i1, i_to_w1, w_freq1, num_words1 = get_onehot_dicts(l1_corpus)
w_to_i2, i_to_w2, w_freq2, num_words2 = get_onehot_dicts(l2_corpus)
print(time.time()-start_time)

corpus created
0.32686305046081543


In [9]:
# save w_to_i and i_to_w to files
import pickle

with open('w2i_en_embedalign.pkl', 'wb') as f:
    pickle.dump(w_to_i1, f)
    
with open('i2w_en_embedalign.pkl', 'wb') as f:
    pickle.dump(i_to_w1, f)
    
with open('w2i_fr_embedalign.pkl', 'wb') as f:
    pickle.dump(w_to_i2, f)
    
with open('i2w_fr_embedalign.pkl', 'wb') as f:
    pickle.dump(i_to_w2, f)

In [10]:
# transform corpus to lists of ints
import torch

# convert to indexes and pad
l1_corpus_i = torch.LongTensor([[0] * (longest1-len(sentence)) + [w_to_i1[word] for word in sentence] for sentence in l1_corpus])
l2_corpus_i = torch.LongTensor([[0] *(longest2-len(sentence)) + [w_to_i2[word] for word in sentence] for sentence in l2_corpus])


In [21]:
import torch.nn as nn
import torch.distributions as dist
import torch.utils.data
from torch.utils.data import sampler
from torch.distributions import kl
class embed_align(nn.Module):
    def __init__(self, vocab_size1, vocab_size2, emb_dimension):
        super(embed_align, self).__init__()
        self.vocab_size1 = vocab_size1
        self.vocab_size2 = vocab_size2
        self.emb_dimension = emb_dimension

        self.embedding = nn.Embedding(vocab_size1, emb_dimension, padding_idx = 0)
        self.BiLSTM = nn.LSTM(emb_dimension, emb_dimension, bidirectional=True, batch_first=True)
        
        self.affine1_mu = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_mu = nn.Linear(emb_dimension, emb_dimension)
        
        self.affine1_sig = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_sig = nn.Linear(emb_dimension, emb_dimension)
        
        self.affine1_L1 = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_L1 = nn.Linear(emb_dimension, vocab_size1)
        self.affine1_L2 = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_L2 = nn.Linear(emb_dimension, vocab_size2)
        
        self.relu = nn.ReLU()
        self.softplus = nn.Softplus()
        self.log_softmax = nn.Softmax(dim=0)
        
    def forward(self, sentence1, sentence2, use_cuda=False):
        # sentence1 & sentence2 are (batches of) list of all ints in a sentence
        # encoder
        sen1_emb = self.embedding(sentence1)
        if len(sen1_emb.shape) == 2: # not a batch
            sen1_emb = sen1_emb.unsqueeze(0)
        h, _ = self.BiLSTM(sen1_emb)
        h1, h2 = torch.split(h, split_size_or_sections=self.emb_dimension, dim =2)
        h = h1 + h2
        mu = self.affine2_mu(self.relu(self.affine1_mu(h)))
        sig = self.relu(self.affine2_sig(self.relu(self.affine1_sig(h))))
        
        sample_norm = dist.multivariate_normal.MultivariateNormal(torch.zeros(self.emb_dimension), torch.eye(self.emb_dimension))
        e = sample_norm.sample()
        if use_cuda:
            z = mu + e.cuda() * sig
        else:
            z = mu + e * sig
    
        # likelihood language 1
        dist_1 = self.log_softmax(self.affine2_L1(self.relu(self.affine1_L1(z))))
        # sum over batch
        sum_1 = torch.sum(dist_1, dim=0)
        likelihood_1 = torch.mean(sum_1, dim=1)
        total_likelihood1 = 0
        sen_len = 0
        for i, likelihood in enumerate(likelihood_1):
            # no batches:
            if len(sentence1) == longest1:
                if sentence1[i].item() == 0:
                    continue
                total_likelihood1 += likelihood
                sen_len +=1
            else:
                for j in range(len(sentence1)):
                    if sentence1[j][i].item() == 0:
                        continue
                    total_likelihood1 += likelihood
                sen_len +=1
        likelihood1 = total_likelihood1/sen_len
        
        # likelihood language 2
        dist_2 = self.log_softmax(self.affine2_L2(self.relu(self.affine1_L2(z))))
        sum_2 = torch.sum(dist_2, dim=0)
        likelihood_2 = torch.mean(sum_2, dim=1)
        total_likelihood2 = 0
        sen_len = 0
        for i, likelihood in enumerate(likelihood_2):
            # no batches:
            if len(sentence1) == longest1:
                if sentence1[i].item() == 0:
                    continue
                total_likelihood2 += likelihood
                sen_len +=1
            else:
                for j in range(len(sentence1)):
                    if sentence1[j][i].item() == 0:
                        continue
                    total_likelihood2 += likelihood
                sen_len +=1
        likelihood2 = total_likelihood2/sen_len
        
        # KL
        # to prevent log returning infinity
        sig = sig+1e-8
        KL =  -0.5 * torch.sum(1 + torch.log(sig) - mu.pow(2) - sig)
        return - ((likelihood1 + likelihood2) - KL)
        
        

In [32]:
import time
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

start_time = time.time()

use_cuda = False#torch.cuda.is_available()

ea_model = embed_align(num_words1, num_words2, 300)

if use_cuda:
    ea_model.cuda()
optimizer = torch.optim.Adam(ea_model.parameters(), lr=0.01)
ea_model.train()
loss_progress = []
iter_time = time.time()

total_data = torch.utils.data.TensorDataset(l1_corpus_i, l2_corpus_i)
dataloader = DataLoader(total_data, batch_size=10)
for i, batch in enumerate(dataloader):
    batch_l1 = batch[0]
    batch_l2 = batch[1]
    optimizer.zero_grad()

    if use_cuda:    
        loss = ea_model.forward(batch_l1.cuda(), batch_l2.cuda(), use_cuda=True)
    else:
        loss = ea_model.forward(batch_l1, batch_l2)
    loss.backward()
    optimizer.step()
    if i % 10 == 0:
        print("at batch ", i)
        print(time.time()-iter_time)
        print(loss)
        iter_time = time.time()

at batch  0
0.31984686851501465
tensor(1.00000e+06 *
       2.6598, device='cuda:0')
at batch  10
3.5031001567840576
tensor(60003.9844, device='cuda:0')
at batch  20
3.6315455436706543
tensor(33596.2930, device='cuda:0')
at batch  30
3.697462558746338
tensor(16170.1621, device='cuda:0')
at batch  40
3.676900625228882
tensor(15591.5479, device='cuda:0')


In [12]:
import torch
ea_model = embed_align(num_words1, num_words2, 200)
b = ea_model.forward(l1_corpus_i, l2_corpus_i)
print(b)

tensor(7.1000e+07)
