In [3]:
import string
# read in data, remove punctuation and make all lower case
def read_words(filename, encoding=None):
    """Read a text file into a list of tokenised, lower-cased sentences.

    Every line of the file is treated as one sentence: the characters in
    ``string.punctuation`` (ASCII punctuation only) are deleted, the text is
    lower-cased and then split on whitespace.  A blank line yields an empty
    token list, so the result has exactly one entry per line of the file.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to ``open``.  ``None`` (the default)
            keeps the platform default, i.e. the previous behaviour.

    Returns:
        list[list[str]]: one list of tokens per line, in file order.
    """
    translator = str.maketrans('', '', string.punctuation)
    with open(filename, encoding=encoding) as f:
        return [line.translate(translator).lower().split() for line in f]
        

In [4]:
# create dicts that map each word to its one-hot index (and the other way around)
# also create a frequency array + vocabulary size, usable for negative sampling
import numpy as np

def get_onehot_dicts(corpus):
    """Build vocabulary lookups and a smoothed frequency table for a corpus.

    Indices are assigned in order of first appearance in ``corpus``, so the
    mapping is the same on every run (iterating a ``set`` of strings is not
    stable across interpreter restarts).

    Args:
        corpus: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        tuple: ``(w_to_i, i_to_w, w_freq, num_words)`` where

        * ``w_to_i`` maps token -> index,
        * ``i_to_w`` maps index -> token,
        * ``w_freq`` is a float array of shape ``(num_words, 2)`` whose rows
          are ``[index, count ** 0.75]``,
        * ``num_words`` is the number of distinct tokens.
    """
    from collections import Counter

    # One pass over the corpus instead of calling list.count() once per word.
    counts = Counter(w for s in corpus for w in s)

    w_to_i = {}
    i_to_w = {}
    w_freq = []
    for i, (w, count) in enumerate(counts.items()):
        w_to_i[w] = i
        i_to_w[i] = w
        w_freq.append([i, count ** 0.75])
    num_words = len(w_to_i)

    # reshape keeps the (N, 2) layout even when the corpus is empty.
    return w_to_i, i_to_w, np.array(w_freq, dtype=float).reshape(-1, 2), num_words

In [5]:
# Read the English (.en) and French (.fr) dev files; read_words returns one
# list of lower-cased, punctuation-free tokens per line.
# (commented-out alternative input: corpus = read_words('wa/test.en'))
l1_corpus = read_words('wa/dev.en')
l2_corpus = read_words('wa/dev.fr')
print('corpus created')
# For each language build the word->index dict, the index->word dict, the
# frequency array and the vocabulary size.
w_to_i1, i_to_w1, w_freq1, num_words1 = get_onehot_dicts(l1_corpus)
w_to_i2, i_to_w2, w_freq2, num_words2 = get_onehot_dicts(l2_corpus)


corpus created


In [6]:
# save w_to_i and i_to_w to files
import pickle

# Write the word<->index lookups of both languages to pickle files.
# One loop over (file name, object) pairs replaces four copy-pasted stanzas.
for pkl_path, lookup in (
    ('w2i_en_embedalign.pkl', w_to_i1),
    ('i2w_en_embedalign.pkl', i_to_w1),
    ('w2i_fr_embedalign.pkl', w_to_i2),
    ('i2w_fr_embedalign.pkl', i_to_w2),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(lookup, f)

In [73]:
# transform corpus to lists of ints
import torch

# Replace every token by its vocabulary index, keeping the sentence structure.
l1_corpus_i = [[w_to_i1[tok] for tok in sent] for sent in l1_corpus]
l2_corpus_i = [[w_to_i2[tok] for tok in sent] for sent in l2_corpus]

# Both languages side by side: position 0 holds L1, position 1 holds L2.
total_corpus_i = [l1_corpus_i, l2_corpus_i]
# Left as nested Python lists: sentences differ in length, so they cannot be
# turned into one LongTensor.

In [98]:
import torch.nn as nn
import torch.distributions as dist
import torch.utils.data
from torch.utils.data import sampler
class embed_align(nn.Module):
    """Encoder/generator skeleton of an embed-align style model (work in progress).

    An L1 sentence is embedded and run through a bidirectional LSTM; the two
    directions are summed and mapped to a mean and a scale per position.  A
    latent vector is drawn per position with the reparameterisation
    ``z = mu + eps * sigma`` and fed to two small feed-forward heads that
    produce a categorical distribution over the L1 and the L2 vocabulary.

    Args:
        vocab_size1: Number of distinct L1 words (embedding rows, L1 head width).
        vocab_size2: Number of distinct L2 words (L2 head width).
        emb_dimension: Size of the embeddings, LSTM hidden state and latent z.
    """

    def __init__(self, vocab_size1, vocab_size2, emb_dimension):
        super().__init__()
        self.vocab_size1 = vocab_size1
        self.vocab_size2 = vocab_size2
        self.emb_dimension = emb_dimension

        # Encoder.
        self.embedding = nn.Embedding(vocab_size1, emb_dimension)
        self.BiLSTM = nn.LSTM(emb_dimension, emb_dimension, bidirectional=True)

        # Two-layer heads for the mean and the scale of z.
        self.affine1_mu = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_mu = nn.Linear(emb_dimension, emb_dimension)

        self.affine1_sig = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_sig = nn.Linear(emb_dimension, emb_dimension)

        # Two-layer heads mapping z to scores over each vocabulary.
        self.affine1_L1 = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_L1 = nn.Linear(emb_dimension, vocab_size1)
        self.affine1_L2 = nn.Linear(emb_dimension, emb_dimension)
        self.affine2_L2 = nn.Linear(emb_dimension, vocab_size2)

        self.relu = nn.ReLU()
        self.softplus = nn.Softplus()
        # Normalise over the vocabulary (last) axis; dim=0 would normalise
        # across sentence positions instead.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, sentence1, sentence2):
        """Encode ``sentence1``, sample z and sample words from both heads.

        Args:
            sentence1: Word indices of the L1 sentence.  Assumed to be a 1-D
                LongTensor of a single sentence (a batch axis of size 1 is
                inserted and removed again below) -- TODO confirm before
                adding real batching.
            sentence2: Word indices of the L2 sentence.  Not used yet.

        Returns:
            None.  The loss is not implemented yet, so nothing is returned.
        """
        # encoder
        sen1_emb = self.embedding(sentence1)
        # unsqueeze(1) inserts an axis of size 1 between positions and features.
        h, _ = self.BiLSTM(sen1_emb.unsqueeze(1))
        # Sum the forward and backward halves of the BiLSTM output.
        h1, h2 = torch.split(h, split_size_or_sections=self.emb_dimension, dim=2)
        h = h1 + h2
        mu = self.affine2_mu(self.relu(self.affine1_mu(h)))
        # Softplus keeps the scale strictly positive (ReLU can output exactly 0).
        sig = self.softplus(self.affine2_sig(self.relu(self.affine1_sig(h))))

        # Independent standard-normal noise for every position and dimension,
        # created with the same dtype/device as mu.
        e = torch.randn_like(mu)
        z = mu + e * sig
        # remove once generate part finished
        z = z.squeeze(1)

        # generate
        dist_1 = self.softmax(self.affine2_L1(self.relu(self.affine1_L1(z))))
        dist_2 = self.softmax(self.affine2_L2(self.relu(self.affine1_L2(z))))

        cat_dist_1 = dist.Categorical(dist_1)
        cat_dist_2 = dist.Categorical(dist_2)

        # Samples are drawn but not used yet.
        x = cat_dist_1.sample()
        y = cat_dist_2.sample()

        # TODO: compute loss

In [99]:
import torch
# Smoke test: run the first sentence pair through a freshly initialised model
# with embedding size 200.
ea_model = embed_align(num_words1, num_words2, 200)

# Build the index tensors in one step so i_1 / i_2 are never plain lists.
i_1 = torch.LongTensor([w_to_i1[w] for w in l1_corpus[0]])
i_2 = torch.LongTensor([w_to_i2[w] for w in l2_corpus[0]])

# Call the module itself rather than .forward() so registered hooks also run.
ea_model(i_1, i_2)
