## Загрузка данных

Флаг _-nc_ позволяет не скачивать файлы, если они уже есть. 

In [1]:
!wget -nc https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/train.lc.norm.tok.fr
!wget -nc https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/train.lc.norm.tok.en
!wget -nc https://s3.amazonaws.com/arrival/embeddings/wiki.multi.en.vec
!wget -nc https://s3.amazonaws.com/arrival/embeddings/wiki.multi.fr.vec

File ‘train.lc.norm.tok.fr’ already there; not retrieving.

File ‘train.lc.norm.tok.en’ already there; not retrieving.

File ‘wiki.multi.en.vec’ already there; not retrieving.

File ‘wiki.multi.fr.vec’ already there; not retrieving.



## Определение вспомогательных функций

In [2]:
import pickle
import numpy as np
import unicodedata
import string
import re
import torch
from torch import nn
from torch import optim
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print (device)

cuda


In [3]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

def read_sentences(path):
    lines = []
    with open(path) as f:
        for line in f:
            lines.append(normalizeString(line))
    return lines

In [4]:
class Vocabulary:
    """
    Tracks info about known words, their indices and frequences.
    """
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.dummies = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"]
        for i, name in enumerate(self.dummies):
            self.word2index[name] = i
            self.word2count[name] = 0
        self.index2word = self.dummies.copy()
        self.n_words = len(self.dummies)
   
    def add_list(self, list):
        for s in list:
            self.add_sentence(s)
    
    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_seq(self, seq):
        for word in seq:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:   
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word.append(word)
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [8]:
def seq_format(seq, max_words):
    nwords = len(seq)
    seq_new = seq + ["<EOS>"]
    seq_new += ["<PAD>" for i in range(max_words - nwords)]
    return seq_new

def in_lang(seq, lang):
    for w in seq:
        if not (w in lang.word2index):
            return False
    return True

def freq_words(seq, lang, freq):
    for w in seq:
        if lang.word2count[w] < freq:
            return False
    return True

# Leave only known words and given length
def prepare_list(list, lang, max_words, freq):
    list_seq = [s.split() for s in list]
    list_clean = []
    for s in list_seq:
        if in_lang(s, lang) and len(s) <= max_words and freq_words(s, lang, freq):
            list_clean.append(s)
        else:
            list_clean.append(None)
    
    list_format = []
    for s in list_clean:
        if s == None:
            list_format.append(s)
        else:
            list_format.append(seq_format(s, max_words))
            
    return list_format

In [32]:
def seq2vec(seq, lang):
    return [lang.word2index[w] for w in seq]

def words_list2tensor(list, lang):
    ind_list = [seq2vec(x, lang) for x in list]
    vec_list = [torch.LongTensor(s) for s in ind_list]
    return torch.stack(vec_list)

def ind_seq2word_seq(seq, lang):
    return [lang.index2word[v] for v in seq]

def vec_seq2word_seq(seq, lang):
    return [lang.vec2word(v) for v in seq]

## Модели

### Загрузка данных

In [5]:
fr_sents = read_sentences('train.lc.norm.tok.fr')
en_sents = read_sentences('train.lc.norm.tok.en')

In [6]:
fra = Vocabulary('fr')
eng = Vocabulary('en')

In [7]:
fra.add_list(fr_sents)
eng.add_list(en_sents)

In [10]:
freq = 5
max_len = 10

# Format sentences
fr_clean = prepare_list(fr_sents, fra, max_len, freq)
en_clean = prepare_list(en_sents, eng, max_len, freq)

fr_pair = []
en_pair = []
fr_nopair = []
en_nopair = []

for s_fr, s_en in zip(fr_clean, en_clean):
    if s_en == None and s_fr == None:
        continue
    if s_en == None:
        fr_nopair.append(s_fr)
    elif s_fr == None:
        en_nopair.append(s_en)
    else:
        fr_pair.append(s_fr) 
        en_pair.append(s_en)
        
print('Pairs left:', len(fr_pair))
print('French non-paired:', len(fr_nopair))
print('English non-paired:', len(en_nopair))
print('--> Garbage:', len(fr_clean) - len(fr_pair) - len(fr_nopair))

Pairs left: 3537
French non-paired: 937
English non-paired: 3052
--> Garbage: 24526


### Переводчик слово-в-слово с attention и софтмаксом на выходах

In [11]:
class ClassTranslator(nn.Module):
    def __init__(self, in_max_len, in_nlabels, out_max_len, out_nlabels, hidden_dim):
        super().__init__()
        self.emb_src = nn.Embedding(in_nlabels, hidden_dim)
        self.in_max_len = in_max_len
        self.out_max_len = out_max_len
        
        self.shrink = nn.Linear(self.emb_src.embedding_dim, hidden_dim)
        self.cont = nn.Linear(in_max_len, out_max_len)
#         self.select_cont = nn.Linear(hidden_dim, 1)
        self.reduce = nn.Linear(out_max_len, 1)
        self.attn_matr = nn.Linear(hidden_dim, in_max_len*out_max_len)
        self.non_lin = nn.ReLU()
        
        self.classifier = nn.Linear(hidden_dim, out_nlabels)
        
    def forward(self, in_sent):
        # Encode
        input = self.emb_src(in_sent)
        context = self.cont(input.transpose(1, 2))
        shrinked = self.non_lin(self.shrink(context.transpose(1, 2)))
        
        # Decode
#         selected = torch.softmax(self.select_cont(shrinked), dim=1)
#         attn = self.non_lin(self.attn_matr(shrinked))
#         masked = torch.bmm(selected.transpose(1, 2), attn)
#         out = torch.squeeze(masked).view(-1, self.out_max_len, self.in_max_len)
#         attn = torch.softmax(out, dim=2)
#         attn_applied = torch.bmm(attn, shrinked)

        reduced = self.non_lin(self.reduce(shrinked.transpose(1, 2)))
        attn = self.non_lin(self.attn_matr(reduced.transpose(1, 2)))
        out = attn.view(-1, self.out_max_len, self.in_max_len)
        attn = torch.softmax(out, dim=2)
        attn_applied = torch.bmm(attn, shrinked)
    
        out = self.classifier(attn_applied)
        out = torch.log_softmax(out, dim=2)
        
        return out, attn

In [12]:
batch_size = 100
n_epochs = 1000
criterion = torch.nn.NLLLoss()

X = words_list2tensor(fr_pair, fra)
Y = words_list2tensor(en_pair, eng)

n_samples, max_len = X.shape
hidden_dim = 5

LOAD = False

if LOAD:
    tr = torch.load('class_tr')
else:
    tr = ClassTranslator(max_len, len(fra.index2word),
                         max_len, len(eng.index2word),
                         hidden_dim)

optimizer = torch.optim.Adam(tr.parameters())
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, 
                                                 milestones=[], 
                                                 gamma=0.1)

if device.type == "cuda":
    X = X.cuda()
    Y = Y.cuda()
    tr = tr.cuda()

for epoch in range(n_epochs):
    inds = torch.randperm(n_samples).split(batch_size)
    for ind in inds:
        X_batch = X[ind]
        Y_batch = Y[ind]
        
        out, attn = tr(X_batch)
        
        loss = criterion(out.transpose(1, 2), Y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    scheduler.step(loss.data)
    if not epoch % 50: 
        torch.save(tr, 'class_tr')
        print(epoch, loss.data)
        print(ind_seq2word_seq(Y_batch[0].cpu(), eng))
        _, ind = out[0].cpu().topk(1, dim=1)
        print(ind_seq2word_seq(ind, eng))

  "type " + obj.__name__ + ". It won't be checked "


0 tensor(9.0293, device='cuda:0')
['a', 'soccer', 'player', 'is', 'kicking', 'the', 'ball', '.', '<EOS>', '<PAD>', '<PAD>']
['wearing', 'wearing', 'wearing', 'wearing', 'wearing', 'wearing', 'wearing', 'wearing', 'wearing', 'wearing', 'wearing']


KeyboardInterrupt: 

### Тупой переводчик: трёхслойная сеть

In [13]:
class StupidTranslator(nn.Module):
    def __init__(self, in_max_len, in_nlabels, out_max_len, out_nlabels, hidden_dim, tmp_len=5):
        super().__init__()
        self.emb_src = nn.Embedding(in_nlabels, hidden_dim)
        self.in_max_len = in_max_len
        self.out_max_len = out_max_len
        
        self.W = nn.Linear(in_max_len, tmp_len)
        self.U = nn.Linear(tmp_len, tmp_len)
        self.Q = nn.Linear(tmp_len, out_max_len)
        self.non_lin = nn.ReLU()
        
        self.classifier = nn.Linear(hidden_dim, out_nlabels)
        
    def forward(self, in_sent):
        # Encode
        input = self.emb_src(in_sent)
        
        out = self.non_lin(self.W(input.transpose(1, 2)))
        out = self.non_lin(self.U(out))
        out = self.non_lin(self.Q(out))
    
        out = self.classifier(out.transpose(1, 2))
        out = torch.log_softmax(out, dim=2)
        
        return out, attn

In [14]:
batch_size = 50
n_epochs = 1000
criterion = torch.nn.NLLLoss()

X = words_list2tensor(fr_pair, fra)
Y = words_list2tensor(en_pair, eng)

n_samples, max_len = X.shape
hidden_dim = 5

LOAD = False

if LOAD:
    tr = torch.load('stupid_tr')
else:
    tr = StupidTranslator(max_len, len(fra.index2word),
                         max_len, len(eng.index2word),
                         hidden_dim)

optimizer = torch.optim.Adam(tr.parameters())
pad_ind = eng.word2index['<PAD>']
if device.type == "cuda":
    X = X.cuda()
    Y = Y.cuda()
    tr = tr.cuda()

for epoch in range(n_epochs):
    inds = torch.randperm(n_samples).split(batch_size)
    for ind in inds:
        X_batch = X[ind]
        Y_batch = Y[ind]
        
        out, attn = tr(X_batch)
        
        out[Y_batch == pad_ind][pad_ind] = 0
        loss = criterion(out.transpose(1, 2), Y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if not epoch % 50: 
        torch.save(tr, 'stupid_tr')
        print(epoch, loss.data)
        print(ind_seq2word_seq(Y_batch[0].cpu(), eng))
        _, ind = out[0].cpu().topk(1, dim=1)
        print(ind_seq2word_seq(ind, eng))

  "type " + obj.__name__ + ". It won't be checked "


0 tensor(8.8732, device='cuda:0')
['a', 'girl', 'sits', 'on', 'a', 'swing', 'ride', '.', '<EOS>', '<PAD>', '<PAD>']
['.', 'rock', 'rock', 'rock', 'rock', '.', 'scrolled', 'rock', '.', 'rock', 'scrolled']


KeyboardInterrupt: 

### Переводчик вектор-в-вектор с натренированными заранее представлениями

In [20]:
def load_vec(emb_path, max_words=-1):
    vectors = []
    word2id = {}
    it = 0
    with open(emb_path) as f:
        # Skip first line
        next(f)
        for line in f:
            if max_words != -1 and it > max_words:
                break
            it += 1
            orig_word, vect = line.rstrip().split(' ', 1)
            
            word = normalizeString(orig_word)
            vect = np.fromstring(vect, sep=' ')
       
            # Words are sorted by frequency, no need to add less 
            # frequent version of the same word  
            if not (word in word2id):
                vectors.append(vect)
                word2id[word] = len(word2id)

    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

class Lang:
    def __init__(self, name, embedding_tuple):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.dummies = ["<SOS>", "<EOS>", "<PAD>"]
        for i, name in enumerate(self.dummies):
            self.word2index[name] = i
        self.index2word = self.dummies.copy()
        self.embedding_tuple = embedding_tuple    
        self.n_words = len(self.dummies) # Count SOS and EOS
        self.embedding = None # No initial embedding

    def add_list(self, list):
        for s in list:
            self.addSentence(s)
    
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:   
            if word in self.embedding_tuple[2]:
                self.word2index[word] = self.n_words
                self.word2count[word] = 1
                self.index2word.append(word)
                self.n_words += 1
        else:
            self.word2count[word] += 1
    
    def build_embedding(self):
        """
        Получаем матрицу слово -> вектор для всех слов, которые встретились в тексте.
        Вектор для начала предложения заменяем нулевым, 
        для конца предложения --- единичным (можно заменить на случайный вектор).
        """
        dim = self.embedding_tuple[0].shape[1]
        # Add 2 dims for start/end
        dlen = len(self.dummies)
        matrix = np.zeros((self.n_words, dim+dlen))        
        for i in range(dlen):
            matrix[i, dim+i] = 1

        for id, word in enumerate(self.index2word[dlen:]):
            id = id+dlen
            word_id = self.embedding_tuple[2][word]
            vector = self.embedding_tuple[0][word_id]
            matrix[id, :-dlen] = vector
        
        self.embedding = torch.Tensor(matrix)
        
    def word2vec(self, word):
        return self.embedding[self.word2index[word]]
    
    def vec2word(self, vec):
        tmp1 = self.embedding/torch.norm(self.embedding, dim=1, keepdim=True)
        tmp2 = vec/torch.norm(vec)
        dist = torch.mm(tmp1, tmp2[:, None])
        _, ind = torch.topk(dist, 1, dim=0)
        return self.index2word[ind]

In [15]:
class ContextTranslator(nn.Module):
    def __init__(self, emb_src, emb_tgt, in_max_len, out_max_len, hidden_dim, ncontext):
        super().__init__()
        self.emb_src = emb_src
        self.emb_tgt = emb_tgt
        self.in_max_len = in_max_len
        self.out_max_len = out_max_len
        
        self.shrink = nn.Linear(self.emb_src.embedding_dim, hidden_dim)
        self.cont = nn.Linear(in_max_len, ncontext)
        self.select_cont = nn.Linear(hidden_dim, 1)
        self.attn_matr = nn.Linear(hidden_dim, in_max_len*out_max_len)
        self.non_lin = nn.ReLU()
        
    def forward(self, input):
        context = self.cont(input.transpose(1, 2))
        
        shrinked = self.non_lin(self.shrink(context.transpose(1, 2)))
        
        selected = torch.softmax(self.select_cont(shrinked), dim=1)
        attn = self.non_lin(self.attn_matr(shrinked))
        
        masked = torch.bmm(selected.transpose(1, 2), attn)
        out = torch.squeeze(masked).view(-1, self.out_max_len, self.in_max_len)
        attn = torch.softmax(out, dim=2)
        attn_applied = torch.bmm(attn, input)
        
        return attn_applied, attn

In [25]:
en_embedding_tuple = load_vec('./wiki.multi.en.vec')
fr_embedding_tuple = load_vec('./wiki.multi.fr.vec')
fra = Lang('fr', fr_embedding_tuple)
eng = Lang('en', en_embedding_tuple)

In [26]:
fra.add_list(fr_sents)
fra.build_embedding()

eng.add_list(en_sents)
eng.build_embedding()

In [27]:
freq = 5
max_len = 10

# Format sentences
fr_clean = prepare_list(fr_sents, fra, max_len, freq)
en_clean = prepare_list(en_sents, eng, max_len, freq)

fr_pair = []
en_pair = []
fr_nopair = []
en_nopair = []
for s_fr, s_en in zip(fr_clean, en_clean):
    if s_en == None and s_fr == None:
        continue
    if s_en == None:
        fr_nopair.append(s_fr)
    elif s_fr == None:
        en_nopair.append(s_en)
    else:
        fr_pair.append(s_fr) 
        en_pair.append(s_en)

In [36]:
batch_size = 50
n_epochs = 1000
criterion = torch.nn.MSELoss()

X = words_list2tensor(fr_pair, fra)
Y = words_list2tensor(en_pair, eng)

n_samples, max_len = X.shape
hidden_dim = 5
ncontext = 5

en_emb = torch.nn.Embedding.from_pretrained(eng.embedding)
fr_emb = torch.nn.Embedding.from_pretrained(fra.embedding)

LOAD = False

if LOAD:
    tr = torch.load('cont_tr')
else:
    tr = ContextTranslator(fr_emb, en_emb, max_len, max_len, hidden_dim, ncontext)

optimizer = torch.optim.Adam(tr.parameters())

if device.type == "cuda":
    en_emb = en_emb.cuda()
    fr_emb = fr_emb.cuda()
    X = X.cuda()
    Y = Y.cuda()
    tr = tr.cuda()

for epoch in range(n_epochs):
    inds = torch.randperm(n_samples).split(batch_size)
    for ind in inds:
        X_batch = fr_emb(X[ind])
        Y_batch = en_emb(Y[ind])
        
        out, attn = tr(X_batch)
        loss = criterion(out, Y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if not epoch % 50: 
        torch.save(tr, 'cont_tr')
        print(epoch, loss.data)
        print(vec_seq2word_seq(Y_batch[0].cpu(), eng))
        print(vec_seq2word_seq(out[0].cpu(), eng))

  "type " + obj.__name__ + ". It won't be checked "


0 tensor(0.0025, device='cuda:0')
['a', 'man', 'rollerblading', 'on', 'a', 'metal', 'bar', '.', '<EOS>', '<PAD>', '<PAD>']
['a', 'a', 'a', 'a', 'roller', 'a', 'a', 'a', 'and', 'a', 'and']
50 tensor(0.0022, device='cuda:0')
['two', 'men', 'with', 'backpacks', 'wait', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['two', 'men', 'and', 'and', 'and', 'and', 'and', 'and', '<EOS>', '<PAD>', '<PAD>']
100 tensor(0.0022, device='cuda:0')
['two', 'girls', 'are', 'walking', 'along', 'the', 'street', 'and', 'talking', '.', '<EOS>']
['two', 'girls', 'and', 'the', 'the', 'and', 'and', 'and', '.', '<EOS>', '<PAD>']
150 tensor(0.0021, device='cuda:0')
['a', 'little', 'boy', 'takes', 'a', 'picture', 'at', 'the', 'park', '.', '<EOS>']
['a', 'small', 'girl', 'takes', 'a', 'photograph', 'the', 'the', '.', '<EOS>', '<PAD>']
200 tensor(0.0021, device='cuda:0')
['a', 'man', 'at', 'a', 'food', 'cart', 'is', 'serving', 'corn', '.', '<EOS>']
['a', 'man', 'a', 'a', 'the', 'the', 'and', 'but', 'but', '<EOS>',

KeyboardInterrupt: 