In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import numpy as np
import nltk


from collections import Counter

class Field:
    def __init__(self, tokens, pad_token, init_token=None, eos_token=None):
        # Data: 2D strings list
        self.data = tokens
        self.pad_token = pad_token
        self.init_token = init_token
        self.eos_token = eos_token
        self.vocab = None
        self.lookup = None
        self.reverse_lookup = None
        self.tensor = None
        self.max_sent_length = max([len(x) for x in self.data])
    
    def build_vocab(self, min_freq: int):
        # MIN_FREQ not actually needed
        buffer = list()
        for i in range(len(self.data)):
            for k in range(len(self.data[i])):
                buffer.append(self.data[i][k])
        self.vocab = sorted(list(set(buffer)))
        self.lookup = {value: index for index, value in enumerate(self.vocab)}
        self.reverse_lookup = {index: value for index, value in enumerate(self.vocab)}
    
    def build_tensor(self, pad_to:int=None):
        if pad_to is None:
            pad_to = self.max_sent_length
        self.tensor = torch.zeros(len(self.data), pad_to, dtype=int)
        for i in range(len(self.data)):
            for j in range(len(self.data[i])):
                self.tensor[i, j] = self.lookup.get(self.data[i][j])
        return self.tensor


class Iterator:
    def __init__(self, src: Field, trg: Field, batch_size=200, pad_to=100):
        if len(src.data) != len(trg.data):
            raise "Shit!"
        self.src_tensor = src.build_tensor(pad_to=pad_to)
        self.trg_tensor = trg.build_tensor(pad_to=pad_to)
        self.batch_size = batch_size
        self.batch_counter = 0
        self.batch_amount = len(src.data) // self.batch_size
    
    def iterate(self):
        if self.batch_counter < self.batch_amount:
            self.batch_counter += 1
            return self.batch_counter, (
                self.src_tensor[self.batch_size * (self.batch_counter-1):self.batch_counter * self.batch_size,
                                :],
                self.trg_tensor[self.batch_size * (self.batch_counter-1):self.batch_counter * self.batch_size,
                                :])
        else:
            return -1, (None, None)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * np.sqrt(self.d_model)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class MyTransformer(nn.Module):
    def __init__(self, 
                 d_model: int = 128, 
                 nhead: int = 8, 
                 num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6, 
                 dim_feedforward: int = 128, 
                 dropout: float = 0.2,
                 activation: str = "relu", 
                 source_vocab_length: int = 60000, 
                 target_vocab_length: int = 60000) -> None:
        super(MyTransformer, self).__init__()
        self.source_embedding = nn.Embedding(source_vocab_length, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
        self.target_embedding = nn.Embedding(target_vocab_length, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
        self.out = nn.Linear(d_model, target_vocab_length)
        self._reset_parameters()
        self.d_model = d_model
        self.nhead = nhead

    def forward(self, 
                src: torch.Tensor, 
                tgt: torch.Tensor, 
                src_mask: torch.Tensor = None, 
                tgt_mask: torch.Tensor = None,
                memory_mask: torch.Tensor = None, 
                src_key_padding_mask: torch.Tensor = None,
                tgt_key_padding_mask: torch.Tensor = None, 
                memory_key_padding_mask: torch.Tensor = None):
        if src.size(1) != tgt.size(1):
            raise RuntimeError("the batch number of src and tgt must be equal")
        src = self.source_embedding(src)
        src = self.pos_encoder(src)
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        tgt = self.target_embedding(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        output = self.out(output)
        return output

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

## Read data, build dataset

In [42]:
en_list, ru_list = list(), list()
len_dataset = 800 # 1000000

with open('1mcorpus/corpus.en_ru.1m.en', 'r') as f:
    en_list = [nltk.tokenize.word_tokenize(x, language='english') for x in f.readlines()[:len_dataset]]
with open('1mcorpus/corpus.en_ru.1m.ru', 'r') as f:
    ru_list = [nltk.tokenize.word_tokenize(x, language='russian') for x in f.readlines()[:len_dataset]]


BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
BATCH_SIZE = 40

SRC = Field(ru_list, pad_token=BLANK_WORD)
TGT = Field(en_list, init_token=BOS_WORD, eos_token=EOS_WORD, pad_token=BLANK_WORD)

MIN_FREQ = 2

SRC.build_vocab(MIN_FREQ)
TGT.build_vocab(MIN_FREQ)

source_vocab_length = len(SRC.vocab)
target_vocab_length = len(TGT.vocab)
print(f'We got {source_vocab_length} in source, {target_vocab_length} in target')

train_iter = Iterator(SRC, TGT, BATCH_SIZE, pad_to=max([SRC.max_sent_length, TGT.max_sent_length]))
val_iter = Iterator(SRC, TGT, 1, pad_to=max([SRC.max_sent_length, TGT.max_sent_length]))

model = MyTransformer(source_vocab_length=source_vocab_length, target_vocab_length=target_vocab_length)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-6)

model = model.to(device)

We got 7513 in source, 5317 in target


## Train & validate

In [43]:
def train(train_iter, val_iter, model, optim, num_epochs, use_gpu=True):
    train_losses = []
    valid_losses = []
    print("Let's go!")
    for epoch in range(num_epochs):
        train_loss = 0
        valid_loss = 0
        train_iter.batch_counter=0
        val_iter.batch_counter=0
        # Train model
        model.train()
        for i in range(train_iter.batch_amount):
            cnt, batch = train_iter.iterate()
            # print(cnt, '/', train_iter.batch_amount, batch[0].shape, batch[1].shape)
            src = batch[0]
            trg = batch[1]
            # change to shape (bs , max_seq_len)
            src = src.transpose(0, 1)
            # change to shape (bs , max_seq_len)
            trg = trg.transpose(0, 1)
            
            trg_input = trg[:, :]
            targets = trg[:, :].contiguous().view(-1)
            size = trg_input.size(1)
            np_mask = torch.triu(torch.ones(size, size) == 1).transpose(0, 1)
            np_mask = np_mask.float().masked_fill(
                np_mask == 0, float('-inf')).masked_fill(np_mask == 1, float(0.0))
            
            # Forward, backprop, optimizer
            optim.zero_grad()
            preds = model(src.transpose(0, 1), trg_input.transpose(0, 1), tgt_mask=np_mask)
            preds = preds.transpose(0, 1).contiguous().view(-1, preds.size(-1))
            loss = F.cross_entropy(preds, targets, ignore_index=0, reduction='sum')
            loss.backward()
            optim.step()
            train_loss += loss.item() / BATCH_SIZE

        model.eval()
        with torch.no_grad():
            for i in range(val_iter.batch_amount):
                cnt, batch = val_iter.iterate()
                src = batch[0]
                trg = batch[1]
                # change to shape (bs , max_seq_len)
                src = src.transpose(0, 1)
                # change to shape (bs , max_seq_len+1) , Since right shifted
                trg = trg.transpose(0, 1)
                trg_input = trg[:, :-1]
                targets = trg[:, 1:].contiguous().view(-1)
                size = trg_input.size(1)
                # print(size)
                np_mask = torch.triu(torch.ones(size, size) == 1).transpose(0, 1)
                np_mask = np_mask.float().masked_fill(np_mask == 0, float('-inf')).masked_fill(
                    np_mask == 1, float(0.0))
                
                preds = model(src.transpose(0, 1), trg_input.transpose(0, 1), tgt_mask=np_mask)
                preds = preds.transpose(0, 1).contiguous().view(-1, preds.size(-1))
                loss = F.cross_entropy(preds, targets, ignore_index=0, reduction='sum')
                valid_loss += loss.item() / 1

        # Log after each epoch
        print(
            f'''Epoch [{epoch + 1}/{num_epochs}] complete. Train Loss: {train_loss / train_iter.batch_amount:.3f}. Val Loss: {valid_loss / val_iter.batch_amount:.3f}''')

        train_losses.append(train_loss / train_iter.batch_amount)
        valid_losses.append(valid_loss / val_iter.batch_amount)

    return train_losses, valid_losses


train(train_iter=train_iter, val_iter=val_iter, model=model, optim=optim, num_epochs=1)

Let's go!
Epoch [1/1] complete. Train Loss: 202.867. Val Loss: 0.000


([202.86664611816408], [0.0])

In [49]:
model.eval()
sentence = [SRC.reverse_lookup.get(i) for i in range(1000,1010)]
indexed = []
for tok in sentence:
    if SRC.lookup.get(tok) is not None:
        indexed.append(SRC.lookup.get(tok))
    else:
        indexed.append(0)

sentence = torch.LongTensor([indexed])
print(indexed)
trg_init_tok = 0
trg = torch.LongTensor([[trg_init_tok]])
translated_sentence = ""
for i in range(len(sentence)):
    size = trg.size(0)
    np_mask = torch.triu(torch.ones(size, size)==1).transpose(0,1)
    np_mask = np_mask.float().masked_fill(np_mask == 0, float('-inf')).masked_fill(np_mask == 1, float(0.0))
    pred = model(sentence.transpose(0,1), trg, tgt_mask = np_mask).argmax(dim=2)[-1]
    print(pred)
    add_word = TGT.reverse_lookup.get(pred.detach().numpy()[0])
    translated_sentence+=str(add_word)
    if add_word==EOS_WORD:
        break
    trg = torch.cat((trg,torch.LongTensor([[pred[-1]]])))
# TGT.reverse_lookup.get(11960)
translated_sentence

[1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009]
tensor([4907])


'the'

In [53]:
wf = open('answers.txt', 'w')

with open('test.ru.txt', 'r') as rf:
    results = []
    for line in rf.readlines():
        sentence = nltk.tokenize.word_tokenize(line, language='russian')
        indexed = []
        for tok in sentence:
            if SRC.lookup.get(tok) is not None:
                indexed.append(SRC.lookup.get(tok))
            else:
                indexed.append(0)
        sentence = torch.LongTensor([indexed])
        trg_init_tok = 0
        trg = torch.LongTensor([[trg_init_tok]])
        translated_sentence = ""
        for i in range(len(sentence)):
            size = trg.size(0)
            np_mask = torch.triu(torch.ones(size, size)==1).transpose(0,1)
            np_mask = np_mask.float().masked_fill(np_mask == 0, float('-inf')).masked_fill(np_mask == 1, float(0.0))
            pred = model(sentence.transpose(0,1), trg, tgt_mask = np_mask).argmax(dim=2)[-1]
            add_word = TGT.reverse_lookup.get(pred.detach().numpy()[0])
            translated_sentence+=" "+str(add_word)
            if add_word==EOS_WORD:
                break
            trg = torch.cat((trg,torch.LongTensor([[pred[-1]]])))
        results.append(translated_sentence)
    line = '\n'.join(results)
    print(len(results))
    wf.write(line)
    wf.close()

500
