## NMT

Используя оригинальную модель Transformer обучим на данных news-commentary

In [0]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |▎                               | 10kB 16.9MB/s eta 0:00:01[K     |▋                               | 20kB 3.1MB/s eta 0:00:01[K     |█                               | 30kB 4.4MB/s eta 0:00:01[K     |█▎                              | 40kB 5.7MB/s eta 0:00:01[K     |█▋                              | 51kB 3.7MB/s eta 0:00:01[K     |██                              | 61kB 4.4MB/s eta 0:00:01[K     |██▏                             | 71kB 5.0MB/s eta 0:00:01[K     |██▌                             | 81kB 5.6MB/s eta 0:00:01[K     |██▉                             | 92kB 6.2MB/s eta 0:00:01[K     |███▏                            | 102kB 4.8MB/s eta 0:00:01[K     |███▌                            | 112kB 4.8MB/s eta 0:00:01[K     |███▉                     

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sentencepiece as spm
import torchtext
from torchtext import data
from torchtext import datasets
from nltk.translate.bleu_score import corpus_bleu
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import numpy as np

from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings('ignore')

In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!mkdir data/
!cp drive/My\ Drive/assignment_7/news-commentary-v13.ru-en.en data/
!cp drive/My\ Drive/assignment_7/news-commentary-v13.ru-en.ru data/

!cp drive/My\ Drive/assignment_7/transformer.py /content/
!cp drive/My\ Drive/assignment_7/beam_search.py /content/

In [5]:
from transformer import make_model
from tqdm import tqdm
from IPython.display import clear_output

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

----

### Предобработка данных

In [6]:
def get_data_obj(text_file, prefix, vocab_size=32000, char_coverage=0.98, model_type='bpe', is_src=False):
    with open(text_file) as f:
        dir_ = text_file.split('/')[0]
        path = '/'.join([dir_, 'text.'+ prefix.split('_')[1]])
        with open(path, 'w') as out:
            out.write(f.read().lower())
    
    request = ' '.join(["--input=" + path, 
                        "--model_prefix=" + prefix,
                        "--vocab_size=" + str(vocab_size),
                        "--character_coverage=" + str(char_coverage),
                        "--model_type=" + model_type])
    spm.SentencePieceTrainer.Train(request)
    token = spm.SentencePieceProcessor()
    token.load('.'.join([prefix , 'model']))
    tokenizer = lambda x: token.encode_as_pieces(x)
    obj = torchtext.data.Field(
        fix_length=50,
        pad_token='<pad>',
        unk_token='<unk>',
        init_token='<s>',
        eos_token='</s>',
        lower=True,
        tokenize = tokenizer,
        batch_first=True,
    )
    
    return obj

In [7]:
TGT = get_data_obj(text_file='data/news-commentary-v13.ru-en.en',
                   prefix="bpe_en")

SRC = get_data_obj(text_file="data/news-commentary-v13.ru-en.ru", is_src=True,
                    prefix='bpe_ru', char_coverage=1.)

In [8]:
fields = [('src', SRC), ('trg', TGT)]

In [9]:
with open('data/text.ru') as f:
    src_snt = list(map(str.strip, f.readlines()))
    
with open('data/text.en') as f:
    tgt_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm(zip(src_snt, tgt_snt))]
test = data.Dataset(examples[-1000:], fields)

split_id = round(len(examples[:-1000])*0.9)
train = data.Dataset(examples[:split_id], fields)
valid = data.Dataset(examples[split_id:-1000], fields)

235159it [00:59, 3927.59it/s]


In [10]:
print('src: ' + " ".join(train.examples[150].src))
print('tgt: ' + " ".join(train.examples[150].trg))

src: ▁несколько ▁недель ▁спустя , ▁я ▁столкнулась ▁с ▁одним ▁из ▁подобных ▁конфликтов : ▁собрание ▁членов ▁правления ▁здесь , ▁конференция ▁там , ▁и ▁еще ▁одна ▁перспектива ▁в ▁то ▁же ▁самое ▁время ▁еще ▁в ▁одном ▁месте .
tgt: ▁a ▁couple ▁of ▁weeks ▁later , ▁i ▁was ▁faced ▁with ▁one ▁of ▁those ▁conflicts : ▁a ▁board ▁meeting ▁here , ▁a ▁conference ▁there , ▁another ▁opportunity ▁at ▁the ▁same ▁time ▁somewhere ▁else .


In [11]:
len(train), len(valid), len(test)

(210743, 23416, 1000)

In [12]:
TGT.build_vocab(train, min_freq=5)
SRC.build_vocab(train, min_freq=5)

In [13]:
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data).detach()
        return tgt_mask

class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
#         super(BucketIteratorWrapper,self).__init__()
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: Batch(batch.src, batch.trg, pad=TGT.vocab.stoi['<pad>']),
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)
    
class MyCriterion(nn.Module):
    def __init__(self, pad_idx):
        super(MyCriterion, self).__init__()
        self.pad_idx = pad_idx
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_idx)
        
    def forward(self, x, target):
        x = x.contiguous().permute(0,2,1)
        ntokens = (target != self.pad_idx).data.sum()
        
        return self.criterion(x, target) / ntokens

In [14]:
torch.cuda.empty_cache()


train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                              batch_sizes=(64, 64, 1), 
                                  sort_key=lambda x: len(x.src),
                                  shuffle=True,
                                  device=device,
                                  sort_within_batch=False)
                                  
train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
test_iter = BucketIteratorWrapper(test_iter)

## Вспомогательные функции

In [15]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [16]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0


-----

### Создание и обучение модели

In [17]:
torch.cuda.is_available()

False

In [18]:
pad_idx = TGT.vocab.stoi['<pad>']
model = make_model(len(SRC.vocab), len(TGT.vocab), N=2)
# model.cuda()
model_opt = get_std_opt(model)
criterion = MyCriterion(pad_idx)
# criterion.cuda();

In [0]:
def train_epoch(data_iter, model, criterion):
    total_loss = 0
    counter = 0
    for batch in data_iter:
        pred = model.forward(batch)
        loss = criterion(pred, batch.trg_y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        model_opt.step()
        
        total_loss += loss
        # data_iter.set_postfix(loss = loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    counter = 0
    for batch in data_iter:
        pred = model.forward(batch)
        clear_output(True)
        loss = criterion(pred, batch.trg_y).item()
        print(loss)
        total_loss += loss
        # data_iter.set_postfix(loss = loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

model.generator.proj.weight = model.tgt_embed[0].lut.weight

for epoch in range(10):
    model.train()
    loss = train_epoch(train_iter, model, criterion)
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss = valid_epoch(valid_iter, model, criterion)
        print('valid', loss)

4.104262828826904
valid 2.3165018174166234


In [64]:
def beam_search(model, src, src_mask, max_len=5, k=5):
    memory = model.encode(src, src_mask)
    start_s = torch.ones(1, 1).fill_(TGT.vocab.stoi['<s>']).type_as(src.data)
    beam = [(start_s, 0)]
    for i in range(max_len):
        chars = []
        probas = []
        for ys, log_prob in beam:
            if ys[0][-1] == TGT.vocab.stoi['</s>']:
                chars.append(ys)
                probas.append(log_prob)
            else:
                tgt_mask = subsequent_mask(ys.size(-1)).type_as(src.data).detach().to(device)
                probs = model.decode(ys, tgt_mask, memory, src_mask)[0][i]
                probs, idxs = torch.topk(probs, k)
                for idx, new_c in enumerate(idxs.squeeze(0)):
                    new_c_tensor = torch.ones(1, 1).type_as(src.data).fill_(new_c).to(device)
                    chars.append(torch.cat([ys, new_c_tensor], dim=1))
                    probas.append(log_prob + np.log(probs[idx].item()))
        probas = torch.tensor(probas)
        vals, idxs = torch.topk(probas, k)
        beam = [(c, p.item()) for c, p in zip(chars, probas)]
    return beam

In [0]:
torch.save(model.state_dict(), 'model_de_best.pth.tar')

In [0]:
!cp model_de_best.pth.tar drive/My\ Drive/assignment_7/

In [0]:
model_path = 'drive/My Drive/assignment_7/model_de_best.pth.tar'

model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda')))
model.eval();

In [None]:
hypotheses = []
references = []

model.eval()
with torch.no_grad():
    for batch in test_iter:        
        src = batch.src[:1]
        trg = batch.trg[:1][0]
        src_mask = src != SRC.vocab.stoi["<pad>"]

        beam = beam_search(model, src, src_mask)

        pred, log_proba = beam[0]
        tokens = pred[0]

        new_hyp = []
        for i in range(1, len(tokens)):
            sym = TGT.vocab.itos[tokens[i]]
            if sym == '</s>': break
            new_hyp.append(sym)
        hypotheses.append(new_hyp)

        new_ref = []
        for i in range(1, len(trg)):
            sym = TGT.vocab.itos[trg[i]]
            if sym == '</s>': break
            new_ref.append(sym)
        references.append([new_ref])

In [0]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [0]:
final_score = corpus_bleu(references[1:], hypotheses[1:],
                          smoothing_function=SmoothingFunction().method3,
                          auto_reweigh=True)

In [0]:
print("final score is {}".format(final_score))

final score is 0.19645950075706967


BLEU == 0.2

Оценим количество неизвестных токенов в предложениях

In [0]:
unk_token = TGT.vocab.stoi['<unk>']

general_unk_freq = []
for tokenized in references[1:]:
    unk_per_sentence = 0
    for token in tokenized[0]:
        if TGT.vocab.stoi[token] == unk_token:
            unk_per_sentence += 1
    unk_per_sentence /= len(tokenized[0])
    general_unk_freq.append(unk_per_sentence)

general_unk_freq = np.array(general_unk_freq)

In [0]:
general_unk_freq[general_unk_freq == 0.] = np.nan
mean = np.nanmean(general_unk_freq)

In [0]:
print(mean)

0.038851899386000324


In [0]:
unk_token = TGT.vocab.stoi['<unk>']

sum_unks = 0
for tokenized in references[1:]:
    for token in tokenized[0]:
        if TGT.vocab.stoi[token] == unk_token:
            sum_unks += 1
            break

print(sum_unks/len(references[1:]))

0.1061061061061061


Видно, что предложений с оригинальным переводом, в которых встречаются неизвестные токены около 3%, но тем не менее модель на 20ти эпохах с функцией потерь Перекрестная энтропия не дает сильно хорошего качества.