In [1]:
!pip install sentencepiece

[0m

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m817.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [34]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m897.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.3.1
[0m

# Подключение зависимостей

In [3]:
import os
import math
import random
import time
from typing import Union, List, Tuple, Optional, Any

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentencepiece import SentencePieceTrainer, SentencePieceProcessor

import evaluate

# Global plot appearance: seaborn grid background and a larger default font.
sns.set_style('whitegrid')
plt.rcParams['font.size'] = 15

# Модель

In [4]:
class MultiHeadAttentionLayer(nn.Module):
    """Scaled dot-product attention evaluated in parallel over several heads."""

    def __init__(self, embedding_dim, num_heads, dropout_prob=0.0):
        """
        :param embedding_dim: size of every input and output embedding vector
        :param num_heads: number of attention heads (must divide embedding_dim evenly)
        :param dropout_prob: dropout probability applied to the attention weights
        """
        super().__init__()
        assert embedding_dim % num_heads == 0

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        # projections producing queries, keys and values, plus the output projection
        self.WQ = nn.Linear(embedding_dim, embedding_dim)
        self.WK = nn.Linear(embedding_dim, embedding_dim)
        self.WV = nn.Linear(embedding_dim, embedding_dim)
        self.WO = nn.Linear(embedding_dim, embedding_dim)

        self.dropout = nn.Dropout(dropout_prob)

        # energies are divided by sqrt(head_dim) before the softmax
        self.norm_factor = np.sqrt(self.head_dim)

    def _split_heads(self, projected, batch_size):
        """(batch_size, length, embedding_dim) -> (batch_size, num_heads, length, head_dim)"""
        per_head = projected.view(batch_size, -1, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """
        :param query: embeddings that attend, (batch_size, length, embedding_dim)
        :param key: embeddings that are attended to, (batch_size, length, embedding_dim)
        :param value: embeddings that are mixed together, (batch_size, length, embedding_dim)
        :param mask: optional mask; positions where it equals 0 receive no attention
        :return: recalculated embeddings and the attention weights of every head
        """
        batch_size = query.shape[0]

        # project the inputs and give every head its own slice of the embedding
        heads_q = self._split_heads(self.WQ(query), batch_size)
        heads_k = self._split_heads(self.WK(key), batch_size)
        heads_v = self._split_heads(self.WV(value), batch_size)

        # scaled energies, (batch_size, num_heads, length, length)
        energy = torch.matmul(heads_q, heads_k.transpose(-2, -1)) / self.norm_factor
        if mask is not None:
            # -inf energy becomes exactly zero weight after the softmax
            energy = energy.masked_fill(mask == 0, -math.inf)

        attention_score = torch.softmax(energy, dim=-1)

        # weighted sum of values, then merge the heads back into embedding_dim
        mixed = torch.matmul(self.dropout(attention_score), heads_v)
        merged = mixed.transpose(1, 2).contiguous().view(batch_size, -1, self.embedding_dim)

        return self.WO(merged), attention_score


class FeedforwardLayer(nn.Module):
    """Position-wise feedforward network: expand, ReLU, dropout, project back."""

    def __init__(self, embedding_dim, feedforward_dim, dropout_prob=0.0):
        """
        :param embedding_dim: size of the input and output embeddings
        :param feedforward_dim: width of the hidden layer (usually larger than embedding_dim)
        :param dropout_prob: dropout probability applied after the activation
        """
        super().__init__()

        expand = nn.Linear(embedding_dim, feedforward_dim)
        activation = nn.ReLU()
        regularizer = nn.Dropout(dropout_prob)
        project = nn.Linear(feedforward_dim, embedding_dim)

        # kept as a Sequential so parameter names stay feedforward.0.* and feedforward.3.*
        self.feedforward = nn.Sequential(expand, activation, regularizer, project)

    def forward(self, inputs):
        """
        :param inputs: embeddings, (batch_size, length, embedding_dim)
        :return: transformed embeddings of the same shape
        """
        return self.feedforward(inputs)


class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings plus learned positions, then a stack of encoder layers."""

    def __init__(self, input_dim, embedding_dim, num_layers, num_heads, feedforward_dim,
                 dropout_prob=0.0, max_length=200, device='cpu'):
        """
        :param input_dim: dimension of an input (source vocabulary size)
        :param embedding_dim: dimension of an embedding
        :param num_layers: number of encoder layers
        :param num_heads: number of heads in attention layer in each encoder layer
        :param feedforward_dim: dimension in position wise feedforward layer in each encoder layer
        :param dropout_prob: probability for dropout layer
        :param max_length: number of positions in the positional table; longer inputs cannot be embedded
        :param device: cuda or cpu; stored for callers, forward() follows the device of its input
        """
        super().__init__()
        self.device = device

        # token embedding
        self.token_embedding = nn.Embedding(input_dim, embedding_dim)

        # positional embedding (make it trainable as in BERT)
        self.positional_embedding = nn.Embedding(max_length, embedding_dim)

        # dropout layer to reduce overfitting
        self.dropout = nn.Dropout(dropout_prob)

        # coefficient for scaling token embedding; a plain Python float keeps tensor arithmetic simple
        self.norm_factor = math.sqrt(embedding_dim)

        # encoder layers
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(embedding_dim, num_heads, feedforward_dim, dropout_prob) for _ in range(num_layers)
        ])

    def forward(self, source_tokens, source_mask):
        """
        :param source_tokens: input source tokens, (batch_size, length)
        :param source_mask: marks the elements equal to <pad>, (batch_size, 1, 1, length)
        :return: sequence of context vectors, (batch_size, length, embedding_dim)
        """
        length = source_tokens.shape[1]

        # Create the position ids directly on the device of the tokens, so the module keeps
        # working after .to()/.cuda() even if the constructor's `device` argument is stale.
        # Shape (1, length): the positional embedding broadcasts over the batch dimension.
        positions = torch.arange(length, device=source_tokens.device).unsqueeze(0)
        outputs = self.norm_factor * self.token_embedding(source_tokens) + self.positional_embedding(positions)

        # apply dropout
        outputs = self.dropout(outputs)

        # apply each encoder layer to outputs
        for encoder_layer in self.encoder_layers:
            outputs = encoder_layer(outputs, source_mask)

        return outputs


class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a position-wise feedforward network."""

    def __init__(self, embedding_dim, num_heads, feedforward_dim, dropout_prob=0.0):
        """
        :param embedding_dim: size of the embeddings flowing through the block
        :param num_heads: number of heads of the self-attention sublayer
        :param feedforward_dim: hidden width of the feedforward sublayer
        :param dropout_prob: dropout probability used inside and after both sublayers
        """
        super().__init__()

        # the two sublayers
        self.self_attention = MultiHeadAttentionLayer(embedding_dim, num_heads, dropout_prob)
        self.feedforward = FeedforwardLayer(embedding_dim, feedforward_dim, dropout_prob)

        # one layer norm per sublayer, applied after the residual sum
        self.layer_norm_sa, self.layer_norm_ff = nn.LayerNorm(embedding_dim), nn.LayerNorm(embedding_dim)

        # dropout applied to each sublayer output before it is added back
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, inputs, source_mask):
        """
        :param inputs: embeddings entering the block, (batch_size, length, embedding_dim)
        :param source_mask: (batch_size, 1, 1, length)
        :return: embeddings leaving the block, same shape as inputs
        """
        # self-attention sublayer: every position attends over the whole (masked) sequence
        attended, _ = self.self_attention(inputs, inputs, inputs, source_mask)
        hidden = self.layer_norm_sa(inputs + self.dropout(attended))

        # feedforward sublayer with its own residual connection and layer norm
        transformed = self.feedforward(hidden)
        return self.layer_norm_ff(hidden + self.dropout(transformed))


class Decoder(nn.Module):
    """Transformer decoder: embeds target tokens, runs the decoder layers and projects to vocabulary logits."""

    def __init__(self, input_dim, embedding_dim, num_layers, num_heads, feedforward_dim,
                 dropout_prob=0.0, max_length=200, device='cpu'):
        """
        :param input_dim: dimension of an input (target vocabulary size, also the size of the output logits)
        :param embedding_dim: dimension of an embedding
        :param num_layers: number of decoder layers
        :param num_heads: number of heads in attention layer in each decoder layer
        :param feedforward_dim: dimension in position wise feedforward layer in each decoder layer
        :param dropout_prob: probability for dropout layer
        :param max_length: number of positions in the positional table; longer inputs cannot be embedded
        :param device: cuda or cpu; stored for callers, forward() follows the device of its input
        """
        super().__init__()
        self.device = device

        # token embedding
        self.token_embedding = nn.Embedding(input_dim, embedding_dim)

        # positional embedding (make it trainable as in BERT)
        self.positional_embedding = nn.Embedding(max_length, embedding_dim)

        # dropout layer to reduce overfitting
        self.dropout = nn.Dropout(dropout_prob)

        # coefficient for scaling token embedding; a plain Python float keeps tensor arithmetic simple
        self.norm_factor = math.sqrt(embedding_dim)

        # decoder layers
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(embedding_dim, num_heads, feedforward_dim, dropout_prob) for _ in range(num_layers)
        ])

        # final linear layer of decoder
        self.output_layer = nn.Linear(embedding_dim, input_dim)

    def forward(self, target_tokens, encoded_source_embed, target_mask, source_mask):
        """
        :param target_tokens: input target tokens, (batch_size, length)
        :param encoded_source_embed: encoded source embeddings taken from Encoder, (batch_size, length, embedding_dim)
        :param target_mask: marks the elements each token is allowed to look at, (batch_size, 1, length, length)
        :param source_mask: marks the elements equal to <pad>, (batch_size, 1, 1, length)
        :return: logits over the vocabulary, (batch_size, length, input_dim), and the encoder-attention
                 weights of the last decoder layer (the integer 0 when there are no layers)
        """
        length = target_tokens.shape[1]

        # Create the position ids directly on the device of the tokens, so the module keeps
        # working after .to()/.cuda() even if the constructor's `device` argument is stale.
        # Shape (1, length): the positional embedding broadcasts over the batch dimension.
        positions = torch.arange(length, device=target_tokens.device).unsqueeze(0)
        outputs = self.norm_factor * self.token_embedding(target_tokens) + self.positional_embedding(positions)

        # apply dropout
        outputs = self.dropout(outputs)

        # apply each decoder layer to outputs; only the attention of the last layer is kept
        attention = 0
        for decoder_layer in self.decoder_layers:
            outputs, attention = decoder_layer(outputs, encoded_source_embed, target_mask, source_mask)

        # apply final linear layer
        outputs = self.output_layer(outputs)

        return outputs, attention


class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder output, feedforward."""

    def __init__(self, embedding_dim, num_heads, feedforward_dim, dropout_prob=0.0):
        """
        :param embedding_dim: size of the embeddings flowing through the block
        :param num_heads: number of heads of both attention sublayers
        :param feedforward_dim: hidden width of the feedforward sublayer
        :param dropout_prob: dropout probability used inside and after every sublayer
        """
        super().__init__()

        # attention over the target prefix and attention over the encoded source
        self.self_attention = MultiHeadAttentionLayer(embedding_dim, num_heads, dropout_prob)
        self.encoder_attention = MultiHeadAttentionLayer(embedding_dim, num_heads, dropout_prob)

        # position-wise feedforward sublayer
        self.feedforward = FeedforwardLayer(embedding_dim, feedforward_dim, dropout_prob)

        # one layer norm per sublayer, applied after the residual sum
        self.layer_norm_sa = nn.LayerNorm(embedding_dim)
        self.layer_norm_ea = nn.LayerNorm(embedding_dim)
        self.layer_norm_ff = nn.LayerNorm(embedding_dim)

        # dropout applied to each sublayer output before it is added back
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, inputs, encoded_source_embed, target_mask, source_mask):
        """
        :param inputs: target embeddings entering the block, (batch_size, length, embedding_dim)
        :param encoded_source_embed: encoder output, (batch_size, length, embedding_dim)
        :param target_mask: positions each target token may look at, (batch_size, 1, length, length)
        :param source_mask: marks the source elements equal to <pad>, (batch_size, 1, 1, length)
        :return: embeddings leaving the block and the encoder-attention weights
        """
        # 1) self-attention over the target, restricted by target_mask
        self_attended, _ = self.self_attention(inputs, inputs, inputs, target_mask)
        hidden = self.layer_norm_sa(inputs + self.dropout(self_attended))

        # 2) target positions query the encoded source; these weights are returned to the caller
        source_attended, attention = self.encoder_attention(
            hidden, encoded_source_embed, encoded_source_embed, source_mask
        )
        hidden = self.layer_norm_ea(hidden + self.dropout(source_attended))

        # 3) position-wise feedforward
        transformed = self.feedforward(hidden)
        hidden = self.layer_norm_ff(hidden + self.dropout(transformed))

        return hidden, attention


class TranslationModel(nn.Module):
    """Encoder-decoder translation model with mask construction and greedy decoding."""

    def __init__(self, encoder, decoder, dataset, device='cpu'):
        """
        :param encoder: encoder model
        :param decoder: decoder model
        :param dataset: dataset for translation model (provides special token ids and the tokenizers)
        :param device: cuda or cpu; inference() places its input tensors on this device
        """
        super().__init__()
        self.device = device

        # required in inference mode to encode source sentences and decode predictions
        self.dataset = dataset

        # <pad> index shared by source and target sequences
        self.pad_id = dataset.pad_id

        # encoder and decoder models
        self.encoder = encoder
        self.decoder = decoder

    def get_source_mask(self, source):
        """
        :param source: source token ids, (batch_size, length)
        :return: boolean mask that is False at <pad> positions, (batch_size, 1, 1, length)
        """
        return (source != self.pad_id).unsqueeze(1).unsqueeze(2)

    def get_target_mask(self, target):
        """
        :param target: target token ids, (batch_size, length)
        :return: boolean mask combining <pad> masking with the causal (lower-triangular) mask,
                 (batch_size, 1, length, length)
        """
        length = target.shape[1]
        pad_mask = (target != self.pad_id).unsqueeze(1).unsqueeze(2)
        # Build the causal mask on the device of the tokens themselves; relying on self.device
        # breaks as soon as the model is moved after construction.
        sub_mask = torch.tril(torch.ones((length, length), device=target.device)).bool()
        return pad_mask & sub_mask

    def forward(self, source, target):
        """
        :param source: source token ids, (batch_size, source_length)
        :param target: target token ids fed to the decoder, (batch_size, target_length)
        :return: decoder output and attention, exactly as returned by the decoder
        """
        # get masks
        source_mask = self.get_source_mask(source)
        target_mask = self.get_target_mask(target)
        # get encoded source embedding
        encoded_source_embed = self.encoder(source, source_mask)
        # get output and attention
        output, attention = self.decoder(target, encoded_source_embed, target_mask, source_mask)

        return output, attention

    @torch.inference_mode()
    def inference(self, sentence, max_length=200):
        """
        Greedily translate one sentence.
        NOTE: switches the model to eval mode and leaves it there.
        :param sentence: input sentence to translate
        :param max_length: maximum number of tokens (including <bos>) in the generated sequence
        :return: translated sentence (the attention weights are not returned)
        """
        # switch model to eval mode
        self.eval()

        # encode sentence and create its mask
        source_ids = [self.dataset.bos_id] + self.dataset.text2ids(sentence, 'source') + [self.dataset.eos_id]
        source_tokens = torch.tensor(source_ids, dtype=torch.int64, device=self.device).unsqueeze(0)
        source_mask = self.get_source_mask(source_tokens)

        # the source is encoded once and reused at every decoding step
        encoded_source_embed = self.encoder(source_tokens, source_mask)

        # create target sentence, continue process until reach max_length or get EOS token
        target_tokens_arr = [self.dataset.bos_id]
        while len(target_tokens_arr) < max_length and target_tokens_arr[-1] != self.dataset.eos_id:
            # make target tensor and create its mask
            target_tokens = torch.tensor(target_tokens_arr, dtype=torch.int64, device=self.device).unsqueeze(0)
            target_mask = self.get_target_mask(target_tokens)

            # only the logits are needed for greedy decoding
            output, _ = self.decoder(target_tokens, encoded_source_embed, target_mask, source_mask)

            # the most likely token at the last position becomes the next token
            new_target_token = output.argmax(2)[:, -1].item()
            target_tokens_arr.append(new_target_token)

        return self.dataset.ids2text(target_tokens_arr, 'target')

# Датасет

In [5]:
class TextDataset(Dataset):
    """Parallel corpus (source read from data_file_de, target from data_file_en) tokenized with SentencePiece."""

    def __init__(self, data_file_en: str, data_file_de: str, sp_model_prefix: str = None,
                 vocab_size: int = 3000, normalization_rule_name: str = 'nmt_nfkc_cf',
                 model_type: str = 'bpe', max_length: int = 200, langs=('de', 'en')):
        """
        Dataset with texts, supporting BPE tokenizer
        :param data_file_en: txt file containing texts in English (target side)
        :param data_file_de: txt file containing texts in German (source side)
        :param sp_model_prefix: path prefix to save tokenizer model (required)
        :param vocab_size: sentencepiece tokenizer vocabulary size
        :param normalization_rule_name: sentencepiece tokenizer normalization rule
        :param model_type: sentencepiece tokenizer model type
        :param max_length: length in tokens every item is padded or truncated to (specials included)
        :param langs: (source, target) tags appended to sp_model_prefix to name the tokenizer files
        :raises ValueError: if sp_model_prefix is not given
        """
        if sp_model_prefix is None:
            raise ValueError('sp_model_prefix is required: it names the tokenizer model files')

        # init models for two given languages
        self.sp_source = self.get_sp_model(data_file_de, langs[0], sp_model_prefix,
                                           vocab_size, normalization_rule_name, model_type)
        self.sp_target = self.get_sp_model(data_file_en, langs[1], sp_model_prefix,
                                           vocab_size, normalization_rule_name, model_type)

        # read lines of given texts for both source and target languages
        with open(data_file_de, encoding='utf8') as file:
            self.source_texts = file.readlines()
        with open(data_file_en, encoding='utf8') as file:
            self.target_texts = file.readlines()

        # convert read texts to tokens using models
        self.source_tokens = self.sp_source.encode(self.source_texts)
        self.target_tokens = self.sp_target.encode(self.target_texts)

        # special token ids are read from the source tokenizer; both tokenizers are trained
        # with the same ids in get_sp_model
        self.pad_id, self.unk_id, self.bos_id, self.eos_id = \
            self.sp_source.pad_id(), self.sp_source.unk_id(), \
            self.sp_source.bos_id(), self.sp_source.eos_id()
        self.vocab_size_source = self.sp_source.vocab_size()
        self.vocab_size_target = self.sp_target.vocab_size()
        self.max_length = max_length

    def get_sp_model(self, data_file, lang, sp_model_prefix, vocab_size, normalization_rule_name, model_type):
        """
        Load the tokenizer stored at '<sp_model_prefix>_<lang>.model', training it first if the file is missing
        :param data_file: text file used to train the tokenizer when needed
        :param lang: language tag appended to the prefix
        :param sp_model_prefix: path prefix of the tokenizer files
        :param vocab_size: sentencepiece tokenizer vocabulary size
        :param normalization_rule_name: sentencepiece tokenizer normalization rule
        :param model_type: sentencepiece tokenizer model type
        :return: loaded SentencePieceProcessor
        """
        model_prefix = sp_model_prefix + '_' + lang
        model_file = model_prefix + '.model'
        if not os.path.isfile(model_file):
            # train tokenizer if not trained yet
            SentencePieceTrainer.train(
                input=data_file, vocab_size=vocab_size,
                model_type=model_type, model_prefix=model_prefix,
                normalization_rule_name=normalization_rule_name,
                unk_id=0, bos_id=1, eos_id=2, pad_id=3
            )
        # load tokenizer from file
        return SentencePieceProcessor(model_file=model_file)

    def text2ids(self, texts: Union[str, List[str]], text_type: str) -> Union[List[int], List[List[int]]]:
        """
        Encode a text or list of texts as tokenized indices
        :param texts: text or list of texts to tokenize
        :param text_type: 'source' selects the source tokenizer, anything else the target tokenizer
        :return: encoded indices
        """
        return self.sp_source.encode(texts) if text_type == 'source' else self.sp_target.encode(texts)

    def ids2text(self, ids: Union[torch.Tensor, List[int], List[List[int]]], text_type: str) -> Union[str, List[str]]:
        """
        Decode indices as a text or list of tokens
        :param ids: 1D or 2D list (or torch.Tensor) of indices to decode
        :param text_type: 'source' selects the source tokenizer, anything else the target tokenizer
        :return: decoded texts
        """
        if torch.is_tensor(ids):
            assert len(ids.shape) <= 2, 'Expected tensor of shape (length, ) or (batch_size, length)'
            ids = ids.cpu().tolist()

        return self.sp_source.decode(ids) if text_type == 'source' else self.sp_target.decode(ids)

    def __len__(self):
        """
        Size of the dataset
        :return: number of texts in the dataset
        """
        return len(self.source_tokens)

    def _pad_with_specials(self, tokens: List[int]) -> torch.Tensor:
        """Wrap tokens in <bos>/<eos>, truncate to fit max_length and right-pad with <pad>."""
        # Leave room for <bos> and <eos>; without truncation an over-long sentence would not
        # fit into the fixed-size tensor and the assignment below would raise.
        body = list(tokens[:max(self.max_length - 2, 0)])
        encoded = [self.bos_id] + body + [self.eos_id]
        padded = torch.full((self.max_length,), self.pad_id, dtype=torch.int64)
        padded[:len(encoded)] = torch.tensor(encoded, dtype=torch.int64)
        return padded

    def __getitem__(self, item: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Add specials to the index arrays and pad (or truncate) them to max_length
        :param item: text id
        :return: encoded source and target texts, each of shape (max_length,)
        """
        padded_source_text = self._pad_with_specials(self.source_tokens[item])
        padded_target_text = self._pad_with_specials(self.target_tokens[item])

        return padded_source_text, padded_target_text

In [6]:
def set_model_weights(model):
    """
    Xavier-uniform initialisation for a single module; meant to be passed to nn.Module.apply
    :param model: module to initialise; only a `weight` tensor with more than one dimension is
                  touched, so biases and 1D weights (e.g. LayerNorm) keep their values
    """
    # getattr guards modules without a weight as well as modules whose weight is None
    weight = getattr(model, 'weight', None)
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight)

# Функции обучения и валидации

In [8]:
# Weights & Biases client used for experiment tracking.
import wandb

# Authenticate this session; the recorded output shows an interactive API-key prompt.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
# Start a W&B run in the 'text-translation-homework' project;
# presumably the wandb.log call in train() reports to this run — TODO confirm.
wandb.init(project='text-translation-homework')

[34m[1mwandb[0m: Currently logged in as: [33marunovamargarita[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
def plot_losses(train_losses: List[float], val_losses: List[float]):
    """
    Redraw the loss and perplexity curves of the train and validation samples
    :param train_losses: train loss of every finished epoch
    :param val_losses: validation loss of every finished epoch
    """
    # replace the previous cell output so the figure is refreshed in place
    clear_output()
    fig, (loss_ax, perplexity_ax) = plt.subplots(1, 2, figsize=(13, 4))

    curves = (
        ('train', train_losses, 'purple'),
        ('val', val_losses, 'darkgreen'),
    )

    # left panel: raw losses, epochs numbered from 1
    for label, losses, color in curves:
        loss_ax.plot(range(1, len(losses) + 1), losses, label=label, color=color)
    loss_ax.set_ylabel('loss')

    # right panel: perplexity, i.e. exp(loss)
    for label, losses, color in curves:
        perplexities = torch.exp(torch.tensor(losses))
        perplexity_ax.plot(range(1, len(perplexities) + 1), perplexities, label=label, color=color)
    perplexity_ax.set_ylabel('perplexity')

    for panel in (loss_ax, perplexity_ax):
        panel.set_xlabel('epoch')
        panel.legend()

    plt.show()

def print_metrics(epoch, time, train_loss, val_loss):
    """
    Print a short report for one epoch
    :param epoch: epoch number
    :param time: elapsed time in seconds, printed as given
    :param train_loss: train loss; its perplexity exp(loss) is printed next to it
    :param val_loss: validation loss; its perplexity exp(loss) is printed next to it
    """
    print(f'Epoch: {epoch}')
    print(f'\tTraining epoch time: {time} seconds')
    for split, loss in (('Train', train_loss), ('Validation', val_loss)):
        perplexity = math.exp(loss)
        print(f'\t{split}: loss - {loss:.3f}, perplexity - {perplexity:10.3f}')

def training_epoch(model: TranslationModel, optimizer: torch.optim.Optimizer, criterion: nn.Module,
                   loader: DataLoader, tqdm_desc: str):
    """
    Run one pass over the training data and update the model
    :param model: translation model to train
    :param optimizer: optimizer instance
    :param criterion: loss function applied to flattened logits and flattened target ids
    :param loader: training dataloader
    :param tqdm_desc: progress bar description
    :return: sum of batch losses weighted by batch size, divided by the dataset size
    """
    model.train()
    device = next(model.parameters()).device

    accumulated_loss = 0.0
    for source, target in tqdm(loader, desc=tqdm_desc):
        source, target = source.to(device), target.to(device)

        # teacher forcing: the decoder reads the target without its last token
        # and is scored against the target shifted left by one position
        decoder_input, expected = target[:, :-1], target[:, 1:]

        optimizer.zero_grad()
        logits, _ = model(source, decoder_input)

        # flatten (batch, length, vocab) -> (batch * length, vocab) for the criterion
        flat_logits = logits.contiguous().view(-1, logits.shape[-1])
        flat_expected = expected.contiguous().view(-1)

        loss = criterion(flat_logits, flat_expected)
        loss.backward()
        # clip the gradient norm to 1 to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        accumulated_loss += loss.item() * source.shape[0]

    return accumulated_loss / len(loader.dataset)

@torch.no_grad()
def validation_epoch(model: TranslationModel, criterion: nn.Module,
                     loader: DataLoader, tqdm_desc: str):
    """
    Run one pass over the validation data without updating the model
    :param model: translation model to validate
    :param criterion: loss function applied to flattened logits and flattened target ids
    :param loader: validation dataloader
    :param tqdm_desc: progress bar description
    :return: sum of batch losses weighted by batch size, divided by the dataset size
    """
    model.eval()
    device = next(model.parameters()).device

    accumulated_loss = 0.0
    for source, target in tqdm(loader, desc=tqdm_desc):
        source, target = source.to(device), target.to(device)

        # same shifted input/expected pair as in training, but forward pass only
        decoder_input, expected = target[:, :-1], target[:, 1:]
        logits, _ = model(source, decoder_input)

        # flatten (batch, length, vocab) -> (batch * length, vocab) for the criterion
        flat_logits = logits.contiguous().view(-1, logits.shape[-1])
        flat_expected = expected.contiguous().view(-1)

        loss = criterion(flat_logits, flat_expected)
        accumulated_loss += loss.item() * source.shape[0]

    return accumulated_loss / len(loader.dataset)

def train(model: TranslationModel,
          optimizer: torch.optim.Optimizer, criterion: nn.Module, scheduler: Optional[Any],
          train_loader: DataLoader, val_loader: DataLoader, num_epochs: int):
    """
    Train translation model for several epochs.
    After every epoch the metrics are logged to wandb and printed. The best model by validation
    loss is written to 'translation-model.pt', and every 10th epoch an additional checkpoint
    'translation-model_<epoch>.pt' is written.
    :param model: translation model to train
    :param optimizer: optimizer instance
    :param criterion: loss function passed to training_epoch and validation_epoch
    :param scheduler: optional scheduler, stepped once per epoch
    :param train_loader: training dataloader
    :param val_loader: validation dataloader
    :param num_epochs: number of training epochs
    """
    best_val_loss = float('inf')

    for epoch in range(1, num_epochs + 1):
        # NOTE: the measured interval covers both the training and the validation pass
        start_time = time.time()
        train_loss = training_epoch(
            model, optimizer, criterion, train_loader,
            tqdm_desc=f'Training {epoch}/{num_epochs}'
        )
        val_loss = validation_epoch(
            model, criterion, val_loader,
            tqdm_desc=f'Validating {epoch}/{num_epochs}'
        )
        end_time = time.time()

        if scheduler is not None:
            scheduler.step()

        # log both metrics in wandb after train epoch
        wandb.log({'train loss': train_loss, 'validation loss': val_loss,
                   'train perplexity': math.exp(train_loss), 'validation perplexity': math.exp(val_loss)})

        # save best model
        if val_loss < best_val_loss:
            torch.save(model.state_dict(), 'translation-model.pt')
            best_val_loss = val_loss

        # save model after each 10 epoch
        if epoch % 10 == 0:
            torch.save(model.state_dict(), 'translation-model_' + str(epoch) + '.pt')

        # print metrics
        print_metrics(epoch, end_time - start_time, train_loss, val_loss)

# Фиксирование seed

In [11]:
RANDOM_SEED = 777

# Seed every random number generator the notebook relies on.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)  # every visible GPU, not only the current one
# Ask cuDNN for deterministic algorithms and keep its autotuner off: benchmarking may
# select different kernels from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Конфигурация модели и датасета

In [13]:
# Locations of the input data and of the writable working directory.
# KAGGLE_OUTPUT_PREFIX is not referenced in the visible part of the notebook.
KAGGLE_DATASET_PREFIX = '/kaggle/input/translation-dataset/'
KAGGLE_OUTPUT_PREFIX = '/kaggle/working/'

# File name suffixes; a split name ('train', 'val', 'test1') is prepended to them.
SUFFIX_FILE_EN = '.de-en.en'
SUFFIX_FILE_DE = '.de-en.de'

# Model hyper-parameters; the encoder and the decoder are configured separately.
ENCODER_INPUT_DIM = 200 # placeholder, later overwritten with train_loader.dataset.vocab_size_source
DECODER_INPUT_DIM = 200 # placeholder, later overwritten with train_loader.dataset.vocab_size_target
EMBEDDING_DIM = 256
ENCODER_NUM_LAYERS = 3
DECODER_NUM_LAYERS = 3
ENCODER_NUM_HEADS = 8
DECODER_NUM_HEADS = 8
ENCODER_FEEDFORWARD_DIM = 512
DECODER_FEEDFORWARD_DIM = 512
ENCODER_DROPOUT = 0.1
DECODER_DROPOUT = 0.1
# Size of the positional embedding tables of the encoder and the decoder.
MAX_LENGTH = 200

OPTIMIZER_LEARNING_RATE = 0.0005

# Функции подсчёта метрик и создания предсказания

In [40]:
def calculate_bleu(model, source_texts, target_texts):
    """
    Translate every source text with the model and score the result with sacreBLEU
    :param model: object with an inference(text) method returning the translation
    :param source_texts: texts to translate
    :param target_texts: reference translations, paired with source_texts by position
    :return: BLEU score, list of predictions and list of single-element reference lists
    """
    pairs = list(zip(source_texts, target_texts))

    predicted_txts = [model.inference(source) for source, _ in pairs]
    # the metric expects a list of references per prediction
    target_txts = [[target] for _, target in pairs]

    bleu = evaluate.load('sacrebleu').compute(predictions=predicted_txts, references=target_txts)
    return bleu['score'], predicted_txts, target_txts

def _import_parallel_texts(split):
    """
    Read the source (German) and target (English) lines of one dataset split
    :param split: split name prepended to the file suffixes, e.g. 'train' or 'val'
    :return: list of source lines and list of target lines (line endings are kept)
    """
    with open(KAGGLE_DATASET_PREFIX + split + SUFFIX_FILE_DE, encoding='utf8') as file:
        source_texts = file.readlines()
    with open(KAGGLE_DATASET_PREFIX + split + SUFFIX_FILE_EN, encoding='utf8') as file:
        target_texts = file.readlines()
    return source_texts, target_texts

def import_validation_texts():
    """
    :return: source and target lines of the validation split
    """
    return _import_parallel_texts('val')

def import_train_texts():
    """
    :return: source and target lines of the train split
    """
    return _import_parallel_texts('train')

def save_validation_prediction(predicted_txts, target_txts):
    """
    Write predictions to 'pred.txt' and references to 'target.txt', one item per line
    :param predicted_txts: list of predicted texts
    :param target_txts: list of reference texts; each item is either a string or a list of
                        reference strings (the format returned by calculate_bleu), in which
                        case the references of one item are joined with a tab
    """
    def _as_line(item):
        # calculate_bleu wraps every reference in a list; formatting that list directly
        # would write its Python repr instead of the text
        if isinstance(item, (list, tuple)):
            item = '\t'.join(str(reference).rstrip('\r\n') for reference in item)
        # texts read with readlines() already end with a newline: normalise to exactly one
        return str(item).rstrip('\r\n') + '\n'

    with open('pred.txt', 'w', encoding='utf8') as f:
        f.writelines(_as_line(line) for line in predicted_txts)
    with open('target.txt', 'w', encoding='utf8') as f:
        f.writelines(_as_line(line) for line in target_txts)

def make_prediction(model, split='test1', output_file='pred_test.txt'):
    """
    Run the model on every line of a split's ``SUFFIX_FILE_DE`` file and save the outputs.

    :param model: object providing ``inference(text)``; its return value is used
        as the prediction for that line
    :param split: split name used as the file-name stem (default 'test1')
    :param output_file: path of the file the predictions are written to, one per line
        (default 'pred_test.txt')
    :return: list of predictions, in the order of the input lines
    """
    with open(KAGGLE_DATASET_PREFIX + split + SUFFIX_FILE_DE, encoding='utf8') as file:
        source_texts = file.readlines()

    predicted_texts = [model.inference(source) for source in source_texts]

    # UTF-8 is set explicitly to match how the input is read,
    # instead of depending on the platform default encoding
    with open(output_file, 'w', encoding='utf8') as f:
        f.writelines(f"{line}\n" for line in predicted_texts)

    return predicted_texts

# Обучение

In [16]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [17]:
# Build the training and validation datasets from the matching '.en' / '.de' corpus files.
# Both datasets are given the same sp_model_prefix.
# NOTE(review): presumably this makes the validation set use the same tokenizer
# as the training set — confirm in TextDataset.
train_set = TextDataset(
    data_file_en=KAGGLE_DATASET_PREFIX + 'train' + SUFFIX_FILE_EN, 
    data_file_de=KAGGLE_DATASET_PREFIX + 'train' + SUFFIX_FILE_DE, 
    sp_model_prefix=KAGGLE_DATASET_PREFIX+ 'experiment_1'
)
valid_set = TextDataset(
    data_file_en=KAGGLE_DATASET_PREFIX + 'val' + SUFFIX_FILE_EN, 
    data_file_de=KAGGLE_DATASET_PREFIX + 'val' + SUFFIX_FILE_DE, 
    sp_model_prefix=KAGGLE_DATASET_PREFIX + 'experiment_1'
)

In [18]:
# Wrap both datasets in loaders: batches of 128 items, 2 worker processes each.
# NOTE(review): shuffle=False is also used for the training loader, so the batch
# order is presumably the same in every epoch — confirm this is intended.
train_loader = DataLoader(train_set, num_workers=2, shuffle=False, batch_size=128)
valid_loader = DataLoader(valid_set, num_workers=2, shuffle=False, batch_size=128)

In [19]:
# Overwrite the placeholder input dimensions with the vocabulary sizes exposed by the training dataset
ENCODER_INPUT_DIM = train_loader.dataset.vocab_size_source
DECODER_INPUT_DIM = train_loader.dataset.vocab_size_target

In [20]:
# Encoder and decoder are configured from the hyperparameter constants;
# both receive the same embedding size and maximum length.
encoder = Encoder(
    input_dim=ENCODER_INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    num_layers=ENCODER_NUM_LAYERS,
    num_heads=ENCODER_NUM_HEADS,
    feedforward_dim=ENCODER_FEEDFORWARD_DIM,
    dropout_prob=ENCODER_DROPOUT,
    max_length=MAX_LENGTH,
    device=device
)
decoder = Decoder(
    input_dim=DECODER_INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    num_layers=DECODER_NUM_LAYERS,
    num_heads=DECODER_NUM_HEADS,
    feedforward_dim=DECODER_FEEDFORWARD_DIM,
    dropout_prob=DECODER_DROPOUT,
    max_length=MAX_LENGTH,
    device=device
)
# The training dataset is handed to the model together with the encoder, decoder and device
model = TranslationModel(encoder, decoder, train_set, device).to(device)
# nn.Module.apply calls set_model_weights on every submodule and on the model itself
model.apply(set_model_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=OPTIMIZER_LEARNING_RATE)
# Targets equal to the dataset's pad_id are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=train_set.pad_id)

In [28]:
# Report the number of trainable parameters (only those with requires_grad set)
print(f'Model parameters number {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')

Model parameters number 6,363,064


In [30]:
# Number of epochs passed to train() as num_epochs
NUM_EPOCH = 30

# Run training with validation on valid_loader; no scheduler is passed (scheduler=None)
train(
    model=model, 
    optimizer=optimizer,
    criterion=criterion,
    scheduler=None,
    train_loader=train_loader,
    val_loader=valid_loader,
    num_epochs=NUM_EPOCH
)

Training 1/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 1/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 1
	Training epoch time: 522.8832960128784 seconds
	Train: loss - 4.231, perplexity -     68.792
	Validation: loss - 3.304, perplexity -     27.214


Training 2/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 2/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 2
	Training epoch time: 521.356502532959 seconds
	Train: loss - 3.136, perplexity -     23.022
	Validation: loss - 2.772, perplexity -     15.984


Training 3/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 3/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 3
	Training epoch time: 521.5803985595703 seconds
	Train: loss - 2.730, perplexity -     15.326
	Validation: loss - 2.474, perplexity -     11.876


Training 4/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 4/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 4
	Training epoch time: 521.0969927310944 seconds
	Train: loss - 2.494, perplexity -     12.105
	Validation: loss - 2.312, perplexity -     10.093


Training 5/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 5/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 5
	Training epoch time: 521.0384550094604 seconds
	Train: loss - 2.337, perplexity -     10.346
	Validation: loss - 2.181, perplexity -      8.857


Training 6/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 6/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 6
	Training epoch time: 520.6049497127533 seconds
	Train: loss - 2.221, perplexity -      9.217
	Validation: loss - 2.106, perplexity -      8.213


Training 7/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 7/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 7
	Training epoch time: 519.7407703399658 seconds
	Train: loss - 2.133, perplexity -      8.444
	Validation: loss - 2.036, perplexity -      7.657


Training 8/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 8/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 8
	Training epoch time: 519.9721374511719 seconds
	Train: loss - 2.065, perplexity -      7.881
	Validation: loss - 1.994, perplexity -      7.343


Training 9/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 9/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 9
	Training epoch time: 520.1948029994965 seconds
	Train: loss - 2.007, perplexity -      7.444
	Validation: loss - 1.951, perplexity -      7.037


Training 10/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 10/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 10
	Training epoch time: 519.938161611557 seconds
	Train: loss - 1.961, perplexity -      7.104
	Validation: loss - 1.921, perplexity -      6.825


Training 11/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 11/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 11
	Training epoch time: 519.4557843208313 seconds
	Train: loss - 1.921, perplexity -      6.827
	Validation: loss - 1.896, perplexity -      6.656


Training 12/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 12/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 12
	Training epoch time: 519.4673662185669 seconds
	Train: loss - 1.886, perplexity -      6.592
	Validation: loss - 1.875, perplexity -      6.519


Training 13/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 13/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 13
	Training epoch time: 520.1328158378601 seconds
	Train: loss - 1.856, perplexity -      6.399
	Validation: loss - 1.851, perplexity -      6.369


Training 14/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 14/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 14
	Training epoch time: 518.9133441448212 seconds
	Train: loss - 1.830, perplexity -      6.231
	Validation: loss - 1.839, perplexity -      6.290


Training 15/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 15/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 15
	Training epoch time: 519.4365215301514 seconds
	Train: loss - 1.806, perplexity -      6.087
	Validation: loss - 1.821, perplexity -      6.177


Training 16/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 16/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 16
	Training epoch time: 519.5117321014404 seconds
	Train: loss - 1.785, perplexity -      5.958
	Validation: loss - 1.806, perplexity -      6.084


Training 17/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 17/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 17
	Training epoch time: 519.1320376396179 seconds
	Train: loss - 1.765, perplexity -      5.843
	Validation: loss - 1.787, perplexity -      5.970


Training 18/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 18/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 18
	Training epoch time: 518.8865311145782 seconds
	Train: loss - 1.748, perplexity -      5.742
	Validation: loss - 1.778, perplexity -      5.916


Training 19/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 19/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 19
	Training epoch time: 518.8149254322052 seconds
	Train: loss - 1.732, perplexity -      5.649
	Validation: loss - 1.769, perplexity -      5.863


Training 20/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 20/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 20
	Training epoch time: 518.7743580341339 seconds
	Train: loss - 1.717, perplexity -      5.568
	Validation: loss - 1.760, perplexity -      5.812


Training 21/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 21/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 21
	Training epoch time: 519.0159680843353 seconds
	Train: loss - 1.703, perplexity -      5.491
	Validation: loss - 1.751, perplexity -      5.758


Training 22/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 22/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 22
	Training epoch time: 518.9846410751343 seconds
	Train: loss - 1.690, perplexity -      5.421
	Validation: loss - 1.741, perplexity -      5.704


Training 23/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 23/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 23
	Training epoch time: 519.5861458778381 seconds
	Train: loss - 1.679, perplexity -      5.361
	Validation: loss - 1.736, perplexity -      5.677


Training 24/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 24/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 24
	Training epoch time: 520.4067349433899 seconds
	Train: loss - 1.668, perplexity -      5.299
	Validation: loss - 1.729, perplexity -      5.635


Training 25/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 25/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 25
	Training epoch time: 519.5121397972107 seconds
	Train: loss - 1.657, perplexity -      5.243
	Validation: loss - 1.721, perplexity -      5.591


Training 26/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 26/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 26
	Training epoch time: 519.3693881034851 seconds
	Train: loss - 1.648, perplexity -      5.198
	Validation: loss - 1.718, perplexity -      5.574


Training 27/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 27/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 27
	Training epoch time: 519.1377968788147 seconds
	Train: loss - 1.639, perplexity -      5.149
	Validation: loss - 1.714, perplexity -      5.549


Training 28/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 28/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 28
	Training epoch time: 519.2391338348389 seconds
	Train: loss - 1.630, perplexity -      5.102
	Validation: loss - 1.707, perplexity -      5.514


Training 29/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 29/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 29
	Training epoch time: 519.316118478775 seconds
	Train: loss - 1.622, perplexity -      5.061
	Validation: loss - 1.703, perplexity -      5.488


Training 30/30:   0%|          | 0/1531 [00:00<?, ?it/s]

Validating 30/30:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 30
	Training epoch time: 519.351496219635 seconds
	Train: loss - 1.614, perplexity -      5.023
	Validation: loss - 1.702, perplexity -      5.484


In [31]:
# Restore the weights stored in 'translation-model.pt'. map_location places the
# saved tensors on the active device, so a checkpoint written on a GPU can also
# be loaded in a CPU-only session.
model.load_state_dict(torch.load('translation-model.pt', map_location=device))

<All keys matched successfully>

# Подсчёт BLEU и получение предсказания

In [32]:
# Load the raw validation lines ('.de' file as source, '.en' file as target)
source_texts, target_texts = import_validation_texts()

In [37]:
# Run the model on the validation sources and score the outputs.
# target_txt receives the references wrapped as single-item lists (see calculate_bleu).
bleu_score, predicted_txts, target_txt = calculate_bleu(model, source_texts, target_texts)
print(f'BLEU score = {bleu_score}')

BLEU score = 30.09090239395271


In [38]:
# Write pred.txt / target.txt; the raw target lines (target_texts) are passed here,
# not the nested reference lists returned by calculate_bleu
save_validation_prediction(predicted_txts, target_texts)

In [39]:
# Run the model on the 'test1' split and write pred_test.txt; the returned list is displayed as the cell output.
# NOTE(review): the trailing 27.35 is presumably the BLEU score obtained for this test prediction — confirm.
make_prediction(model) # 27.35

['and that worked . applause for scott .',
 "and over the course of the next eight or nine days , you 're going to get your game .",
 'thank you very much .',
 'now we have " three-performing " -- and the people are always going back to the back of the day , for a bicycle steel , for the fuel , little good , rather than giving them the things back to the victims .',
 'we have a cognitive mistake so that we can take a perfect disease process for a patient with a chest pain .',
 'so roman speakers realized their language -- not word , the word , the word , the only thing they do , but the subject matter .',
 'and what we found was that there was actually a relationship .',
 "and my little launch propagation is that we 're going to force ourselves to the environment by --",
 "and there 's a generation that has grown up with the internet , and the knowledge that it 's not so hard to do things together , you just have to build the systems .",
 'it was about a way of finding out what the par