## Импорт необходимых зависимостей

In [1]:
import pandas as pd
import numpy as np

import nltk

import torch
import torch.nn as nn
import torch.optim

import pickle
import pathlib
import os

# from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, Dataset

from sklearn import model_selection

from pprint import pprint
from random import choice
from typing import List, Union
from collections import Counter
from itertools import chain

from tqdm.auto import tqdm

from transformers import BertTokenizerFast

In [2]:
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to run kernels
# synchronously, so a device-side error is reported at the call that caused it.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [3]:
# Fetch the NLTK 'punkt' resource without progress output. The trailing
# semicolon presumably keeps the notebook from echoing the return value.
nltk.download('punkt', quiet=True);

## Подготовка данных

In [4]:
# Seed shared by the dataset sampling and the train/test/validation splits.
RANDOM_STATE = 1

In [5]:
# Load the dataset and keep a reproducible random half of its rows.
df = pd.read_csv('../input/lenta-dataset/dataset.csv')
df = df.sample(frac=0.5, random_state=RANDOM_STATE)

In [6]:
# The 'undefined' label occurs in both the gender and the number column; give
# each its own name so it can be looked up unambiguously in `special_tokens`.
# The result is assigned back to the column: an inplace method called on
# `df.gender` is chained assignment and may act on a temporary copy, leaving
# `df` unchanged (pandas Copy-on-Write).
df['gender'] = df['gender'].replace('undefined', 'undefined_g')
df['number'] = df['number'].replace('undefined', 'undefined_n')

In [7]:
# df = df.sample(n=5000, random_state=RANDOM_STATE)

### Разбиение данных на обучающие, тестовые и валидационные

In [8]:
# 90% of the rows go to training; the remaining 10% is split in half into the
# test and validation sets. Both splits are shuffled with the same seed.
train_df, test_df = model_selection.train_test_split(df, train_size=0.9, shuffle=True, random_state=RANDOM_STATE)
test_df, val_df = model_selection.train_test_split(test_df, test_size=0.5, shuffle=True, random_state=RANDOM_STATE)

### Подготовка словаря

In [9]:
# Default number of rows per batch (train and validation splits).
batch_size = 40

In [10]:
# Fast tokenizer of the pretrained DeepPavlov RuBERT checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased')

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

In [11]:
# Marker tokens inserted into the model input.
# The first two entries are the sequence delimiters; the remaining keys are the
# gender / tense / number labels as they appear in the dataset columns, each
# mapped to the token that represents it.
special_tokens = {
    'bos_token': '[BOS]',
    'eos_token': '[EOS]',
    'masc': '[MASC_G]',
    'fem': '[FEM_G]',
    'neut': '[NEUT_G]',
    'undefined_g': '[UNDEF_G]',
    'past': '[PAST_T]',
    'pres': '[PRES_T]',
    'fut': '[FUT_T]',
    'sing': '[SING_N]',
    'plur': '[PLUR_N]',
    'undefined_n': '[UNDEF_N]'
}

In [12]:
# Register [BOS]/[EOS] under their dedicated roles and every remaining marker
# as an additional special token of the tokenizer.
tokenizer.add_special_tokens({
    'bos_token': special_tokens['bos_token'],
    'eos_token': special_tokens['eos_token'],
    'additional_special_tokens': [
        token for key, token in special_tokens.items()
        if key not in ('bos_token', 'eos_token')
    ],
})

12

### Разбиение данных на батчи

In [13]:
def make_batched_dataset(df, tokenizer=tokenizer, batch_size=batch_size):
    """Yield tokenized ``(inputs, targets)`` batches built from ``df``.

    Each input string is
    ``"<nsubj> <gender token> <tense token> <number token> [BOS] <lemmatized text> [EOS]"``
    and each target string is ``"[BOS] <original text> [EOS]"``. Both are
    tokenized without the tokenizer's own special tokens and padded to the
    longest sequence of the batch.

    Args:
        df: DataFrame with the columns ``orig_texts``, ``part_lemm_texts``,
            ``nsubj``, ``gender``, ``tense`` and ``number``. The last three
            must hold keys of ``special_tokens``.
        tokenizer: tokenizer exposing ``bos_token`` / ``eos_token`` and
            returning ``input_ids`` when called on a list of strings.
        batch_size: number of rows per batch.

    Yields:
        Tuples ``(inputs, targets)`` of token-id tensors, each transposed with
        ``permute(1, 0)`` so the sequence dimension comes first.

    Note:
        Only ``len(df) // batch_size`` full batches are produced; trailing rows
        that do not fill a batch are dropped.
    """
    bos_token = tokenizer.bos_token
    eos_token = tokenizer.eos_token

    # Convert every column once; doing it per batch re-materialises the whole
    # column on each iteration.
    orig_texts = df.orig_texts.to_list()
    lemm_texts = df.part_lemm_texts.to_list()
    nsubj_list = df.nsubj.to_list()
    gender_list = df.gender.to_list()
    tense_list = df.tense.to_list()
    number_list = df.number.to_list()

    def encode(texts):
        # Markers are already part of the text, so the tokenizer must not add
        # its own special tokens.
        return tokenizer(texts, add_special_tokens=False, padding='longest',
                         return_tensors='pt', return_attention_mask=False,
                         return_token_type_ids=False).input_ids

    n_batches = len(df) // batch_size

    for n_batch in range(n_batches):
        rows = slice(batch_size * n_batch, batch_size * (n_batch + 1))

        features = zip(lemm_texts[rows], nsubj_list[rows], gender_list[rows],
                       tense_list[rows], number_list[rows])

        inputs = [f'{nsubj} {special_tokens[gender]} {special_tokens[tense]} {special_tokens[number]} {bos_token} {lemm} {eos_token}'
                  for lemm, nsubj, gender, tense, number in features]

        targets = [f'{bos_token} {orig} {eos_token}' for orig in orig_texts[rows]]

        yield encode(inputs).permute(1, 0), encode(targets).permute(1, 0)
        

In [14]:
# Expected number of batches per split, used as progress-bar totals.
# Train/validation batches hold `batch_size` rows; the test split is batched
# one row at a time (batch_size=1 where test_data is built), hence plain len().
train_n_batches = len(train_df) // batch_size
val_n_batches = len(val_df) // batch_size
test_n_batches = len(test_df)

In [15]:
def save_processed_data(train_data, val_data, test_data):
    """Pickle the three prepared splits into ``processed_data_rbt.pkl``.

    The splits are stored as one ``(train_data, val_data, test_data)`` tuple in
    the current directory. Failures are reported on stdout instead of being
    raised, so a failed save never interrupts the notebook.
    """
    target_dir = './'
    file_name = 'processed_data_rbt.pkl'

    try:
        pathlib.Path(target_dir).mkdir(exist_ok=True)
        file_path = f'{target_dir}/{file_name}'

        with open(file_path, 'wb') as out_file:
            pickle.dump((train_data, val_data, test_data), out_file)
    except Exception as e:
        print(f'Failed to save data due to:\n{e}')
    else:
        print(f'Data is saved successfully at {file_path}')

In [16]:
def load_processed_data(path='../input/transformer-vocab/processed_data_rbt.pkl'):
    """Read the pickled splits from ``path``.

    Returns:
        The unpickled object (the saved ``(train, val, test)`` tuple) on
        success, or ``[None, None, None]`` when the file cannot be read; the
        failure reason is printed rather than raised.

    Note:
        Unpickling executes code embedded in the file, so ``path`` should only
        point at trusted data.
    """
    try:
        with open(path, 'rb') as in_file:
            loaded = pickle.load(in_file)
    except Exception as e:
        print(f'Failed to load data due to:\n{e}')
        return [None, None, None]

    print(f'Data is loaded successfully from {path}')
    return loaded

In [17]:
# train_data = make_batched_dataset(train_df)
# val_data = make_batched_dataset(val_df)
# test_data = []
# for i, batch in enumerate(tqdm(make_batched_dataset(test_df, batch_size=1), desc='Unpacking test batches', total=500)):
#     test_data.append(batch)
#     if i == 500:
#         break
# test_data = [batch for batch in tqdm(make_batched_dataset(test_df, batch_size=1), desc='Unpacking test batches', total=test_n_batches)]

In [18]:
load_data = True
save_data = False

# Prefer the cached pickle when requested.
if load_data:
    train_data, val_data, test_data = load_processed_data()

# Tokenize the splits from scratch when caching is disabled or the cache could
# not be read (load_processed_data then returns None for every split).
if not (load_data and train_data is not None):
    train_data = list(tqdm(make_batched_dataset(train_df),
                           desc='Unpacking train batches', total=train_n_batches))
    val_data = list(tqdm(make_batched_dataset(val_df),
                         desc='Unpacking validation batches', total=val_n_batches))
    test_data = list(tqdm(make_batched_dataset(test_df, batch_size=1),
                          desc='Unpacking test batches', total=test_n_batches))

if save_data:
    save_processed_data(train_data, val_data, test_data)

Data is loaded successfully from ../input/transformer-vocab/processed_data_rbt.pkl


In [19]:
class BatchedDataset(Dataset):
    """Map-style dataset over a sequence of already-built batches.

    Every item is one complete batch, which is why the loaders in this
    notebook wrap it with ``batch_size=None``.
    """

    def __init__(self, data):
        # Any sized, indexable container of batches.
        self.data = data

    def __len__(self):
        """Return the number of stored batches."""
        stored = self.data
        return len(stored)

    def __getitem__(self, idx):
        """Return the batch stored at position ``idx``."""
        stored = self.data
        return stored[idx]

In [20]:
# Items are already full batches, so automatic batching is disabled with
# batch_size=None; only the training loader shuffles the batch order.
# NOTE: the names are rebound from lists to loaders, so re-running this cell
# would wrap a DataLoader inside another BatchedDataset.
train_data = DataLoader(BatchedDataset(train_data), batch_size=None, shuffle=True)
val_data = DataLoader(BatchedDataset(val_data), batch_size=None, shuffle=False)

## Определение модели

### Определение класса модели

In [21]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder transformer over a single shared token vocabulary.

    Source and target tokens share one word-embedding table; each side adds its
    own learned positional embedding. The decoder output is projected to
    vocabulary logits.

    Args:
        embedding_size: model width (``d_model`` of ``nn.Transformer``).
        nhead: number of attention heads.
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        dim_feedforward: hidden size of the feed-forward sub-layers.
        dropout: dropout probability inside the transformer.
        vocab_size: number of rows in the embedding table / output logits.
        max_seq_len: longest source or target sequence the learned positional
            embeddings can represent.
        pad_token_id: id of the padding token (zero embedding, masked out).
        device: device the module is placed on at construction.
    """

    def __init__(self, embedding_size, nhead,
                 num_encoder_layers, num_decoder_layers,
                 dim_feedforward, dropout, vocab_size,
                 max_seq_len, pad_token_id, device):

        super().__init__()

        self.pad_token_id = pad_token_id
        self.max_seq_len = max_seq_len
        self.embedding_size = embedding_size
        self.device = device

        self.word_embedding = nn.Embedding(vocab_size, embedding_size, pad_token_id)
        self.input_pos_encoding = nn.Embedding(max_seq_len, embedding_size)
        self.target_pos_encoding = nn.Embedding(max_seq_len, embedding_size)

        self.transformer = nn.Transformer(embedding_size, nhead, num_encoder_layers,
                                          num_decoder_layers, dim_feedforward, dropout)

        self.fc_out = nn.Linear(embedding_size, vocab_size)

        # Must come after the sub-modules are registered; calling it earlier
        # moves nothing.
        self.to(device)

    def get_padding_mask(self, input):
        """Return a boolean mask that is True at padding positions.

        Args:
            input: token ids of shape (seq_len, batch_size).

        Returns:
            Tensor of shape (batch_size, seq_len) on the device of ``input``.
        """
        return input.permute(1, 0) == self.pad_token_id

    def forward(self, input, target):
        """Compute vocabulary logits for every target position.

        Args:
            input: source token ids, shape (input_seq_len, batch_size).
            target: decoder input token ids, shape (target_seq_len, batch_size).

        Returns:
            Logits of shape (target_seq_len, batch_size, vocab_size).

        Raises:
            ValueError: if either sequence is longer than ``max_seq_len``.
        """
        input_seq_len, batch_size = input.shape
        target_seq_len = target.shape[0]

        # Fail early with a readable message; otherwise the positional
        # embedding lookup fails with an index error (a device-side assert on
        # CUDA).
        if max(input_seq_len, target_seq_len) > self.max_seq_len:
            raise ValueError(
                f'Sequence length (input={input_seq_len}, target={target_seq_len}) '
                f'exceeds max_seq_len={self.max_seq_len}'
            )

        # Build the helper tensors directly on the device of the data.
        device = input.device

        input_positions = (torch.arange(input_seq_len, device=device)
                           .unsqueeze(1).expand(input_seq_len, batch_size))
        target_positions = (torch.arange(target_seq_len, device=device)
                            .unsqueeze(1).expand(target_seq_len, batch_size))
        # input_positions shape: (input_seq_len, batch_size)
        # target_positions shape: (target_seq_len, batch_size)

        embedded_input = self.word_embedding(input) + self.input_pos_encoding(input_positions)
        embedded_target = self.word_embedding(target) + self.target_pos_encoding(target_positions)
        # embedded_input shape: (input_seq_len, batch_size, embedding_size)
        # embedded_target shape: (target_seq_len, batch_size, embedding_size)

        input_padding_mask = self.get_padding_mask(input)
        target_padding_mask = self.get_padding_mask(target)
        # input_padding_mask shape: (batch_size, input_seq_len)
        # target_padding_mask shape: (batch_size, target_seq_len)

        # Causal mask: a target position may not attend to later positions.
        target_mask = self.transformer.generate_square_subsequent_mask(target_seq_len).to(device)
        # target_mask shape: (target_seq_len, target_seq_len)

        # memory_key_padding_mask keeps the decoder's cross-attention away from
        # padded encoder positions, so the result does not depend on how much
        # padding the batch happens to contain.
        output = self.transformer(embedded_input, embedded_target,
                                  tgt_mask=target_mask,
                                  src_key_padding_mask=input_padding_mask,
                                  tgt_key_padding_mask=target_padding_mask,
                                  memory_key_padding_mask=input_padding_mask)
        # output shape: (target_seq_len, batch_size, embedding_size)

        output = self.fc_out(output)
        # output shape: (target_seq_len, batch_size, vocab_size)

        return output

## Определение функций-утилит

### Сохранение модели

In [22]:
def save_model(model, optimizer, epoch, val_loss, train_loss, path='./seq2seq_transformer.model'):
    """Write a training checkpoint to ``path`` and print a confirmation.

    The checkpoint is a dict holding the model and optimizer state dicts plus
    the epoch number and the two loss values passed in.
    """
    state = {}
    state['model_state_dict'] = model.state_dict()
    state['optimizer_state_dict'] = optimizer.state_dict()
    state['epoch'] = epoch
    state['val_loss'] = val_loss
    state['train_loss'] = train_loss

    torch.save(state, path)
    print(f'\n\tModel saved successfully at {path}\n')

### Загрузка модели

In [23]:
def load_model(model, optimizer=None, path='../input/transformer-vocab/seq2seq_transformer.model', device=torch.device('cpu')):
    """Restore a checkpoint written by ``save_model`` into ``model``.

    Args:
        model: module whose state dict is overwritten in place.
        optimizer: optional optimizer to restore as well; skipped when None.
        path: checkpoint file to read.
        device: passed to ``torch.load`` as ``map_location``.

    Returns:
        Dict with the stored ``epoch``, ``val_loss`` and ``train_loss``.
    """
    checkpoint = torch.load(path, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Hand back only the bookkeeping values, not the weights.
    return {key: checkpoint[key] for key in ('epoch', 'val_loss', 'train_loss')}

## Место хранения модели

In [24]:
# Directory and file name of the checkpoint; the two parts are concatenated
# where the training function is called.
model_path = {
    'dir': './',
    'name': 'seq2seq_transformer.model'
}

## Обучение модели

### Определение параметров обучения

In [25]:
# Optimisation settings:
#   learning_rate - step size handed to the optimizer
#   epochs        - upper bound of the epoch counter in the training loop
#   max_norm      - gradient-norm clipping threshold
#   patience      - epochs without validation improvement before early stopping
learning_params = {
    'learning_rate': 5e-05,
    'epochs': 10,
    'max_norm': 1.0,
    'patience': 3
}

### Определение параметров сети

In [26]:
# Constructor arguments of Seq2SeqTransformer (passed as **params).
params = {
    'embedding_size': 512,
    'nhead': 8,
    'num_encoder_layers': 4,
    'num_decoder_layers': 4,
    'dim_feedforward': 2048,
    'dropout': 0.1,
    # Includes the special tokens added to the tokenizer above.
    'vocab_size': len(tokenizer.get_vocab()),
    'max_seq_len': 110,
    'pad_token_id': tokenizer.pad_token_id,
    # Fall back to the CPU so the notebook still runs without a GPU.
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')
}

### Инициализация модели, оптимизатора и функции потерь

In [27]:
# Build the network and place its parameters on the configured device.
model = Seq2SeqTransformer(**params).to(params['device'])

In [28]:
# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_params['learning_rate'])

In [29]:
# Token-level cross-entropy; target positions holding the padding id are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=params['pad_token_id'])

In [30]:
load_pretrained_model = True
train_state = None

# Optionally restore weights and optimizer state from the published checkpoint;
# a failed load is reported and training simply starts from scratch.
if load_pretrained_model:
    try:
        train_state = load_model(
            model,
            optimizer,
            path='../input/transformer-vocab/seq2seq_transformer.model',
            device=params['device'],
        )
    except Exception as e:
        print(f'Load failed due to:\n{e}')
    else:
        print(f"Model loaded successfully from ../input/transformer-vocab/seq2seq_transformer.model")

# Continue at the epoch stored in the checkpoint, or at 0 without one.
if train_state is None:
    epoch = 0
else:
    epoch = train_state['epoch']

Model loaded successfully from ../input/transformer-vocab/seq2seq_transformer.model


In [31]:
# train_loss_writer = SummaryWriter('./runs/loss')

In [32]:
def evaluate(test_sample, tokenizer, max_seq_len, device, net=None):
    """Greedily decode one source sequence and return the decoded text.

    Decoding starts from the BOS id and repeatedly appends the most likely next
    token until EOS is produced or ``max_seq_len`` tokens were generated.

    Args:
        test_sample: source token ids of shape (seq_len, 1); only a batch of
            one is supported because the prediction is read with ``.item()``.
        tokenizer: provides ``bos_token_id``, ``eos_token_id`` and ``decode``.
        max_seq_len: maximum number of decoding steps.
        device: device the decoding tensors are created on.
        net: model to decode with; defaults to the notebook-level ``model``.

    Returns:
        The string produced by ``tokenizer.decode`` for the generated ids
        (including the leading BOS id).
    """
    if net is None:
        net = model  # notebook-level model, as in the original signature

    # The source never changes between steps, so move it to the device once.
    source = test_sample.to(device)
    predictions = [tokenizer.bos_token_id]

    # Decode deterministically: no dropout and no autograd graph, whatever mode
    # the caller left the model in. The previous mode is restored afterwards.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for _ in range(max_seq_len):
                target = torch.tensor(predictions, device=device).unsqueeze(1)

                output = net(source, target)
                best_prediction = output.argmax(2)[-1].item()
                predictions.append(best_prediction)

                if best_prediction == tokenizer.eos_token_id:
                    break
    finally:
        net.train(was_training)

    decoded_output = tokenizer.decode(predictions, return_tokenized=False)
    return decoded_output

### Train-скрипт

In [33]:
def _teacher_forced_loss(model, criterion, source, target):
    """Return the loss of predicting ``target[1:]`` from ``target[:-1]``."""
    output = model(source, target[:-1])
    # output shape: (target_seq_len - 1, batch_size, vocab_size)
    vocab_size = output.shape[2]
    return criterion(output.reshape(-1, vocab_size), target[1:].reshape(-1))


def _print_random_sample(model, test_data, tokenizer, max_seq_len, device):
    """Decode one random test example and print its input, output and target."""
    if len(test_data) == 0:
        return

    test_sample = choice(test_data)

    # Decode without dropout and without building an autograd graph, then put
    # the model back into the mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            decoded_output = evaluate(test_sample[0], tokenizer, max_seq_len, device)
    finally:
        model.train(was_training)

    decoded_input = tokenizer.decode(test_sample[0].squeeze(1))
    decoded_target = tokenizer.decode(test_sample[1].squeeze(1))

    print(f'\tInput : {decoded_input}')
    print(f'\tOutput: {decoded_output}')
    print(f'\tTarget: {decoded_target}')


def train(
    model, optimizer, criterion,
    train_data, val_data, test_data,
    epochs, max_norm, patience, current_epoch,
    device, tokenizer, model_path, max_seq_len, n_prints=10
):
    """Train ``model`` with teacher forcing, validation and early stopping.

    Args:
        model: network called as ``model(source, target)``.
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking ``(logits, target_ids)``.
        train_data: sized iterable of ``(source, target)`` batches with the
            sequence dimension first.
        val_data: sized iterable of validation batches in the same format.
        test_data: sequence of single-example batches used only to print
            sample decodings.
        epochs: exclusive upper bound of the epoch counter.
        max_norm: gradient-norm clipping threshold.
        patience: consecutive epochs without validation improvement tolerated
            before training stops.
        current_epoch: epoch number to start from.
        device: device the batches are moved to.
        tokenizer: tokenizer used to decode the printed samples.
        model_path: file every checkpoint is written to.
        max_seq_len: maximum decoding length for the printed samples.
        n_prints: approximate number of progress reports per epoch.

    Note:
        The periodic mid-epoch checkpoints (stored with ``val_loss == -1``) and
        the best-validation checkpoint share ``model_path``, so a later
        periodic save replaces the best-validation file.
    """
    min_mean_val_loss = float('+inf')
    initial_patience = patience

    # Derive the totals from the data actually passed in; guard against a zero
    # interval, which would make the modulo below fail.
    n_train_batches = len(train_data)
    print_every = max(1, n_train_batches // max(1, n_prints))

    last_train_loss = float('nan')

    for epoch in tqdm(range(current_epoch, epochs), 'Epochs'):
        running_train_loss = 0.0
        print(f'\nEpoch [{epoch} / {epochs}]')

        model.train()
        tqdm_iter_batch = tqdm(train_data, desc='Training iterations', total=n_train_batches)
        for iteration, (source, target) in enumerate(tqdm_iter_batch):
            source = source.to(device)
            target = target.to(device)
            # source shape: (input_seq_len, batch_size)
            # target shape: (target_seq_len, batch_size)

            optimizer.zero_grad()

            loss = _teacher_forced_loss(model, criterion, source, target)

            last_train_loss = loss.item()
            running_train_loss += last_train_loss
            tqdm_iter_batch.set_postfix({'train_loss': last_train_loss})

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
            optimizer.step()

            if iteration % print_every == 0:
                # Iteration 0 reports the single first loss; later reports
                # average the losses accumulated since the previous report.
                mean_train_loss = running_train_loss / print_every if iteration != 0 else running_train_loss
                running_train_loss = 0.0
                print(f'\tIteration #{iteration}: training loss = {mean_train_loss}\n')

                if iteration != 0:
                    save_model(model, optimizer, epoch, -1, mean_train_loss, path=model_path)
                    _print_random_sample(model, test_data, tokenizer, max_seq_len, device)

        model.eval()
        val_losses = []
        with torch.no_grad():
            for source, target in tqdm(val_data, desc='Validating iterations', total=len(val_data)):
                source = source.to(device)
                target = target.to(device)
                val_losses.append(_teacher_forced_loss(model, criterion, source, target).item())

        if val_losses:
            mean_val_loss = sum(val_losses) / len(val_losses)
            print(f'\tValidation loss = {mean_val_loss}')

            if mean_val_loss < min_mean_val_loss:
                try:
                    # Store the last training loss as a plain float, not as a
                    # tensor that still references the autograd graph.
                    save_model(model, optimizer, epoch, mean_val_loss, last_train_loss, path=model_path)
                    min_mean_val_loss = mean_val_loss
                    patience = initial_patience
                except Exception as e:
                    print(f'Model saving failed due to unhandled exception:\n{e}')
            else:
                patience -= 1
        else:
            # Nothing to average; leave the early-stopping state untouched.
            print('\tValidation skipped: no validation batches')

        _print_random_sample(model, test_data, tokenizer, max_seq_len, device)

        if patience <= 0:
            print(f'\nModel learning finished due to early stopping')
            break

In [None]:
# Start training at `epoch` (the checkpoint's epoch when one was loaded,
# otherwise 0) with the settings from `learning_params` and `params`.
train(
    model, optimizer, criterion,
    train_data, val_data, test_data,
    learning_params['epochs'], learning_params['max_norm'],
    learning_params['patience'], epoch, params['device'], tokenizer,
    model_path['dir'] + model_path['name'], params['max_seq_len']
)

Epochs:   0%|          | 0/7 [00:00<?, ?it/s]


Epoch [3 / 10]


Training iterations:   0%|          | 0/20651 [00:00<?, ?it/s]

	Iteration #0: training loss = 0.1401817798614502

	Iteration #2065: training loss = 0.20392846738960205


	Model saved successfully at ./seq2seq_transformer.model

	Input : эксперты [MASC_G] [PRES_T] [PLUR_N] [BOS] об это говориться в обзор компании " азбука жилье ". эксперт отмечают, что в ноябре рынок продемонстрировать традиционныи рост покупательскии активность : незначительно, но вырасти потенциальныи спрос, увеличиться число реальныи сделка. [EOS]
	Output: [BOS] об этом говорится в обзоре компании " азбука жилья ". эксперты отмечают, что в ноябре рынок продемонструют традиционныи рост покупательскои активности : незначительно, но вырасти потенциальныи спрос, увеличатся число реальных сделок. [EOS]
	Target: [BOS] об этом говорится в обзоре компании " азбука жилья ". эксперты отмечают, что в ноябре рынок продемонстрировал традиционныи рост покупательскои активности : незначительно, но вырос потенциальныи спрос, увеличилось число реальных сделок. [EOS]
	Iteration #4130: training lo

Validating iterations:   0%|          | 0/1147 [00:00<?, ?it/s]

	Validation loss = 0.18644570824937812

	Model saved successfully at ./seq2seq_transformer.model

	Input : полузащитник [MASC_G] [PAST_T] [SING_N] [BOS] первыи мяч прямым ударом с штрафного на 73 - и минуте забить полузащитник « три лев » эрик даиер. [EOS]
	Output: [BOS] первыи мяч прямым ударом со штрафного на 73 - и минуте забил полузащитник « три львов » эрик даиер. [EOS]
	Target: [BOS] первыи мяч прямым ударом со штрафного на 73 - и минуте забил полузащитник « трех львов » эрик даиер. [EOS]

Epoch [4 / 10]


Training iterations:   0%|          | 0/20651 [00:00<?, ?it/s]

	Iteration #0: training loss = 0.1354968100786209

	Iteration #2065: training loss = 0.16903956515520593


	Model saved successfully at ./seq2seq_transformer.model

	Input : националист [MASC_G] [PAST_T] [SING_N] [BOS] эстонскии националист юри бем, которыи некоторыи время назад быть исключить из каитселиит ( военизированныи формирование, состоящии из доброволец ), добиться отмена это решение, сообщать delfi. [EOS]
	Output: [BOS] эстонскии националист юри бем, которыи некоторое время назад был исключен из каитселиита ( военизированные формирования, состоящии из добровольцев ), добился отмены этого решения, сообщает delfi. [EOS]
	Target: [BOS] эстонскии националист юри бем, которыи некоторое время назад был исключен из каитселиита ( военизированного формирования, состоящего из добровольцев ), добился отмены этого решения, сообщает delfi. [EOS]
	Iteration #4130: training loss = 0.1712283521241195


	Model saved successfully at ./seq2seq_transformer.model

	Input : мужчина [MASC_G] [PAST_

Validating iterations:   0%|          | 0/1147 [00:00<?, ?it/s]

	Validation loss = 0.16834916819435883

	Model saved successfully at ./seq2seq_transformer.model

	Input : участники [MASC_G] [PAST_T] [PLUR_N] [BOS] их участник обсуждали результаты социологическии опрос, также быть озвучить фамилия другои возможныи кандидат. [EOS]
	Output: [BOS] их участники обсуждали результаты социологических опросов, также были озвучены фамилии других возможных кандидатов. [EOS]
	Target: [BOS] их участники обсуждали результаты социологических опросов, также были озвучены фамилии других возможных кандидатов. [EOS]

Epoch [5 / 10]


Training iterations:   0%|          | 0/20651 [00:00<?, ?it/s]

	Iteration #0: training loss = 0.1615220159292221

	Iteration #2065: training loss = 0.14302991716682767


	Model saved successfully at ./seq2seq_transformer.model

	Input : танигути [FEM_G] [PAST_T] [SING_N] [BOS] 43 - летнии танигути совершать восхождение на скалу высота 1984 метра вместе с четырьмя альпинист. [EOS]
	Output: [BOS] 43 - летняя танигути совершала восхождение на скалу высотах 1984 метра вместе с четырьмя альпинистами. [EOS]
	Target: [BOS] 43 - летняя танигути совершала восхождение на скалу высотои 1984 метра вместе с четырьмя альпинистами. [EOS]
	Iteration #4130: training loss = 0.14424578313677422


	Model saved successfully at ./seq2seq_transformer.model

	Input : представители [MASC_G] [PAST_T] [PLUR_N] [BOS] представитель движение сопротивление африканер ( awb ) отказались мстить за свои лидер юджина тербланша ( eugene terreblanche ), убитого на прошлои неделе в юар. [EOS]
	Output: [BOS] представители движения сопротивления африканских ( awb ) отказались мстить за с