In [38]:
#!pip install transformers==4.2.2

In [2]:
!nvidia-smi

Mon May 30 12:39:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.172.01   Driver Version: 450.172.01   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM3...  On   | 00000000:59:00.0 Off |                    0 |
| N/A   42C    P0    53W / 350W |      0MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import pandas as pd
from sklearn.utils import shuffle

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

2022-05-30 12:39:28.269807: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [5]:
import optuna

In [6]:
# Load the train/test paraphrase tables (paths are relative to the notebook's
# working directory). Only the 'original' and 'new' columns are used further down.
train_df = pd.read_csv('../data/markup2train.csv')
test_df = pd.read_csv('../data/markup2test.csv')
# Preview the first rows of the training table.
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,index,path,original,new,var
0,118,131,0024c525-9f48-4e21-8eec-d00aea56deaa,В целях продвижения нового канала продаж «Цифр...,Цель продвижения канала продаж «Цифровой офис...,4
1,1119,1184,00e3fe8f-114e-4b83-a15f-aa7af38f931a,в разрезе центров прибыли верхнего уровня:,по прибыли верхнего уровня:,1
2,1033,1123,00e19c69-1d75-4459-b4e3-8256a6fbdaf3,в срок до 16.12.2021 включительно сформировать...,до 16.12.2021 определить победителей Акции;,4


In [7]:
rephrase_token = '<DECANC>'

In [8]:
# Append the separator token to every source sentence so the model sees
# "<source><DECANC>" as the prompt and learns to continue with the paraphrase.
# NOTE: this cell is not idempotent - re-running it appends the token again.
train_df['original'] = train_df['original'] + rephrase_token
test_df['original'] = test_df['original'] + rephrase_token

In [9]:
train_df['original'].iloc[1]

'в разрезе центров прибыли верхнего уровня:<DECANC>'

In [10]:
# Keep only rows that actually have a target paraphrase.
# `drop=True` discards the old index instead of inserting it as an extra
# column, so re-running this cell no longer changes the frame's columns.
train_df = train_df[train_df['new'].notna()].reset_index(drop=True)
train_df.shape

(5841, 7)

In [11]:
class GPTDataset(torch.utils.data.Dataset):
    """Pairs each tokenized prompt with its tokenized target for causal-LM training.

    ``x`` and ``y`` are read as ``obj['input_ids'][idx]`` and
    ``obj['attention_mask'][idx]``; the per-example sequences are joined with ``+``
    (prompt first, target second).
    """

    def __init__(self, tokenizer, x, y):
        self.tokenizer, self.x, self.y = tokenizer, x, y

    @property
    def n(self):
        """Number of examples, taken from the prompt encodings."""
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        prompt_ids = self.x['input_ids'][idx]
        target_ids = self.y['input_ids'][idx]
        mask = self.x['attention_mask'][idx] + self.y['attention_mask'][idx]
        # Prompt positions get the pad id as a placeholder label; the train/eval
        # loops in this notebook later rewrite pad-id labels to -100.
        placeholder_labels = [self.tokenizer.pad_token_id for _ in prompt_ids]
        return {
            "input_ids": prompt_ids + target_ids,
            "attention_mask": mask,
            "labels": placeholder_labels + target_ids,
        }

In [12]:
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange

In [13]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    """Collate function: pads a list of examples (including ``labels``) into tensors.

    The whole feature list goes through ``tokenizer.pad`` once; the ``labels``
    are then sent through ``tokenizer.pad`` a second time under the
    ``'input_ids'`` key and written back, so every field can be turned into a
    tensor. Replacing pad-id labels by -100 is left to the training/eval loops.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        padded = self.tokenizer.pad(features, padding=True)

        # Second pass: present the labels to pad() as if they were input ids.
        label_view = {'input_ids': padded['labels'], 'attention_mask': padded['attention_mask']}
        padded_labels = self.tokenizer.pad(label_view, padding=True)
        padded['labels'] = padded_labels['input_ids']

        tensors = {}
        for key, value in padded.items():
            tensors[key] = torch.tensor(value)
        return tensors

In [14]:
def cleanup():
    """Run Python's garbage collector, then release PyTorch's cached CUDA memory."""
    # Collect first: unreachable objects must be gone before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    
# Start from a clean state when the notebook is (re-)run.
cleanup()

In [15]:
def evaluate_model(model, tokenizer, test_dataloader):
    """Return the mean loss of ``model`` over ``test_dataloader``.

    Each batch is a dict of tensors that is moved to ``model.device`` and passed
    to the model as keyword arguments. Label positions equal to
    ``tokenizer.pad_token_id`` are rewritten (in place) to -100 before the
    forward pass. Per-batch losses are averaged weighted by the number of
    examples in each batch, so a short last batch is not over-weighted.

    The caller is responsible for switching the model to eval mode.

    Raises:
        ValueError: if the dataloader yields no examples.
    """
    weighted_loss_sum = 0.0
    n_examples = 0

    with torch.no_grad():
        for batch in test_dataloader:
            labels = batch['labels']
            labels[labels == tokenizer.pad_token_id] = -100
            # len(batch) would be the number of dict keys, not the batch size.
            batch_size = labels.shape[0]
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            weighted_loss_sum += batch_size * loss.item()
            n_examples += batch_size

    if n_examples == 0:
        raise ValueError("evaluate_model received an empty dataloader")
    return weighted_loss_sum / n_examples

In [16]:
from copy import deepcopy

ce = torch.nn.CrossEntropyLoss(reduction='none')


def train_loop(
    model, tokenizer, train_dataloader, val_dataloader,
    lr=3e-5,
    weight_decay=1e-2,
    max_epochs=10,
    gradient_accumulation_steps=1,
    early_stop_round=2
):
    """Fine-tune ``model`` with Adam, early-stopping on validation loss.

    Args:
        model: causal LM whose forward pass returns an object with ``.loss``.
        tokenizer: only ``pad_token_id`` is read; labels equal to it are set to -100.
        train_dataloader / val_dataloader: yield dicts of tensors (see the collator).
        lr, weight_decay: passed to ``torch.optim.Adam``.
        max_epochs: upper bound on the number of epochs.
        gradient_accumulation_steps: accepted for compatibility but currently
            unused - the optimizer steps after every batch.
        early_stop_round: stop after this many consecutive epochs without a
            new best validation loss.

    Steps that raise (e.g. CUDA out-of-memory) are reported and skipped.

    Returns:
        ``(best_eval, best_model)``. ``best_model`` is the ``model`` object that
        was passed in, with the weights of the best epoch restored in place, or
        ``None`` if no epoch improved on the initial ``inf``.
    """
    cleanup()
    print(f"LR = {lr}, weight_decay={weight_decay}")
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)

    last_loss = 0
    step = 0
    best_state = None
    best_eval = float('inf')
    es = 0
    model.train()
    for epoch in trange(max_epochs):
        tq = tqdm(train_dataloader)
        i = -1  # keeps the epoch summary printable if the dataloader is empty
        for i, batch in enumerate(tq):
            failed = False
            try:
                batch['labels'][batch['labels'] == tokenizer.pad_token_id] = -100
                loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
                loss.backward()
            except Exception as e:
                print('error on step', i, e)
                failed = True
            if failed:
                # Recover outside the except block: while the exception is being
                # handled its traceback keeps the failed step's tensors alive,
                # so the cache could not be released there.
                loss = None
                # A failed backward may leave partial gradients behind; drop them
                # so they do not leak into the next optimizer step.
                for p in model.parameters():
                    p.grad = None
                cleanup()
                continue

            optimizer.step()
            for p in model.parameters():
                p.grad = None
            step += 1

            last_loss = loss.item()
            tq.set_description(f'loss: {last_loss:4.4f}')

        model.eval()
        eval_loss = evaluate_model(model, tokenizer, val_dataloader)
        model.train()
        print(f'epoch {epoch}, step {i}/{step}: train loss: {last_loss:4.4f}  val loss: {eval_loss:4.4f}')
        if eval_loss < best_eval:
            es = 0
            best_eval = eval_loss
            # Snapshot the weights on the CPU instead of deep-copying the whole
            # model on the GPU, which would hold a second full model in GPU memory.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }
            print("New best pretrain_iteration")
        else:
            es += 1
        if es == early_stop_round:
            print("Early Stop!")
            break
        cleanup()
    cleanup()

    if best_state is None:
        return best_eval, None
    # Roll the model back to the best epoch before handing it to the caller.
    model.load_state_dict(best_state)
    return best_eval, model

In [17]:
def train_model(x, y, model_name, test_size=0.1, batch_size=32, **kwargs):
    """Fine-tune ``model_name`` on (x, y) text pairs and return the best validation loss.

    Loads the pretrained model (moved to CUDA) and tokenizer, registers the
    rephrase and pad special tokens, holds out ``test_size`` of the pairs for
    validation (fixed ``random_state=42``) and delegates to ``train_loop``.
    Extra keyword arguments are forwarded to ``train_loop``.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    special_tokens = {'additional_special_tokens': [rephrase_token], 'pad_token': '[PAD]'}
    tokenizer.add_special_tokens(special_tokens)
    # The vocabulary grew, so the embedding matrix has to grow with it.
    model.resize_token_embeddings(len(tokenizer))

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=test_size, random_state=42)
    train_set = GPTDataset(tokenizer, tokenizer(x_train), tokenizer(y_train))
    val_set = GPTDataset(tokenizer, tokenizer(x_val), tokenizer(y_val))

    loader_options = dict(
        batch_size=batch_size,
        drop_last=False,
        shuffle=True,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
        pin_memory=True,
    )
    train_dataloader = DataLoader(train_set, **loader_options)
    val_dataloader = DataLoader(val_set, **loader_options)

    best_loss, _ = train_loop(model, tokenizer, train_dataloader, val_dataloader, **kwargs)
    return best_loss

In [19]:
# Keep only the (source, target) text columns of each split; the training
# cells below read datasets['train'].
datasets = {
    'train': train_df[['original', 'new']],
    'test': test_df[['original', 'new']]
}

In [20]:
model_name = 'sberbank-ai/rugpt3large_based_on_gpt2'

In [21]:
import optuna

In [22]:
def objective(trial):
    """Optuna objective: sample lr / weight decay, fine-tune, return the validation loss.

    Both hyper-parameters are drawn log-uniformly. Torch seeds are reset to 0
    before every trial so trials differ only in the sampled values.
    """
    print(f'Trial {trial.number} is started')
    lr = trial.suggest_float("lr", 1e-7, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1, log=True)

    seed = 0
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    train_frame = datasets['train']
    print(f'\n\n\n  train \n=====================\n\n')
    return train_model(
        train_frame['original'].tolist(),
        train_frame['new'].tolist(),
        model_name=model_name,
        batch_size=16,
        lr=lr,
        weight_decay=weight_decay,
    )

In [140]:
study.best_trial

FrozenTrial(number=55, values=[4.286579264534844], datetime_start=datetime.datetime(2022, 5, 22, 0, 15, 29, 157451), datetime_complete=datetime.datetime(2022, 5, 22, 0, 20, 55, 638675), params={'lr': 9.096840128561833e-05, 'weight_decay': 0.09598965983303268}, distributions={'lr': LogUniformDistribution(high=0.01, low=1e-06), 'weight_decay': LogUniformDistribution(high=0.1, low=1e-07)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=55, state=TrialState.COMPLETE, value=None)

In [143]:
# import pickle

# with open('optuna_history_t5.pkl', 'wb') as f:
#     pickle.dump(study, f)

In [25]:
import pickle

# Restore a previously saved Optuna study.
# SECURITY: pickle.load can execute arbitrary code - only load files you created/trust.
# NOTE(review): the file name says "t5"; presumably the study from the T5
# tuning run is reused for this GPT model - confirm this is intended.
with open('optuna_history_t5.pkl', 'rb') as f:
    study = pickle.load(f)

In [26]:
study.best_params

{'lr': 9.096840128561833e-05, 'weight_decay': 0.09598965983303268}

In [27]:
def get_best_model(x, y, model_name, test_size=0.1, batch_size=32, **kwargs):
    """Fine-tune ``model_name`` on (x, y) text pairs and return the best model.

    Same pipeline as ``train_model`` (pretrained model on CUDA, rephrase/pad
    special tokens, ``test_size`` hold-out with ``random_state=42``), but the
    model selected by ``train_loop`` is returned instead of its loss.
    Extra keyword arguments are forwarded to ``train_loop``.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    special_tokens = {'additional_special_tokens': [rephrase_token], 'pad_token': '[PAD]'}
    tokenizer.add_special_tokens(special_tokens)
    # New tokens were added, so resize the embedding matrix to match.
    model.resize_token_embeddings(len(tokenizer))

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=test_size, random_state=42)
    train_set = GPTDataset(tokenizer, tokenizer(x_train), tokenizer(y_train))
    val_set = GPTDataset(tokenizer, tokenizer(x_val), tokenizer(y_val))

    collate = DataCollatorWithPadding(tokenizer=tokenizer)

    def make_loader(dataset):
        return DataLoader(
            dataset,
            batch_size=batch_size,
            drop_last=False,
            shuffle=True,
            collate_fn=collate,
            pin_memory=True,
        )

    train_dataloader = make_loader(train_set)
    val_dataloader = make_loader(val_set)

    _, best_model = train_loop(model, tokenizer, train_dataloader, val_dataloader, **kwargs)
    return best_model

In [None]:
# Train the final model with the hyper-parameters of the loaded Optuna study.
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
d = datasets['train']
# The placeholders `lr=# use T5 lr` / `weight_decay=# use T5 wd` were a syntax
# error; the recorded run used exactly study.best_params, so read them from there.
model = get_best_model(d['original'].tolist(),
                        d['new'].tolist(),
                        model_name=model_name,
                        batch_size=16,
                        lr=study.best_params['lr'],
                        weight_decay=study.best_params['weight_decay'],
                        )

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


LR = 9.096840128561833e-05, weight_decay=0.09598965983303268


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

error on step 0 CUDA out of memory. Tried to allocate 34.00 MiB (GPU 0; 31.75 GiB total capacity; 18.88 GiB already allocated; 4.25 MiB free; 19.00 GiB reserved in total by PyTorch)
error on step 2 CUDA out of memory. Tried to allocate 1.20 GiB (GPU 0; 31.75 GiB total capacity; 16.56 GiB already allocated; 1.15 GiB free; 17.85 GiB reserved in total by PyTorch)
Exception raised from malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:272 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f374783f1e2 in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1e64b (0x7f3747a9564b in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1f464 (0x7f3747a96464 in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1faa1 (0x7f3747a96aa1 in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10_

  0%|          | 0/329 [00:00<?, ?it/s]

error on step 1 CUDA out of memory. Tried to allocate 1.21 GiB (GPU 0; 31.75 GiB total capacity; 17.21 GiB already allocated; 956.25 MiB free; 18.07 GiB reserved in total by PyTorch)
Exception raised from malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:272 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f374783f1e2 in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1e64b (0x7f3747a9564b in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1f464 (0x7f3747a96464 in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1faa1 (0x7f3747a96aa1 in /home/user/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x11e (0x7f374a7bd52e in /home/user/conda/lib/python3.

  0%|          | 0/329 [00:00<?, ?it/s]

error on step 2 CUDA out of memory. Tried to allocate 1.51 GiB (GPU 0; 31.75 GiB total capacity; 18.47 GiB already allocated; 74.25 MiB free; 18.93 GiB reserved in total by PyTorch)
error on step 6 CUDA out of memory. Tried to allocate 1.32 GiB (GPU 0; 31.75 GiB total capacity; 15.57 GiB already allocated; 1.18 GiB free; 17.82 GiB reserved in total by PyTorch)
error on step 9 CUDA out of memory. Tried to allocate 1.48 GiB (GPU 0; 31.75 GiB total capacity; 17.97 GiB already allocated; 386.25 MiB free; 18.62 GiB reserved in total by PyTorch)
error on step 18 CUDA out of memory. Tried to allocate 196.00 MiB (GPU 0; 31.75 GiB total capacity; 17.94 GiB already allocated; 120.25 MiB free; 18.88 GiB reserved in total by PyTorch)
error on step 21 CUDA out of memory. Tried to allocate 294.00 MiB (GPU 0; 31.75 GiB total capacity; 18.04 GiB already allocated; 232.25 MiB free; 18.78 GiB reserved in total by PyTorch)
error on step 36 CUDA out of memory. Tried to allocate 1.33 GiB (GPU 0; 31.75 GiB 

In [30]:
# Persist the whole fine-tuned model object (torch.save pickles the object
# itself here, not just a state_dict), so loading needs the same class definitions.
torch.save(model,'../models/gpt_optuna_chekpoint.pt')

In [27]:
base_model_name = 'sberbank-ai/rugpt3small_based_on_gpt2'
# model_name = 't5_base_train_300'

In [33]:
# Module-level tokenizer used by paraphrase(). It registers the same special
# tokens as get_best_model(), so token ids line up with the resized embeddings.
# NOTE(review): this loads `model_name`, not the `base_model_name` defined in
# the previous cell - confirm which one is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(
    {'additional_special_tokens': [rephrase_token], 'pad_token': '[PAD]'}
)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


2

In [34]:
def paraphrase(text, model, n=None, max_length='auto', temperature=0.0, beams=3, repetition_penalty=1.0):
    """Generate a paraphrase continuation for ``text`` with beam search.

    Relies on the module-level ``tokenizer`` and ``rephrase_token``.

    Args:
        text: a prompt string or a list of prompt strings; prompts are expected
            to end with ``rephrase_token``. Prompts are tokenized without
            padding, so a list should hold a single prompt (or prompts of equal
            token length).
        model: the fine-tuned causal LM; it is switched to eval mode.
        n: number of sequences to return. If falsy, one sequence is generated
            and only the generated part for the first prompt is returned.
        max_length: generation limit in tokens; ``'auto'`` means twice the
            prompt length.
        temperature, beams, repetition_penalty: forwarded to ``model.generate``.

    Returns:
        A string (generated text with the prompt stripped) when ``n`` is falsy,
        otherwise the list of decoded sequences (prompt included).
    """
    model.eval()
    prompts = [text] if isinstance(text, str) else text
    inputs = tokenizer(prompts, return_tensors='pt')['input_ids'].to(model.device)
    if max_length == 'auto':
        max_length = inputs.shape[1] * 2
    result = model.generate(
        inputs,
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=max_length,
        bad_words_ids=[[2]],  # unk
        num_beams=beams,
        early_stopping=True,
    )
    decoded = [tokenizer.decode(r, skip_special_tokens=True) for r in result]
    if not n:
        # Use the normalised prompt list: `text[0]` would be a single character
        # when `text` is a plain string, which made the slice start negative.
        first_prompt = prompts[0]
        prefix_len = len(first_prompt)
        # Decoding skips special tokens, so the rephrase token is absent from
        # the decoded prompt; only subtract it when the prompt really ends with it.
        if first_prompt.endswith(rephrase_token):
            prefix_len -= len(rephrase_token)
        return decoded[0][prefix_len:]
    return decoded

In [35]:
import sys
sys.path.insert(1, '../evaluation/')
from metrics import count_metrics

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [36]:
test_text = ['Начиная с премирования по итогам работы за январь 2021 г., руководствоваться перечнем операций и коэффициентами пересчета продуктов в условные продукты (далее - УП) для менеджеров по продажам в соответствии с Приложением к настоящему Распоряжению.',
            'В случае подтверждения устранения Катастрофической или Серьезной ошибки Заказчик самостоятельно осуществляет тиражирование ПО в своих подразделениях и филиалах.',
            'Контроль за исполнением настоящего Распоряжения оставляю за собой.',
            'Утвердить перечень автоматизированных систем и информационных ресурсов Банка, доступных к подключению Cотрудникам (Приложение 1).']

In [37]:
# Smoke-test the fine-tuned model on a few sentences. Each prompt is passed as
# a one-element list with the rephrase token appended, i.e. in the same format
# as the training inputs.
for test_string in test_text:
    print(f"ORIGINAL: {test_string}")
    print("REPHRASED: ", paraphrase([test_string + rephrase_token], model,temperature=1, beams=10, repetition_penalty=10.0))
    print("####", end='\n\n')

ORIGINAL: Начиная с премирования по итогам работы за январь 2021 г., руководствоваться перечнем операций и коэффициентами пересчета продуктов в условные продукты (далее - УП) для менеджеров по продажам в соответствии с Приложением к настоящему Распоряжению.
REPHRASED:  Начиная с премирования по итогам работы за январь 2021 г., руководствоваться перечнем операций и коэффициентами пересчета продуктов в условные продукты (далее - УП) для менеджеров по продажам по Приложению к этому Распоряжению.
####

ORIGINAL: В случае подтверждения устранения Катастрофической или Серьезной ошибки Заказчик самостоятельно осуществляет тиражирование ПО в своих подразделениях и филиалах.
REPHRASED:  При подтверждении устранения Катастрофической или Серьезной ошибки Заказчик самостоятельно тиражирует ПО в подразделениях и филиалах.
####

ORIGINAL: Контроль за исполнением настоящего Распоряжения оставляю за собой.
REPHRASED:  Исполнение этого Распоряжения оставляю за собой.
####

ORIGINAL: Утвердить перечень 