# Дипломная работа на тему: От нейросетевого анализа к словарному: методы разработки морфологических анализаторов на основе данных, размеченных нейросетью
### Автор: _Феоктистова Эмма Александровна, 4 курс ФиКЛ_
### Научный руководитель: _Проф. Ляшевская О. Н._

## 0. Необходимые импорты

In [None]:
import conllu
from conllu import parse

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange

In [None]:
from typing import List, Dict, Union

## 2. T5

In [None]:
!pip install transformers[torch]==4.3
!pip install ufal.udpipe
!pip install ruprompts
!pip install datasets

Collecting transformers[torch]==4.3
  Using cached transformers-4.3.0-py3-none-any.whl (1.8 MB)
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp39-cp39-win_amd64.whl (2.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3


ERROR: Could not install packages due to an OSError: [WinError 5] Отказано в доступе: 'C:\\ProgramData\\Miniconda3\\Lib\\site-packages\\~%kenizers\\tokenizers.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.







Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.3
    Uninstalling tokenizers-0.10.3:
      Successfully uninstalled tokenizers-0.10.3
Successfully installed tokenizers-0.13.3








Указываем путь к используемым датасетам:

In [None]:
# Load the prepared SynTagRus train and dev splits from CSV.
train_data = pd.read_csv('./prepared_data/SynTagRus/syntagrus_train_data.csv')
val_data = pd.read_csv('./prepared_data/SynTagRus/syntagrus_dev_data.csv')
# Stack both splits into a single frame; ignore_index=True renumbers the rows
# so the combined frame has one continuous index.
full_train_data = pd.concat([train_data, val_data], ignore_index=True)
# Bare expression: displays the frame as the cell output.
full_train_data

Unnamed: 0,form,data
0,Анкета,"анкета,NOUN,Inan,Nom,Fem,Sing"
1,.,".,PUNCT,None"
2,Начальник,"начальник,NOUN,Anim,Nom,Masc,Sing"
3,областного,"областной,ADJ,Gen,Pos,Neut,Sing"
4,управления,"управление,NOUN,Inan,Gen,Neut,Sing"
...,...,...
1359887,внимание,"внимание,NOUN,Inan,Nom,Neut,Sing"
1359888,-,"-,PUNCT,None"
1359889,большая,"большой,ADJ,Nom,Pos,Fem,Sing"
1359890,редкость,"редкость,NOUN,Inan,Nom,Fem,Sing"


In [None]:
class PairsDataset(Dataset):
    """Dataset that pairs tokenized source sequences with tokenized target sequences."""

    def __init__(self, x, y):
        """
        x - tokenized sources, a mapping of per-example lists; example:
            {
                'input_ids': [[55, 27, 103, 172], [157, 24529, 4088, 2], ...],
                'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1], ...]
            }
        y - tokenized targets in the same layout as `x`
        """
        self.x = x
        self.y = y

    def __len__(self):
        return self.n

    @property
    def n(self):
        # Number of examples = number of encoded source sequences.
        return len(self.x['input_ids'])

    def __getitem__(self, idx):
        """
        idx - index of the requested example

        returns dict holding every field of `x` at position `idx`, plus:
            'decoder_attention_mask' - 'attention_mask' of `y` at `idx`
            'labels' - 'input_ids' of `y` at `idx`
        """
        # Only the upper bound is checked; the index must be below the number of sources.
        assert idx < self.n
        example = {}
        for field, column in self.x.items():
            example[field] = column[idx]
        example['decoder_attention_mask'] = self.y['attention_mask'][idx]
        example['labels'] = self.y['input_ids'][idx]
        return example

In [None]:
def cleanup():
    """
    Free memory between training phases: run Python garbage collection,
    then release PyTorch's cached CUDA memory.
    """
    gc.collect()  # collect unreachable Python objects first
    torch.cuda.empty_cache()  # then ask the CUDA allocator to drop its unused cache

Выбираем модель (мы использовали более маленькую модель _T5-small_ для быстроты обучения): 

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the model and the
# tokenizer. The commented-out line is the alternative checkpoint.
# model_name = 'sberbank-ai/ruT5-base'
model_name = 't5-small'

In [None]:
# Load the pretrained seq2seq model named by `model_name`.
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Bare expression: prints the module structure as the cell output.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

Используем уже предобученный токенайзер и преобразуем данные:

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Parallel Python lists: the 'form' column is the model input, the 'data' column the target.
x, y = full_train_data['form'].tolist(), full_train_data['data'].tolist()

In [None]:
# Tokenize inputs and targets separately and wrap them as (source, target) pairs.
dataset = PairsDataset(tokenizer(x), tokenizer(y))

In [None]:
# Preview the first example to check the encoded fields.
dataset[0]

{'input_ids': [3, 2, 7184, 6652, 1757, 15517, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'decoder_attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [3,
  25873,
  6652,
  1757,
  15517,
  6,
  7400,
  7443,
  6,
  1570,
  152,
  6,
  4168,
  51,
  6,
  371,
  15,
  51,
  6,
  134,
  53,
  1]}

Определяем параметры обучающего датасета:

In [None]:
# Batches of 8 shuffled examples; the last, possibly smaller batch is kept.
# NOTE(review): no `collate_fn` is given here, while `train_model` builds its
# own loaders with a padding collator. The examples are variable-length lists,
# so the default collation presumably cannot batch them. Confirm whether this
# loader is used anywhere.
train_dataloader = DataLoader(
    dataset, 
    batch_size=8, 
    drop_last=False, 
    shuffle=True
)

In [None]:
# Release garbage and cached CUDA memory before training starts.
cleanup()

Функции для оценки модели и ее обучения с настраиваемыми параметрами:

In [None]:
def evaluate_model(model, test_dataloader):
    """
    Compute the average loss of `model` over all batches of `test_dataloader`.

    model - callable as `model(**batch)` that returns an object with a `.loss`
        tensor; must expose `.device`
    test_dataloader - iterable of batches, each a dict of tensors whose first
        dimension is the batch size (may contain a 'labels' entry)

    returns float: batch losses averaged with every batch weighted by its
        number of examples; NaN when the dataloader yields no batches.

    Label positions equal to 0 are replaced by -100 before the forward pass,
    exactly as `train_loop` does for training batches, so training and
    validation losses are computed over the same positions.
    (0 is presumably the tokenizer's pad id; it is the value `train_loop` masks.)
    """
    weighted_loss_sum = 0.0
    n_examples = 0

    with torch.no_grad():
        for batch in test_dataloader:
            inputs = {k: v.to(model.device) for k, v in batch.items()}
            if 'labels' in inputs:
                # masked_fill returns a new tensor, so the caller's batch is not modified.
                inputs['labels'] = inputs['labels'].masked_fill(inputs['labels'] == 0, -100)
            loss = model(**inputs).loss

            # len() of the batch dict would be the number of keys, not the number
            # of examples; take the size of the first tensor's leading dimension.
            batch_size = len(next(iter(inputs.values())))
            weighted_loss_sum += batch_size * loss.item()
            n_examples += batch_size

    if n_examples == 0:
        # Nothing to evaluate: return NaN instead of dividing by zero.
        return float('nan')
    return weighted_loss_sum / n_examples

In [None]:
def train_loop(
    model, train_dataloader, val_dataloader,
    max_epochs=30,
    max_steps=1_000,
    lr=3e-5,
    gradient_accumulation_steps=1,
    cleanup_step=100,
    report_step=300,
    window=100,
    save_step=1000,
    checkpoint_prefix='t5_checkpoint',
):
    """
    Train `model` in place with Adam until `max_steps` optimizer steps or
    `max_epochs` passes over `train_dataloader` are done, whichever comes first.

    model - model callable as `model(**batch)` returning an object with `.loss`;
        must expose `.device`, `.parameters()` and `.save_pretrained()`
    train_dataloader - iterable of dict batches of tensors containing 'labels'
    val_dataloader - dataloader passed to `evaluate_model`, or None to skip validation
    max_epochs - upper bound on passes over `train_dataloader`
    max_steps - upper bound on optimizer steps
    lr - Adam learning rate
    gradient_accumulation_steps - number of batches accumulated per optimizer step
    cleanup_step - call `cleanup()` every this many batches
    report_step - run validation and print losses every this many batches
    window - cap on the averaging window of the running training loss
    save_step - save a checkpoint every this many optimizer steps (0/None disables)
    checkpoint_prefix - checkpoints go to f'{checkpoint_prefix}_{step}' (None disables)

    returns None; the model is updated in place.
    """
    cleanup()
    optimizer = torch.optim.Adam(
        params=[p for p in model.parameters() if p.requires_grad],
        lr=lr,
    )

    ewm_loss = 0
    step = 0
    model.train()

    for epoch in trange(max_epochs):
        print(step, max_steps)
        if step >= max_steps:
            break
        tq = tqdm(train_dataloader)
        for i, batch in enumerate(tq):
            try:
                # -100 is the label value the loss ignores; label id 0 is masked out.
                batch['labels'][batch['labels'] == 0] = -100
                loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
                loss.backward()
            except Exception as e:
                # Best effort: report the failing batch and keep training.
                # Gradients are reset because a failed backward pass may have
                # left them partially accumulated.
                print('error on step', i, e)
                optimizer.zero_grad()
                cleanup()
                continue

            # (i + 1) so that the first batch also counts towards accumulation.
            if (i + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                step += 1
                # Checkpoint only right after an optimizer step, so the untrained
                # model is never saved and no step is saved more than once.
                if save_step and checkpoint_prefix is not None and step % save_step == 0:
                    model.save_pretrained(f'{checkpoint_prefix}_{step}')
                if step >= max_steps:
                    break

            if i % cleanup_step == 0:
                cleanup()

            # Running mean over the first `window` batches, exponential average afterwards.
            w = 1 / min(i + 1, window)
            ewm_loss = ewm_loss * (1 - w) + loss.item() * w
            tq.set_description(f'loss: {ewm_loss:4.4f}')

            if (i and i % report_step == 0 or i == len(train_dataloader) - 1) and val_dataloader is not None:
                model.eval()
                eval_loss = evaluate_model(model, val_dataloader)
                model.train()
                print(f'epoch {epoch}, step {i}/{step}: train loss: {ewm_loss:4.4f}  val loss: {eval_loss:4.4f}')

    cleanup()

In [None]:
def train_model(x, y, model_name, test_size=0.1, batch_size=8, **kwargs):
    """
    Load a pretrained T5 checkpoint, train it on (x, y) text pairs and return it.

    x, y - parallel collections of source and target texts
    model_name - checkpoint identifier for both the model and the tokenizer
    test_size - `test_size` argument of `train_test_split` (held-out validation part)
    batch_size - batch size of the training and validation dataloaders
    kwargs - forwarded unchanged to `train_loop`
    """
    model = T5ForConditionalGeneration.from_pretrained(model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Fixed random_state keeps the train/validation split reproducible.
    src_train, src_val, tgt_train, tgt_val = train_test_split(
        x, y, test_size=test_size, random_state=42,
    )
    train_pairs = PairsDataset(tokenizer(src_train), tokenizer(tgt_train))
    val_pairs = PairsDataset(tokenizer(src_val), tokenizer(tgt_val))

    # Both loaders share the same settings and the same padding collator.
    loader_options = dict(
        batch_size=batch_size,
        drop_last=False,
        shuffle=True,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    train_dataloader = DataLoader(train_pairs, **loader_options)
    val_dataloader = DataLoader(val_pairs, **loader_options)

    train_loop(model, train_dataloader, val_dataloader, **kwargs)
    return model

In [None]:
# Named training sets; the key is used as a label in log output and in
# saved-model directory names by the training cell.
datasets = {
    'train': full_train_data
}

Используем _DataCollatorWithPadding_ для приведения наших данных в единый формат (в задачах NLP входные данные обычно представляют собой последовательности токенов разной длины; данный класс приводит их к одной длине с помощью паддинга).

In [None]:
class DataCollatorWithPadding:
    """Collate function that pads encoder inputs and decoder targets and returns tensors."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        features - list of per-example dicts, as produced by `PairsDataset.__getitem__`:
            [
                {'input_ids': [...], 'attention_mask': [...],
                 'decoder_attention_mask': [...], 'labels': [...]},
                ...
            ]

        returns dict with the same keys, each value converted to a `torch.Tensor`.
        """
        # First pass: pad the examples as given (the source side).
        padded = self.tokenizer.pad(features, padding=True)

        # Second pass: present the targets under the tokenizer's own field names
        # so that they are padded as well.
        target_side = {
            'input_ids': padded['labels'],
            'attention_mask': padded['decoder_attention_mask'],
        }
        padded_targets = self.tokenizer.pad(target_side, padding=True)
        padded['labels'] = padded_targets['input_ids']
        padded['decoder_attention_mask'] = padded_targets['attention_mask']

        tensors = {}
        for name, values in padded.items():
            tensors[name] = torch.tensor(values)
        return tensors


Сохраняем промежуточные веса модели через определённое количество пройденных шагов (steps), следим за уменьшением loss на обучающем и валидационном датасетах:

In [None]:
%%time
# Train one model per step budget. Each `train_model` call loads the pretrained
# checkpoint again, so every run starts from the same weights rather than
# continuing the previous run.
for steps in [1000, 10000, 50000, 100000, 500000, 1000000, 1359892]:
    for dname, d in datasets.items():
        print(f'\n\n\n  {dname}  {steps} \n=====================\n\n')
        # max_epochs is set high so that max_steps is the limit that ends training.
        model = train_model(
            d['form'].tolist(), 
            d['data'].tolist(), 
            model_name=model_name, 
            batch_size=2, 
            max_epochs=1000, 
            max_steps=steps
        )
        # Save the final weights of this run under a name that records the step budget.
        model.save_pretrained(f't5_small_{dname}_{steps}')




  train  1000 




  0%|          | 0/1000 [00:00<?, ?it/s]

0 1000


  0%|          | 0/611951 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0, step 300/300: train loss: 3.2998  val loss: 4.1076
epoch 0, step 600/600: train loss: 1.8564  val loss: 4.0339
epoch 0, step 900/900: train loss: 1.3853  val loss: 4.1101
1000 1000



  train  10000 




  0%|          | 0/1000 [00:00<?, ?it/s]

0 10000


  0%|          | 0/611951 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0, step 300/300: train loss: 3.1746  val loss: 4.1643
epoch 0, step 600/600: train loss: 1.8473  val loss: 4.0684
epoch 0, step 900/900: train loss: 1.3142  val loss: 4.2081
epoch 0, step 1200/1200: train loss: 1.0787  val loss: 4.2530
epoch 0, step 1500/1500: train loss: 0.9359  val loss: 4.2127
epoch 0, step 1800/1800: train loss: 0.7887  val loss: 4.1764
epoch 0, step 2100/2100: train loss: 0.7222  val loss: 4.0566
epoch 0, step 2400/2400: train loss: 0.6963  val loss: 4.0206
epoch 0, step 2700/2700: train loss: 0.6862  val loss: 3.9439
epoch 0, step 3000/3000: train loss: 0.6125  val loss: 3.8941
epoch 0, step 3300/3300: train loss: 0.6165  val loss: 4.0259
epoch 0, step 3600/3600: train loss: 0.6058  val loss: 3.8727
epoch 0, step 3900/3900: train loss: 0.5494  val loss: 3.8483
epoch 0, step 4200/4200: train loss: 0.4929  val loss: 3.7440
epoch 0, step 4500/4500: train loss: 0.5050  val loss: 3.7352
epoch 0, step 4800/4800: train loss: 0.4978  val loss: 3.5622
epoch 0, step 

  0%|          | 0/1000 [00:00<?, ?it/s]

0 50000


  0%|          | 0/611951 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0, step 300/300: train loss: 3.2860  val loss: 4.1301
epoch 0, step 600/600: train loss: 1.8862  val loss: 4.1664
epoch 0, step 900/900: train loss: 1.3979  val loss: 4.2682
epoch 0, step 1200/1200: train loss: 1.0555  val loss: 4.3132
epoch 0, step 1500/1500: train loss: 0.8961  val loss: 4.2703
epoch 0, step 1800/1800: train loss: 0.8397  val loss: 4.2189
epoch 0, step 2100/2100: train loss: 0.7848  val loss: 4.2034
epoch 0, step 2400/2400: train loss: 0.7126  val loss: 4.1734
epoch 0, step 2700/2700: train loss: 0.6540  val loss: 4.2152
epoch 0, step 3000/3000: train loss: 0.6020  val loss: 3.9939
epoch 0, step 3300/3300: train loss: 0.6346  val loss: 3.9203
epoch 0, step 3600/3600: train loss: 0.5841  val loss: 3.8659
epoch 0, step 3900/3900: train loss: 0.5627  val loss: 3.8412
epoch 0, step 4200/4200: train loss: 0.5454  val loss: 3.6696
epoch 0, step 4500/4500: train loss: 0.4919  val loss: 3.7257
epoch 0, step 4800/4800: train loss: 0.5402  val loss: 3.7493
epoch 0, step 

epoch 0, step 39300/39300: train loss: 0.2167  val loss: 2.0731
epoch 0, step 39600/39600: train loss: 0.2218  val loss: 1.9828
epoch 0, step 39900/39900: train loss: 0.1817  val loss: 2.0424
epoch 0, step 40200/40200: train loss: 0.1768  val loss: 2.0081
epoch 0, step 40500/40500: train loss: 0.2090  val loss: 2.0733
epoch 0, step 40800/40800: train loss: 0.1971  val loss: 2.0707
epoch 0, step 41100/41100: train loss: 0.1910  val loss: 2.1339
epoch 0, step 41400/41400: train loss: 0.2221  val loss: 2.1955
epoch 0, step 41700/41700: train loss: 0.1972  val loss: 2.2512
epoch 0, step 42000/42000: train loss: 0.1993  val loss: 2.1843
epoch 0, step 42300/42300: train loss: 0.1922  val loss: 2.2294
epoch 0, step 42600/42600: train loss: 0.1761  val loss: 2.2913
epoch 0, step 42900/42900: train loss: 0.2075  val loss: 2.2996
epoch 0, step 43200/43200: train loss: 0.1973  val loss: 2.2675
epoch 0, step 43500/43500: train loss: 0.1751  val loss: 2.2776
epoch 0, step 43800/43800: train loss: 0

  0%|          | 0/1000 [00:00<?, ?it/s]

0 100000


  0%|          | 0/611951 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0, step 300/300: train loss: 3.2536  val loss: 4.0554
epoch 0, step 600/600: train loss: 1.8092  val loss: 4.1634
epoch 0, step 900/900: train loss: 1.3685  val loss: 4.3373
epoch 0, step 1200/1200: train loss: 1.0699  val loss: 4.4141
epoch 0, step 1500/1500: train loss: 0.9671  val loss: 4.2847
epoch 0, step 1800/1800: train loss: 0.8041  val loss: 4.1765
epoch 0, step 2100/2100: train loss: 0.7396  val loss: 3.9864
epoch 0, step 2400/2400: train loss: 0.6766  val loss: 3.8859
epoch 0, step 2700/2700: train loss: 0.6847  val loss: 3.8507
epoch 0, step 3000/3000: train loss: 0.6101  val loss: 3.7432
epoch 0, step 3300/3300: train loss: 0.6280  val loss: 3.6475
epoch 0, step 3600/3600: train loss: 0.5601  val loss: 3.4806
epoch 0, step 3900/3900: train loss: 0.5598  val loss: 3.4594
epoch 0, step 4200/4200: train loss: 0.5082  val loss: 3.4106
epoch 0, step 4500/4500: train loss: 0.5514  val loss: 3.5187
epoch 0, step 4800/4800: train loss: 0.4928  val loss: 3.2502
epoch 0, step 

## RuPrompts

In [None]:
from transformers import GPT2LMHeadModel, AutoTokenizer

Используем уже предобученную модель _rugpt3large_based_on_gpt2_:

In [None]:
# Checkpoint used as the backbone for prompt tuning.
backbone_id = "sberbank-ai/rugpt3large_based_on_gpt2"

model = GPT2LMHeadModel.from_pretrained(backbone_id)
# The same "<pad>" string is registered as both the padding and the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(backbone_id, pad_token="<pad>", eos_token="<pad>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from ruprompts import PromptFormat

# Prompt template built around the `cluster` field.
# Presumably <P*100> / <P*20> stand for 100 and 20 trainable prompt tokens
# placed before and after the field; TODO confirm against the ruprompts docs.
prompt_format = PromptFormat("<P*100>{cluster}<P*20>")

Определяем параметризацию обучаемых вложений:

In [None]:
from ruprompts import TensorPromptProvider
from transformers import set_seed

# Fix the random seed before the prompt provider is created.
set_seed(1)

# Provider of the trainable prompt parameters; its `.parameters()` are what the optimizer updates.
prompt_provider = TensorPromptProvider()

Собираем формат prompt и провайдер prompt в объект prompt и применяем его к модели и токенизатору, т.е. добавляем в токенизатор специальные токены и модифицируем слой входных вложений модели:

In [None]:
from ruprompts import Prompt

# Combine format and provider into one prompt object and apply it to the model and tokenizer.
prompt = Prompt(prompt_format, prompt_provider)
prompt.patch(model, tokenizer)

Начинаем предпроцессинг данных:

In [None]:
from datasets import load_dataset
# Load the clustered Taiga train/dev CSV files as the 'train' and 'val' splits.
# NOTE(review): this rebinds the global name `datasets`, which an earlier cell
# used for a plain dict of DataFrames. Re-running the T5 training cell after
# this one would pick up this object instead.
datasets = load_dataset('csv', data_files={'train': './data/clustered_data/taiga_train_data_dbscan__clustered.csv', 'val': './data/clustered_data/taiga_dev_data_dbscan__clustered.csv'})
train_dataset = datasets["train"]
val_dataset = datasets["val"]

Downloading and preparing dataset csv/default to C:/Users/user/.cache/huggingface/datasets/csv/default-3cc147501614d040/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/user/.cache/huggingface/datasets/csv/default-3cc147501614d040/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from ruprompts import Text2TextPreprocessor

# Preprocessor that formats each record with `prompt_format` and tokenizes it;
# the "data" column is the generation target.
# Presumably `max_tokens` caps the sequence length and, when it is exceeded,
# the "cluster" field is the one truncated; TODO confirm against the ruprompts docs.
preprocessor = Text2TextPreprocessor(
    prompt_format=prompt_format,
    tokenizer=tokenizer,
    target_field="data",
    max_tokens=1792,
    truncation_field="cluster",
)

# Apply the preprocessor to every record of both splits.
train_dataset = train_dataset.map(preprocessor)
valid_dataset = val_dataset.map(preprocessor)

Map:   0%|          | 0/74900 [00:00<?, ? examples/s]

Определяем параметры обучения, выбираем размер батча, количество шагов, функции потерь и т.д.:

In [None]:
from transformers import TrainingArguments

# Trainer configuration: evaluation, checkpointing and logging all happen
# every 1000 steps, and training stops after 15000 steps.
# NOTE(review): `metric_for_best_model` is set but `load_best_model_at_end`
# is not, so the metric may have no effect. Confirm against the
# TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir="./try23_05_taiga_dbscan",
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=1,
    eval_steps=1000, #100
    save_steps=1000, #100
    logging_steps=1000, #100
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    save_total_limit=2,
    metric_for_best_model="eval_loss",
    learning_rate=0.1,
    max_steps=15000, #100000
    report_to="tensorboard",
    # report_to=["tensorboard", "wandb"],  # uncomment to log to WandB
    logging_dir="logs",
    seed=1,
)

Выбираем параметры оптимизации модели:

In [None]:
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

# Only the prompt provider's parameters are handed to the optimizer.
optimizer = AdamW(prompt_provider.parameters(), lr=training_args.learning_rate)
# Learning-rate schedule with 2000 warmup steps, spanning the full `max_steps` of training.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=2000,
    num_training_steps=training_args.max_steps,
)

Начинаем обучение модели:

In [None]:
from transformers import Trainer
from ruprompts.callbacks import (
    FreezeTransformerUnfreezePrompt,
    ReduceCheckpoint,
    SavePretrainedPrompt,
)

# Assemble the Trainer from the pieces prepared above. The optimizer and
# scheduler are passed explicitly instead of letting the Trainer create them.
# The ruprompts callbacks presumably freeze the backbone, shrink checkpoints
# and save the trained prompt; TODO confirm against the ruprompts docs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=preprocessor.collate_fn(),
    optimizers=(optimizer, scheduler),
    callbacks=[FreezeTransformerUnfreezePrompt(), ReduceCheckpoint(), SavePretrainedPrompt(prompt)],
)

# Run training.
trainer.train()