Задание: обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers. Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip

1. Напишите класс для Dataset/Dataloder и разбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
2. Попробуйте обучить одну или несколько из моделей: Bert, Albert, Deberta. Посчитайте метрику Accuracy на train и test. (1 балл). При преодолении порога в Accuracy на test 0.8: (+1 балл), 0.85: (+2 балла), 0.89: (+3 балла).

In [2]:
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git
!pip install -q transformers

Cloning into 'character-tokenizer'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 20 (delta 5), reused 10 (delta 3), pack-reused 0[K
Unpacking objects: 100% (20/20), 5.87 KiB | 1.47 MiB/s, done.


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
import string
import sys

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertForTokenClassification, BertConfig
from transformers import DebertaV2ForTokenClassification, DebertaV2Config
from transformers import get_cosine_schedule_with_warmup
sys.path.append("/kaggle/working/character-tokenizer")
from charactertokenizer import CharacterTokenizer


In [4]:
chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
model_max_length = 64
tokenizer = CharacterTokenizer(chars, model_max_length)
example = "Привет"
tokens = tokenizer(example)
print(tokens)

{'input_ids': [0, 39, 42, 26, 12, 18, 46, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [5]:
df = pd.read_csv("/kaggle/input/stresses/all_accents.tsv", sep="\t", header=None)
df = df.rename({0: "word", 1:"gt"}, axis=1)

In [8]:
train_data, val_data = train_test_split(df, test_size=0.5, random_state=42)

In [9]:
id2cls = {
    0: "O",
    1: "STREES"
}

cls2id = {
    "O": 0,
    "STREES": 1
}
MAX_LENGTH = 100

def get_labels_for_text(gt_word: str, get_indexes=True):
    result = []
    prev_cahr = None
    for char in gt_word:
        if prev_cahr == "^":
            prev_cahr = None
            continue
        if char == "^":
            result.append("STREES")
            prev_cahr = "^"
        else:
            result.append("O")
    if get_indexes:
        result = [cls2id[value] for value in result]
    return result

class StressDataset(Dataset):
    def __init__(self, df):
        super(StressDataset, self).__init__()
        df = df.reset_index(drop=True).copy()
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        line = self.df.iloc[idx]
        encoded_dict = tokenizer.encode_plus(
                            line["word"],                     
                            add_special_tokens = True, 
                            max_length = MAX_LENGTH,           
                            padding='max_length',
                            pad_to_max_length = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',     
                            return_special_tokens_mask=True,
                    )
        lebels = get_labels_for_text(line["gt"])
        encoded_dict['input_ids'] = encoded_dict['input_ids'].squeeze()
        encoded_dict['special_tokens_mask'] = encoded_dict['special_tokens_mask'].squeeze()
        encoded_dict['attention_mask'] = encoded_dict['attention_mask'].squeeze()
        lebels = torch.tensor([-100] + lebels + [-100] * (encoded_dict['input_ids'].shape[-1] - (1 + len(lebels)))).long()
        return (encoded_dict, lebels)

In [10]:
dataset_train = StressDataset(train_data)
dataset_val = StressDataset(val_data)
train_loader = DataLoader(dataset_train, batch_size=784, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset_val, batch_size=784, shuffle=False, num_workers=2)

In [11]:
from transformers import BertForTokenClassification, BertConfig
from transformers import DebertaV2ForTokenClassification, DebertaV2Config


In [8]:
configuration = DebertaV2Config(
    vocab_size=tokenizer.vocab_size,
    hidden_size=128,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = DebertaV2ForTokenClassification(configuration,)
model.classifier = torch.nn.Linear(model.classifier.in_features, len(id2cls))
configuration = model.config
model.to(device)
None

In [17]:
def train_one_epoch(epoch, model, train_loader, optimizer, scheduler):
    model.train()
    lst_losses = []
    correct_elements = 0
    total_elements = 0
    for idx, batch in tqdm(enumerate(train_loader)):

        inputs, labels = batch[0].to(device), batch[1].to(device)
        outputs = model(
            input_ids = inputs['input_ids'],
            attention_mask =inputs['attention_mask'],
            labels = labels,
        )
        loss, scores = outputs.loss, outputs.logits
        optimizer.zero_grad()
        lst_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()

        # for ind_elem in range(inputs['input_ids'].shape[0]):
        #     x1 = torch.masked_select(scores.argmax(dim=-1)[0], inputs['special_tokens_mask'][0] == 0)
        #     x2 = torch.masked_select(labels[0], labels[0] != -100)
        #     if torch.equal(x1, x2):
        #         correct_elements += 1
        #     total_elements += 1
    avg_loss = np.array(lst_losses).mean()
    print(f"Train: epoch {epoch} | loss = {avg_loss} ")
    # print(f"Train: epoch {epoch} | loss = {avg_loss} | accuracy = {round((correct_elements / total_elements), 4)* 100}")

In [18]:
@torch.no_grad
def validate_one_epoch(epoch, model, val_loader, optimizer):
    model.eval()
    lst_losses = []
    correct_elements = 0
    total_elements = 0
    for idx, batch in tqdm(enumerate(val_loader)):
        inputs, labels = batch[0].to(device), batch[1].to(device)
        outputs = model(
            input_ids = inputs['input_ids'],
            attention_mask =inputs['attention_mask'],
            labels = labels,
        )
        loss, scores = outputs.loss, outputs.logits
        lst_losses.append(loss.item())
        for ind_elem in range(inputs['input_ids'].shape[0]):
            x1 = torch.masked_select(scores[ind_elem, :, :].argmax(dim=-1), inputs['special_tokens_mask'][ind_elem] == 0)
            x2 = torch.masked_select(labels[ind_elem], labels[ind_elem] != -100)
            if torch.equal(x1, x2):
                correct_elements += 1
            total_elements += 1
    avg_loss = np.array(lst_losses).mean()
    print(f"Val: epoch {epoch} | loss = {avg_loss} | accuracy = {round((correct_elements / total_elements), 4)* 100}")

In [12]:
NUM_EPOCHS = 10
num_warmup_steps = 1000
num_training_steps= NUM_EPOCHS * len(train_loader)
num_cycles = 0.5
last_epoch = -1
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [13]:
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
    num_cycles=num_cycles,
    last_epoch=last_epoch
)

In [14]:
for epoch in range(NUM_EPOCHS):
    train_one_epoch(epoch, model, train_loader, optimizer, scheduler)
    validate_one_epoch(epoch, model, val_loader, optimizer)

1072it [03:44,  4.77it/s]

Train: epoch 0 | loss = 0.17415615391514416 



1072it [04:45,  3.76it/s]

Val: epoch 0 | loss = 0.10324895262384592 | accuracy = 71.63000000000001



1072it [03:43,  4.80it/s]

Train: epoch 1 | loss = 0.10520243072020474 



1072it [04:46,  3.74it/s]

Val: epoch 1 | loss = 0.08784521118835059 | accuracy = 75.42999999999999



1072it [03:40,  4.86it/s]

Train: epoch 2 | loss = 0.09303767097168672 



1072it [04:43,  3.78it/s]

Val: epoch 2 | loss = 0.07956213059265223 | accuracy = 79.5



1072it [03:41,  4.83it/s]

Train: epoch 3 | loss = 0.08466518356867914 



1072it [04:45,  3.76it/s]

Val: epoch 3 | loss = 0.07056461647152901 | accuracy = 80.92



1072it [03:40,  4.86it/s]

Train: epoch 4 | loss = 0.07750669530759663 



1072it [04:46,  3.74it/s]

Val: epoch 4 | loss = 0.06255702112815273 | accuracy = 83.69



1072it [03:40,  4.87it/s]

Train: epoch 5 | loss = 0.07117682182118852 



1072it [04:44,  3.77it/s]

Val: epoch 5 | loss = 0.05713902370059001 | accuracy = 85.02



1072it [03:44,  4.77it/s]

Train: epoch 6 | loss = 0.06599410261555727 



1072it [04:45,  3.75it/s]

Val: epoch 6 | loss = 0.052929590348460114 | accuracy = 86.22999999999999



1072it [03:42,  4.82it/s]

Train: epoch 7 | loss = 0.06220201307685295 



1072it [04:43,  3.78it/s]

Val: epoch 7 | loss = 0.050113979306544605 | accuracy = 86.87



1072it [03:37,  4.94it/s]

Train: epoch 8 | loss = 0.059868936075954074 



1072it [04:40,  3.82it/s]

Val: epoch 8 | loss = 0.048879904825407175 | accuracy = 87.32



1072it [03:38,  4.91it/s]

Train: epoch 9 | loss = 0.0589625537673484 



1072it [04:41,  3.81it/s]

Val: epoch 9 | loss = 0.048701394331961204 | accuracy = 87.35000000000001





In [25]:
model.eval()
def make_stresses(word):
    inputs = tokenizer(
        word,
        add_special_tokens=True,
        return_special_tokens_mask=True,
        return_tensors = 'pt',
    )
    outputs = model(
            input_ids = inputs['input_ids'].to(device),
            attention_mask =inputs['attention_mask'].to(device),
        )
    result = outputs.logits.argmax(dim=-1).cpu()
    result = torch.masked_select(result, inputs['special_tokens_mask'] == 0).tolist()

    output = ""
    for char, label in zip(word, result):
        if label == 1:
            output +=  "^" + char
        else:
            output += char
    return output


In [18]:
make_stresses("гора")

'г^ора'

In [17]:
val_data

Unnamed: 0,word,gt
128786,вандейские,ванд^ейские
1678336,яремного,яр^емного
331492,доезживавший,до^езживавший
1386829,скобками,ск^обками
784530,неудовлетворения,неудовлетвор^ения
...,...,...
344562,дорезавших,дор^езавших
971596,перекладывается,перекл^адывается
198482,вспрыгну,вспр^ыгну
172604,возвеселюсь,возвесел^юсь


### BERT

In [20]:
configuration = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertForTokenClassification(configuration,)
model.classifier = torch.nn.Linear(model.classifier.in_features, len(id2cls))
configuration = model.config
model.to(device)
None

In [21]:
train_loader = DataLoader(dataset_train, batch_size=392, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset_val, batch_size=392, shuffle=False, num_workers=2)

In [22]:
NUM_EPOCHS = 10
num_warmup_steps = 1000
num_training_steps= NUM_EPOCHS * len(train_loader)
num_cycles = 0.5
last_epoch = -1
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
    num_cycles=num_cycles,
    last_epoch=last_epoch
)

In [23]:
for epoch in range(1, NUM_EPOCHS + 2):
    train_one_epoch(epoch, model, train_loader, optimizer, scheduler)
    if epoch % 2 == 1:
        validate_one_epoch(epoch, model, val_loader, optimizer)

2144it [05:08,  6.95it/s]

Train: epoch 1 | loss = 0.1345048527295854 



2144it [05:11,  6.87it/s]

Val: epoch 1 | loss = 0.09143034191871646 | accuracy = 74.79



2144it [05:08,  6.95it/s]

Train: epoch 2 | loss = 0.09334283635895978 



2144it [05:07,  6.97it/s]

Train: epoch 3 | loss = 0.08122087124589164 



2144it [05:09,  6.93it/s]

Val: epoch 3 | loss = 0.0665533260299838 | accuracy = 82.83



2144it [05:07,  6.97it/s]

Train: epoch 4 | loss = 0.07101927969931389 



2144it [05:07,  6.97it/s]

Train: epoch 5 | loss = 0.06311107514627667 



2144it [05:09,  6.92it/s]

Val: epoch 5 | loss = 0.050795576236308065 | accuracy = 87.42999999999999



2144it [05:07,  6.97it/s]

Train: epoch 6 | loss = 0.05631968429235658 



2144it [05:07,  6.96it/s]

Train: epoch 7 | loss = 0.050484769670953 



2144it [05:09,  6.93it/s]

Val: epoch 7 | loss = 0.04019391695002733 | accuracy = 90.13



2144it [05:07,  6.97it/s]

Train: epoch 8 | loss = 0.045913509357331406 



2144it [05:07,  6.97it/s]

Train: epoch 9 | loss = 0.04307897882657682 



2144it [05:09,  6.92it/s]

Val: epoch 9 | loss = 0.03565083702404136 | accuracy = 91.24



2144it [05:07,  6.96it/s]

Train: epoch 10 | loss = 0.04156903883289379 



2144it [05:07,  6.97it/s]

Train: epoch 11 | loss = 0.04151147577578007 



2144it [05:09,  6.93it/s]

Val: epoch 11 | loss = 0.035365167496996974 | accuracy = 91.36





In [26]:
make_stresses("гора")

'г^ора'

In [29]:
show_sample = val_data.sample(60)

In [33]:
show_sample["predictions"] = show_sample['word'].apply(lambda x: make_stresses(x))

In [34]:
show_sample

Unnamed: 0,word,gt,predictions
1299361,растереблена,растереблен^а,растереблен^а
1561505,утеплившей,утепл^ившей,утепл^ившей
1117192,поступлении,поступл^ении,поступл^ении
909301,отнимавшемся,отним^авшемся,отним^авшемся
938381,ошельмованный,ошельм^ованный,ошельм^ованный
522609,канадскому,кан^адскому,кан^адскому
679883,модней,модн^ей,модней
911430,отофон,отоф^он,отоф^он
1595533,харчевать,харчев^ать,харчев^ать
1360072,святское,св^ятское,св^ятское


In [36]:
show_sampl_2 = val_data.sample(60)
show_sampl_2["predictions"] = show_sampl_2['word'].apply(lambda x: make_stresses(x))
show_sampl_2

Unnamed: 0,word,gt,predictions
240207,вытаращивавшиеся,вытар^ащивавшиеся,вытар^ащивавшиеся
1188667,пробродившею,проброд^ившею,проброд^ившею
705068,нагловатыми,наглов^атыми,наглов^атыми
31755,анастасом,анаст^асом,анаст^асом
1228663,протравлявший,протравл^явший,протравл^явший
759742,недоразвитом,недор^азвитом,недор^азвитом
947398,парафиновый,параф^иновый,параф^иновый
1649850,шунтировалось,шунт^ировалось,шунт^ировалось
1388080,скомпонованному,скомпон^ованному,скомпон^ованному
987645,перераспределительным,перераспредел^ительным,перераспредел^ительным


Удалось достичь качества accuracy = 91.36%

Использовались следующие параметры

In [None]:
# configuration = BertConfig(
#     vocab_size=tokenizer.vocab_size,
#     hidden_size=256,
#     hidden_dropout_prob=0.2,
#     attention_probs_dropout_prob=0.2,
#     num_hidden_layers=4,
#     num_attention_heads=4,
#     intermediate_size=512,
# )

# train_loader = DataLoader(dataset_train, batch_size=392, shuffle=True, num_workers=2)
# val_loader = DataLoader(dataset_val, batch_size=392, shuffle=False, num_workers=2)


# NUM_EPOCHS = 10
# num_warmup_steps = 1000
# num_training_steps= NUM_EPOCHS * len(train_loader)
# num_cycles = 0.5
# last_epoch = -1
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# scheduler = get_cosine_schedule_with_warmup(
#     optimizer=optimizer,
#     num_warmup_steps=num_warmup_steps,
#     num_training_steps=num_training_steps,
#     num_cycles=num_cycles,
#     last_epoch=last_epoch
# )