Задание: обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers. Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip

Напишите класс для Dataset/Dataloder и азбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
Попробуйте несколько моделей: Bert, Albert, Deberta. (3 балла) Пример конфигурации для deberta: https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/config.json

In [1]:
!pip install transformers
!pip install lightning
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git

Collecting lightning
  Downloading lightning-2.2.3-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading lightning-2.2.3-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lightning
Successfully installed lightning-2.2.3
Cloning into 'character-tokenizer'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 20 (delta 5), reused 10 (delta 3), pack-reused 0[K
Unpacking objects: 100% (20/20), 5.87 KiB | 1.17 MiB/s, done.


In [2]:
import numpy as np
import pandas as pd

import string
import sys

sys.path.append("/kaggle/working/character-tokenizer")
from charactertokenizer import CharacterTokenizer

import torch
from torch.utils.data import Dataset, DataLoader

import wandb

import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import LearningRateMonitor
from lightning.pytorch.loggers import WandbLogger


from transformers import AutoTokenizer
from transformers.tokenization_utils import PreTrainedTokenizer

from sklearn.model_selection import train_test_split

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
df = pd.read_csv("/kaggle/input/all-accents/all_accents.tsv", sep='\t', names=['original', 'accented'])

In [5]:
df

Unnamed: 0,original,accented
0,-де,-д^е
1,-ка,-к^а
2,-либо,-л^ибо
3,-нибудь,-ниб^удь
4,-с,-с
...,...,...
1680530,ӂюль-верновский,ӂюль-в^ерновский
1680531,ӂюрить,ӂюр^ить
1680532,ӂӂение,ӂӂ^ение
1680533,ӂӂенный,ӂӂенный


In [6]:
# example = "Привет"
# tokens = tokenizer(example)
# print(tokens)

In [6]:
class AccentDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df.reset_index()
        self.max_length = max_length
        self.df = self.df.drop(columns=['index'])
        self.tokenizer = tokenizer
    
    def __getitem__(self, idx):
        orig = self.df.loc[idx]['original']
        accent_id = self.df.loc[idx]['accented'].find('^')

        tokenized_item = self.tokenizer(orig,
                                        max_length=self.max_length,
                                        padding='max_length',
                                        truncation=True,
                                        return_tensors='pt')
        
        del tokenized_item['token_type_ids']
        
        tokenized_item['input_ids'] = tokenized_item['input_ids'].flatten()
        tokenized_item['attention_mask'] = tokenized_item['attention_mask'].flatten()
        
        tokenized_item['labels'] = torch.zeros_like(tokenized_item['input_ids'].flatten())
        tokenized_item['labels'][accent_id + 1] = 1
#         tokenized_item['raw_word'] = torch.String(orig)
        
        return tokenized_item
    
    def __len__(self):
        return len(self.df)



In [7]:
df.head(len(df) // 2)

Unnamed: 0,original,accented
0,-де,-д^е
1,-ка,-к^а
2,-либо,-л^ибо
3,-нибудь,-ниб^удь
4,-с,-с
...,...,...
840262,объективное,объект^ивное
840263,объективной,объект^ивной
840264,объективном,объект^ивном
840265,объективному,объект^ивному


In [7]:
model_max_length = 256
batch_size = 256
chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
tokenizer = CharacterTokenizer(chars, model_max_length)

# df = df.sample(frac=1)
df = df.head(len(df) // 5)

train_df, val_df = train_test_split(df, test_size=0.5)

train_dataset = AccentDataset(train_df, tokenizer, model_max_length)
val_dataset = AccentDataset(val_df.iloc[:len(val_df)//2, :], tokenizer, model_max_length)
test_dataset = AccentDataset(val_df.iloc[len(val_df)//2:, :], tokenizer, model_max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=3)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=3)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=3)

In [18]:
val_dataset[0]

{'input_ids': tensor([ 0, 12, 64, 42, 38, 36, 18, 36, 36, 38, 14, 38,  1,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4

In [10]:
next(iter(train_dataloader))

{'input_ids': tensor([[ 0, 36,  8,  ...,  4,  4,  4],
        [ 0, 30, 38,  ...,  4,  4,  4],
        [ 0, 34,  8,  ...,  4,  4,  4],
        ...,
        [ 0, 40, 18,  ...,  4,  4,  4],
        [ 0, 16, 38,  ...,  4,  4,  4],
        [ 0, 40, 42,  ...,  4,  4,  4]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}

In [7]:
from transformers import DebertaV2Config, DebertaV2ForTokenClassification

deberta_config = DebertaV2Config(hidden_size=256,
                        intermediate_size=1024,
                        max_length=40,
                        model_type='deberta-v2',
                        num_attention_heads=8,
                        num_hidden_layers=4,
                        position_biased_input=True,
                        relative_attention=True)

deberta_model = DebertaV2ForTokenClassification(config)

In [12]:
model

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-3): 4 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=256, out_features=256, bias=True)
              (key_proj): Linear(in_features=256, out_features=256, bias=True)
              (value_proj): Linear(in_features=256, out_features=256, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): Lay

In [13]:
test_data = next(iter(train_dataloader))
model_output = model(test_data['input_ids'], attention_mask=test_data['attention_mask'], labels=test_data['labels'])
print(model_output.logits)
print(model_output.logits.size())
print(test_data['input_ids'].size())


tensor([[[ 0.0036,  0.5507],
         [-0.2701, -0.3176],
         [-0.1553, -0.4632],
         ...,
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[-0.3081,  0.2325],
         [ 0.0426, -0.4880],
         [ 0.1171, -0.2132],
         ...,
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[-0.2241,  0.5807],
         [-0.2065, -0.0069],
         [-0.5547,  0.3684],
         ...,
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        ...,

        [[-0.0894,  0.6162],
         [-0.1279, -0.0971],
         [-0.2654, -0.1469],
         ...,
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[-0.3648,  0.4931],
         [-0.4191, -0.2587],
         [-0.0527, -0.1440],
         ...,
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[-0.1769,  0.5233],
       

In [14]:
print(torch.argmax(model_output.logits[:, :, 1], dim=1))
torch.argmax(test_data['labels'], dim=1)

tensor([ 7,  7, 10,  5,  7,  0,  0,  7,  5,  7,  5,  7,  7, 10,  7,  7,  0,  5,
         5,  0,  6, 10, 11,  7, 12,  7,  7, 11,  0,  7,  5,  7,  7, 10,  7,  5,
         7,  6,  5,  7, 11,  2, 11,  0,  0,  0, 13,  7,  7,  7, 11, 11,  7,  5,
         7, 10, 11, 14,  7,  7,  0,  7,  7,  9,  0, 13,  0,  5,  7, 11,  7,  5,
        11,  7,  7,  0,  0, 12,  7,  4,  0,  7, 10,  7,  2,  0,  7,  5,  0, 11,
         7,  0, 13,  0,  7,  7,  0,  2,  5,  0, 13,  7,  5,  5,  7,  7,  7, 11,
         7,  4,  7,  7,  7, 10,  0,  7,  0,  5,  7,  7,  7,  7,  7,  7,  6,  0,
        13,  7,  0,  7,  5,  0,  0, 13,  0,  7, 10, 10, 13,  2,  2, 11,  7,  7,
         1, 11,  7,  0,  7,  7, 13,  2,  0,  0,  7,  7, 10,  7,  7,  4,  7,  7,
         5,  0,  7,  7, 11,  7,  7,  7,  0,  7,  7, 11,  4,  9,  7,  5,  0,  0,
         5,  0,  7,  7,  0,  0,  4,  9, 10,  7,  4,  6,  7, 11,  7,  4,  7, 13,
        13, 11,  7, 13,  5,  0,  7,  4,  7,  4,  7, 11, 21, 13,  0,  9, 10,  4,
        10,  0,  7, 11, 11,  0,  0, 13, 

tensor([ 8,  6,  7,  6, 10,  2,  5,  2,  6,  6, 10,  9,  8,  9,  6,  6,  5,  4,
         0,  5,  4,  5,  9,  4,  6,  6,  4,  6,  7,  2,  6,  9,  4,  9, 10,  6,
         4,  2,  6,  7,  7,  1,  7, 13,  8,  5, 11,  7,  6,  1,  6,  7,  8,  2,
         5,  6,  5,  6,  6,  5,  4,  7,  4,  2,  3,  8, 10,  9,  9,  5,  4,  7,
         7,  3, 10,  5,  7,  7,  4,  6,  6,  3,  7,  6,  5,  8,  7,  8,  6,  6,
         2,  7,  5,  7,  3,  6,  7,  7,  7,  6, 10,  3,  3,  7,  9,  6,  8,  7,
         8,  3,  3,  5,  2,  3,  4,  4,  8,  4,  7,  5,  6,  5,  6,  4,  5,  7,
         7,  8, 13,  3,  3,  3,  4,  5,  5,  6,  6,  4,  5,  2,  7,  6, 13,  5,
         7,  9,  4,  8,  6,  4,  5,  2, 10,  8,  4,  4,  6,  3, 14,  5,  5,  9,
         5,  9,  6,  6,  8,  8,  5,  4,  3,  6,  6,  6,  7, 10,  2,  7,  2,  5,
         4, 10,  5,  6,  2,  2,  4,  8,  6,  4,  5,  7,  5,  4,  7,  5, 11,  9,
         4,  9,  4,  8,  6,  5,  5,  7,  4, 10,  5,  7, 12, 10, 10,  7,  5,  8,
         3,  6,  2,  6,  7,  2,  8, 11, 

In [15]:
test_data['attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [8]:
class AccentModel(pl.LightningModule):
    def __init__(self, model, tokenizer, learning_rate):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
#         self.save_hyperparameters()
    
    def training_step(self, batch, batch_idx):
        ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        preds = self.model(ids, attention_mask=attention_mask, labels=labels)
        loss = preds.loss
        
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        preds = self.model(ids, attention_mask=attention_mask, labels=labels)
        loss = preds.loss
        
#         print(preds)
        
        step_accuracy = torch.sum(torch.argmax(preds.logits[:, :, 1], dim=1) == torch.argmax(batch['labels'], dim=1)) / len(torch.argmax(batch['labels'], dim=1))
        
        self.log('validation_accuracy', step_accuracy)
        self.log('validation_loss', loss)
    
    def forward(self, x):
        ids = x['input_ids']
        attention_mask = x['attention_mask']
        labels = x['labels']
        preds = self.model(ids, attention_mask=attention_mask, labels=labels)
        
        return preds.logits
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return [optimizer]

**DEBERTA**

In [9]:
pl_deberta_model = AccentModel(deberta_model, tokenizer, 1e-5)
checkpoint_callback = ModelCheckpoint(monitor='validation_loss', mode='min', save_top_k=3)
lr_monitor = LearningRateMonitor(logging_interval='step')
wandb_logger = WandbLogger(project='hw9_nn_kirichenko', name="deberta_model_half_data_15epoch", job_type='train', save_dir='/kaggle/working/lightning_logs/version_1')
trainer = pl.Trainer(max_epochs=15, logger=wandb_logger, accelerator=device.type, callbacks=[checkpoint_callback, lr_monitor])

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [10]:
trainer.fit(model=pl_deberta_model,
           train_dataloaders=train_dataloader,
           val_dataloaders=val_dataloader)

[34m[1mwandb[0m: Currently logged in as: [33mtoly-kiri4enko[0m ([33mbstu[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                            | Params
----------------------------------------------------------
0 | model | DebertaV2ForTokenClassification | 36.3 M
----------------------------------------------------------
36.3 M    Trainable params
0         Non-trainable params
36.3 M    Total params
145.388   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=15` reached.


In [11]:
pl_deberta_model.eval()
pl_deberta_model.to(device)

AccentModel(
  (model): DebertaV2ForTokenClassification(
    (deberta): DebertaV2Model(
      (embeddings): DebertaV2Embeddings(
        (word_embeddings): Embedding(128100, 256, padding_idx=0)
        (position_embeddings): Embedding(512, 256)
        (LayerNorm): LayerNorm((256,), eps=1e-07, elementwise_affine=True)
        (dropout): StableDropout()
      )
      (encoder): DebertaV2Encoder(
        (layer): ModuleList(
          (0-3): 4 x DebertaV2Layer(
            (attention): DebertaV2Attention(
              (self): DisentangledSelfAttention(
                (query_proj): Linear(in_features=256, out_features=256, bias=True)
                (key_proj): Linear(in_features=256, out_features=256, bias=True)
                (value_proj): Linear(in_features=256, out_features=256, bias=True)
                (pos_dropout): StableDropout()
                (dropout): StableDropout()
              )
              (output): DebertaV2SelfOutput(
                (dense): Linear(in_features=

In [13]:
def compute_accuracy(model, test_dataloader):
    accuracy = 0
    for i, (batch) in enumerate(test_dataloader):
        batch = batch.to(device)
        model_output = pl_model(batch)
        accuracy += torch.sum(torch.argmax(model_output[:, :, 1], dim=1) == torch.argmax(batch['labels'], dim=1)) / len(torch.argmax(batch['labels'], dim=1))

    return accuracy / i

compute_accuracy(pl_deberta_model, test_dataloader)

tensor(0.7772, device='cuda:0')

**ALBERT**

In [10]:
from transformers import AlbertConfig, AlbertForTokenClassification

albert_config = AlbertConfig(hidden_size=256,
                        intermediate_size=1024,
                        max_length=40,
                        num_attention_heads=8,
                        num_hidden_layers=4,
                        position_biased_input=True,
                        relative_attention=True)

albert_model = AlbertForTokenClassification(albert_config)

In [11]:
pl_albert_model = AccentModel(albert_model, tokenizer, 1e-5)
checkpoint_callback = ModelCheckpoint(monitor='validation_loss', mode='min', save_top_k=3)
lr_monitor = LearningRateMonitor(logging_interval='step')
wandb_logger = WandbLogger(project='hw9_nn_kirichenko', name="albert_model_half_data_15epoch", job_type='train', save_dir='/kaggle/working/lightning_logs/version_1')
trainer = pl.Trainer(max_epochs=15, logger=wandb_logger, accelerator=device.type, callbacks=[checkpoint_callback, lr_monitor])

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [12]:
trainer.fit(model=pl_albert_model,
           train_dataloaders=train_dataloader,
           val_dataloaders=val_dataloader)

[34m[1mwandb[0m: Currently logged in as: [33mtoly-kiri4enko[0m ([33mbstu[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | AlbertForTokenClassification | 4.7 M 
-------------------------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.917    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=15` reached.


In [13]:
pl_albert_model.eval()
pl_albert_model.to(device)

AccentModel(
  (model): AlbertForTokenClassification(
    (albert): AlbertModel(
      (embeddings): AlbertEmbeddings(
        (word_embeddings): Embedding(30000, 128, padding_idx=0)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0, inplace=False)
      )
      (encoder): AlbertTransformer(
        (embedding_hidden_mapping_in): Linear(in_features=128, out_features=256, bias=True)
        (albert_layer_groups): ModuleList(
          (0): AlbertLayerGroup(
            (albert_layers): ModuleList(
              (0): AlbertLayer(
                (full_layer_layer_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
                (attention): AlbertAttention(
                  (query): Linear(in_features=256, out_features=256, bias=True)
                  (key): Linear(in_features=256, out_features=256, bias=True)
         

In [12]:
def compute_accuracy(model, test_dataloader):
    accuracy = 0
    for i, (batch) in enumerate(test_dataloader):
        batch = batch.to(device)
        model_output = model(batch)
        accuracy += torch.sum(torch.argmax(model_output[:, :, 1], dim=1) == torch.argmax(batch['labels'], dim=1)) / len(torch.argmax(batch['labels'], dim=1))

    return accuracy / i

compute_accuracy(pl_albert_model, test_dataloader)

**BERT**

In [9]:
from transformers import BertConfig, BertForTokenClassification

bert_config = BertConfig(hidden_size=256,
                        intermediate_size=1024,
                        max_length=40,
                        num_attention_heads=8,
                        num_hidden_layers=4,
                        position_biased_input=True,
                        relative_attention=True)

bert_model = BertForTokenClassification(bert_config)

In [10]:
pl_bert_model = AccentModel(bert_model, tokenizer, 1e-5)
checkpoint_callback = ModelCheckpoint(monitor='validation_loss', mode='min', save_top_k=3)
lr_monitor = LearningRateMonitor(logging_interval='step')
wandb_logger = WandbLogger(project='hw9_nn_kirichenko', name="bert_model_half_data_10epoch", job_type='train', save_dir='/kaggle/working/lightning_logs/version_1')
trainer = pl.Trainer(max_epochs=15, logger=wandb_logger, accelerator=device.type, callbacks=[checkpoint_callback, lr_monitor])

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [11]:
trainer.fit(model=pl_bert_model,
           train_dataloaders=train_dataloader,
           val_dataloaders=val_dataloader)

[34m[1mwandb[0m: Currently logged in as: [33mtoly-kiri4enko[0m ([33mbstu[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | BertForTokenClassification | 11.1 M
-----------------------------------------------------
11.1 M    Trainable params
0         Non-trainable params
11.1 M    Total params
44.421    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=15` reached.


In [14]:
pl_bert_model.eval()
pl_bert_model.to(device)

AccentModel(
  (model): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 256, padding_idx=0)
        (position_embeddings): Embedding(512, 256)
        (token_type_embeddings): Embedding(2, 256)
        (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-3): 4 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=256, out_features=256, bias=True)
                (key): Linear(in_features=256, out_features=256, bias=True)
                (value): Linear(in_features=256, out_features=256, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=256, out_features=256, bias=Tr

In [15]:
compute_accuracy(pl_bert_model, test_dataloader)

tensor(0.4521, device='cuda:0')

Deberta, Albert обучались с одинаковыми параметрами и одинаковое количество эпох на одинаковых данных. Bert пришлось обучить на урезанном количестве данных, так как google colab дропает обучение спустя полтора часа, а на kaggle закончились вычислительные единицы.

Точность моделей полученную при обучении моделей можно улучшить, если подобрать получше параметры для конфигов (например увеличить размер скрытых слоев или их количество).