In [2]:
!pip install transformers

In [3]:
colab=False

if colab:
    from google.colab import drive
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.optim as optim

import time
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [4]:
# Language switch read by the next cell: True selects the DeepPavlov RuBERT
# checkpoint, False selects the multilingual BERT checkpoint.
MODE_RU = True

In [5]:
# Checkpoint id passed to both the tokenizer and the classifier further down.
if MODE_RU:
    model_name = 'DeepPavlov/rubert-base-cased'
else:
    model_name = 'bert-base-multilingual-cased'

In [6]:
# Resolve the corpus path for the current environment.
if not colab:
    # Outside Colab the corpus is read from a Kaggle-style input directory.
    corp_cased = '/kaggle/input/corp-cased/corp_cased.csv'
else:
    # Colab: mount Google Drive first. The relative path below presumably relies on
    # the working directory being /content — TODO confirm.
    drive.mount('/content/drive/')

    # NOTE(review): `dir` shadows the builtin of the same name; the name is kept
    # because other cells may read it.
    dir = 'drive/MyDrive/BS/DATA_EXTRACTION/'
    corp_cased = dir + 'corp_cased.csv'

In [7]:
# Tab-separated corpus without a header row. Lines the parser cannot handle are
# skipped, and rows with any missing cell are dropped.
df = pd.read_csv(
    corp_cased,
    sep='\t',
    header=None,
    on_bad_lines='skip',
).dropna()

df.head()

In [8]:
# Column 0 is tokenised as sentence text in the next cell, column 1 as its tag string.
sentences, tags = (df[column].to_numpy() for column in (0, 1))

In [9]:
# Whitespace-tokenise every row into a list of strings.
# Plain Python lists are used instead of np.array(...): rows have different
# lengths, and NumPy >= 1.24 raises ValueError when asked to build an array from
# ragged nested sequences without dtype=object. The cells below only iterate,
# index and take len() of these containers, which lists support.
sentences = [str(sentence).split() for sentence in sentences]
tags = [str(tag_string).split() for tag_string in tags]

In [10]:
def build_voc_t(ttoi, tag_sequences=None):
    """Extend the tag -> index mapping ``ttoi`` in place.

    Every tag that is not yet a key of ``ttoi`` receives the next unused
    integer id, in order of first appearance.

    Args:
        ttoi: dict mapping tag string -> integer id; mutated in place.
        tag_sequences: iterable of per-sentence tag sequences. When omitted,
            the notebook-level ``tags`` variable is used.

    Returns:
        None.
    """
    if tag_sequences is None:
        tag_sequences = tags

    # Continue after the largest id already present instead of restarting at 0,
    # so a pre-populated mapping never ends up with two tags sharing one id.
    next_idx = max(ttoi.values(), default=-1) + 1

    for sentence_tags in tag_sequences:
        for tag in sentence_tags:
            if tag not in ttoi:
                ttoi[tag] = next_idx
                next_idx += 1

def creator(x, y, ttoi, sentence_tokens=None, tag_sequences=None):
    """Append one (context window, label id) pair per word to ``x`` and ``y``.

    The context window is the previous word (if any), the word itself and the
    next word (if any), joined with single spaces. The label is the id that
    ``ttoi`` assigns to the tag at the same position as the word.

    Args:
        x: list that receives the context-window strings; mutated in place.
        y: list that receives the integer label ids; mutated in place.
        ttoi: dict mapping tag string -> integer id.
        sentence_tokens: sequence of tokenised sentences. When omitted, the
            notebook-level ``sentences`` variable is used.
        tag_sequences: sequence of per-sentence tag sequences, indexed in
            parallel with ``sentence_tokens``. When omitted, the notebook-level
            ``tags`` variable is used.

    Returns:
        None.

    Raises:
        KeyError: a tag is missing from ``ttoi``.
        IndexError: a sentence has more words than tags.
    """
    if sentence_tokens is None:
        sentence_tokens = sentences
    if tag_sequences is None:
        tag_sequences = tags

    for i in range(len(sentence_tokens)):
        words = sentence_tokens[i]
        for j in range(len(words)):
            # Resolve the label before touching x, so a failed lookup cannot
            # leave x one element longer than y.
            label = ttoi[tag_sequences[i][j]]

            # Slicing clamps at both ends: the first word has no left neighbour
            # and the last word has no right neighbour.
            window = words[max(j - 1, 0):j + 2]

            x.append(' '.join(window))
            y.append(label)

In [11]:
# Start from empty containers each time so re-running the cell never appends
# duplicates: ttoi maps tag -> id, x holds context windows, y holds label ids.
ttoi = {}
x, y = [], []

build_voc_t(ttoi)
creator(x, y, ttoi)

In [12]:
# 70/30 split, stratified on the label ids.
# A fixed random_state makes the split, and therefore the metrics reported on the
# test part, repeatable across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [13]:
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [14]:
# Tokenizer for the chosen checkpoint, with lower-casing disabled.
# NOTE(review): `max_length` is presumably just forwarded as a tokenizer init
# kwarg here; the truncation length is passed again on every tokenizer call in
# the next cell — confirm it is needed.
tokenizer = BertTokenizer.from_pretrained(
    model_name,
    do_lower_case=False,
    max_length=512,
)

In [15]:
# Encode both splits with identical settings. padding=True pads to the longest
# sequence within each call, so the two splits may end up with different widths.
x_train_enc, x_test_enc = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (x_train, x_test)
)

In [16]:
# Sanity check: show the first encoded training sample as token ids, then decoded
# back to text.
temp_ids = x_train_enc['input_ids'][0]

print(temp_ids, tokenizer.decode(temp_ids), sep='\n')

In [17]:
class PosTagDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``.

        Contains every encoding field plus ``'labels'``, which is a
        one-element tensor holding the sample's label.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor([self.labels[idx]])
        return sample

In [18]:
# Wrap each split's encodings and label ids for use by the Trainer.
dataset_train = PosTagDataset(encodings=x_train_enc, labels=y_train)
dataset_test = PosTagDataset(encodings=x_test_enc, labels=y_test)

In [19]:
# Show the first training sample exactly as the Trainer will receive it.
print(dataset_train[0])

In [20]:
# Parallel lists consumed by compute_metrics: tag names and their integer ids,
# both in the insertion order of ttoi.
cr_names = list(ttoi.keys())
cr_labels = list(ttoi.values())

print(cr_labels)
print(cr_names)

In [21]:
# Load the pretrained checkpoint with one output class per tag in ttoi, then move
# it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(ttoi),
)
model.to(device)

In [22]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Prints, in Russian, "The model has <N> trainable parameters", with N formatted
# using thousands separators.
print(f'Модель имеет {count_parameters(model):,} обучаемых параметров')

In [23]:
def compute_metrics(y_pred):
    """Build a per-tag classification report for the Trainer's evaluation loop.

    Args:
        y_pred: prediction object supplied by the Trainer; only its
            ``predictions`` (per-class scores) and ``label_ids`` attributes
            are read.

    Returns:
        dict with the single key ``'classification report'`` holding the text
        report produced by ``sklearn.metrics.classification_report``.
    """
    # Flatten the gold labels: PosTagDataset wraps each label in a one-element
    # tensor, so they arrive with a trailing dimension of size 1.
    y_true = np.asarray(y_pred.label_ids).reshape(-1)
    predicted = y_pred.predictions.argmax(-1)

    # sklearn expects (y_true, y_pred) in that order; passing them the other way
    # round swaps precision and recall in the report.
    cl_rep = classification_report(y_true, predicted, labels=cr_labels, target_names=cr_names)
    return {'classification report': cl_rep}

In [27]:
# NOTE(review): max_steps is positive, and per the Transformers documentation it
# then overrides num_train_epochs, so training stops after 3000 optimizer steps.
training_args = TrainingArguments(
    output_dir='results/',
    # --- schedule and optimisation ---
    max_steps=3000,
    num_train_epochs=5,
    warmup_steps=100,  # learning-rate warm-up steps (not a warning interval)
    weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- logging and evaluation ---
    logging_steps=500,  # logging interval; presumably also the eval interval, since eval_steps is unset
    evaluation_strategy='steps',  # evaluate on a step interval rather than per epoch
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
)

In [28]:
# Bundle the model, the training configuration, both datasets and the metric
# callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
)

In [29]:
trainer.train() # Run training

In [30]:
trainer.evaluate() # Evaluate on dataset_test, the Trainer's eval_dataset