In [109]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
data = pd.read_csv('/kaggle/input/usefull-bigger/usefull_bigger.csv')
model_name = "cointegrated/rubert-tiny2"
device = 'cuda' if torch.cuda.is_available else 'cpu'

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=11).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
X = list(data["Текст сообщения"])
y = list(data["Разметка"])

In [66]:
usefull = {2:0,3:1,4:2,5:3,6:4,7:5,8:6,9:7,10:8,11:9,18:10}

In [67]:
for i in range(len(y)):
    y[i] = usefull[y[i]]

In [70]:
# X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state = 20222022,stratify=y, shuffle=True)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=64)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=64)

In [71]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [110]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average = 'macro')
    precision = precision_score(y_true=labels, y_pred=pred, average = 'macro')
    f1 = f1_score(y_true=labels, y_pred=pred, average = 'weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    output_dir="/output",
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=128,
    num_train_epochs=90,
    weight_decay=0.005,
    learning_rate = 6e-5,
    warmup_ratio=0.074,
    lr_scheduler_type = 'cosine',
    seed=20222022,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,    
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.989625,0.276978,0.187964,0.130081,0.15843
100,No log,1.44426,0.571942,0.422069,0.417964,0.552734
150,No log,0.965819,0.730216,0.516047,0.527816,0.70589
200,No log,0.71318,0.794964,0.652058,0.597923,0.778369
250,No log,0.618004,0.834532,0.732681,0.675255,0.823557
300,No log,0.612347,0.841727,0.856754,0.736925,0.836394
350,No log,0.608174,0.852518,0.893774,0.76505,0.848474
400,No log,0.614284,0.852518,0.890792,0.767477,0.848317
450,No log,0.616891,0.852518,0.891505,0.784251,0.849517
500,0.628200,0.611259,0.852518,0.891231,0.784251,0.849571


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=630, training_loss=0.5116640893239823, metrics={'train_runtime': 56.6777, 'train_samples_per_second': 2491.457, 'train_steps_per_second': 11.115, 'total_flos': 130316721696000.0, 'train_loss': 0.5116640893239823, 'epoch': 90.0})

In [23]:
# model.save_pretrained('bin(importance_classification)')

In [75]:
X_test = tokenizer(['простите я не поняла где над скачать =('], padding=True, truncation=True, max_length=128)

In [76]:
test_dataset = Dataset(X_test)

In [117]:
pred = []
y_true = []

for i in range(len(val_dataset)):
    data = val_dataset[i]  # Получаем i-ый пример из датасета
    true = data.pop('labels').item()  # Извлекаем метки из данных
    y_true.append(true)
    raw_pred = val_trainer.predict([data])  # Предсказываем метки для данного примера
    y_pred = torch.argmax(torch.softmax(torch.tensor(raw_pred.predictions), dim=-1), axis=1).item()  # Получаем предсказанные метки
    pred.append(y_pred)


In [118]:
f1_score(y_true=y_true, y_pred=pred, average = 'weighted')

0.8495705011934523

In [107]:
model.save_pretrained('0.86(usefull_classification)')
tokenizer.save_pretrained('0.86(usefull_classification)')

('0.86(usefull_classification)/tokenizer_config.json',
 '0.86(usefull_classification)/special_tokens_map.json',
 '0.86(usefull_classification)/vocab.txt',
 '0.86(usefull_classification)/added_tokens.json')

In [108]:
import os
import zipfile
tokenizer_path = '/kaggle/working/0.86(usefull_classification)'
model_path = '/kaggle/working/0.86(usefull_classification)'

archive_name = "0.86(usefull_classification).zip"
with zipfile.ZipFile(archive_name, "w") as archive:
    # Add model files
    archive.write(os.path.join(model_path, "config.json"), "config.json")
    archive.write(os.path.join(model_path, "model.safetensors"), "model.safetensors")
    # Add tokenizer files
    archive.write(os.path.join(tokenizer_path, "vocab.txt"), "vocab.txt")
    archive.write(os.path.join(tokenizer_path, "special_tokens_map.json"), "special_tokens_map.json")
    archive.write(os.path.join(tokenizer_path, "tokenizer_config.json"), "tokenizer_config.json")