In [20]:
!pip install datasets



In [21]:
import os
import re
import time
import math
import pandas as pd
import numpy as np
import torch
import ast

from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, precision_score, recall_score

In [22]:
# Загружаем датасет из CSV-файла (предполагается, что файл находится в рабочей директории)
raw_dataset = load_dataset("csv", data_files={"data": "/kaggle/input/resume-dataset-csv/resume_dataset.csv"})

# Преобразуем столбец "labels" (представленный в виде строки) в список чисел (float)
def process_labels(example):
    example["labels"] = ast.literal_eval(example["labels"])
    example["labels"] = [float(x) for x in example["labels"]]
    return example

dataset = raw_dataset["data"].map(process_labels)
print("Размер датасета:", len(dataset))

Размер датасета: 727


## Разбиение датасета и токенизация

In [23]:
# Разбиваем датасет на обучающую и валидационную выборки (80/20)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Загружаем токенизатор для модели XLM-RoBERTa
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

## Вычисление pos_weight для балансировки классов

In [24]:
def compute_class_weights(dataset):
    total = len(dataset)
    num_labels = len(dataset[0]["labels"])
    label_sums = np.zeros(num_labels)
    for example in dataset:
        label_sums += np.array(example["labels"])
    epsilon = 1e-8
    pos_weight = (total - label_sums) / (label_sums + epsilon)
    return torch.tensor(pos_weight, dtype=torch.float)

# Вычисляем pos_weight для обучающего датасета
pos_weight = compute_class_weights(tokenized_datasets["train"])
print("pos_weight:", pos_weight)

pos_weight: tensor([9.0056e-02, 3.0142e-02, 2.2833e-01, 2.6667e+01, 3.5391e+00, 9.3667e-01,
        0.0000e+00, 2.8950e+02, 1.1520e+02, 9.5833e+01, 1.1946e-01, 1.2261e+00,
        4.7462e-01, 6.3544e+00, 1.5507e-01, 2.3977e+00, 3.4542e-03, 6.0000e+00,
        7.5441e+00, 7.1831e+00, 1.9267e+02, 7.8030e+00, 2.6541e+00, 2.3200e+00,
        1.2379e-01, 5.3298e-01, 1.5600e+01, 1.2195e-02, 1.0857e+01, 5.0633e-02,
        9.3750e+00, 1.6606e+01, 3.4180e-01, 5.8100e+10])


## Определение модели, функции model_init и compute_metrics

In [25]:
# Создаём конфигурацию модели для multi-label классификации с 34 метками
config = AutoConfig.from_pretrained(model_checkpoint, num_labels=34, problem_type="multi_label_classification")

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, config=config)

# Устанавливаем порог для классификации равным 0.5
THRESHOLD = 0.5

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))
    predictions = (probs > THRESHOLD).astype(int)
    micro_f1 = f1_score(labels, predictions, average="micro", zero_division=0)
    macro_f1 = f1_score(labels, predictions, average="macro", zero_division=0)
    composite_f1 = 2 * micro_f1 * macro_f1 / (micro_f1 + macro_f1) if (micro_f1 + macro_f1) > 0 else 0
    precision = precision_score(labels, predictions, average="micro", zero_division=0)
    recall = recall_score(labels, predictions, average="micro", zero_division=0)
    return {
        "eval_micro_f1": micro_f1,
        "eval_macro_f1": macro_f1,
        "eval_composite_f1": composite_f1,
        "eval_precision": precision,
        "eval_recall": recall
    }

## Определение кастомного Trainer (MyTrainer) с балансировкой классов

In [26]:
class MyTrainer(Trainer):
    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Приводим метки к float
        if "labels" in inputs:
            inputs["labels"] = inputs["labels"].float()
        outputs = model(**inputs)
        logits = outputs.logits
        # Переносим pos_weight на устройство логитов (GPU или CPU)
        pos_weight_device = self.pos_weight.to(logits.device) if self.pos_weight is not None else None
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_device)
        loss = loss_fct(logits, inputs["labels"])
        return (loss, outputs) if return_outputs else loss

## Определение фиксированных гиперпараметров и TrainingArguments

In [27]:
# Новые гиперпараметры
best_learning_rate = 5e-05
best_batch_size = 32
best_num_train_epochs = 10
best_weight_decay = 0.001

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",         # Автоматическое сохранение чекпойнтов по эпохам
    save_total_limit=1,            # Храним только один (лучший) чекпойнт
    learning_rate=best_learning_rate,
    per_device_train_batch_size=best_batch_size,
    per_device_eval_batch_size=best_batch_size,
    num_train_epochs=best_num_train_epochs,
    weight_decay=best_weight_decay,
    warmup_steps=500,              # Плавное увеличение learning rate
    logging_steps=10,              # Частое логирование для контроля обучения
    logging_strategy="steps",
    fp16=True,                   # Используем смешанную точность для экономии памяти
    load_best_model_at_end=True,   # После обучения автоматически загружается лучший чекпойнт
    metric_for_best_model="eval_composite_f1",  # Лучшая модель выбирается по composite_f1
    report_to="none"
)



## Финальное обучение и сохранение модели

In [28]:
trainer = MyTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    pos_weight=pos_weight  # Передаем вычисленные веса для балансировки классов
)

# Обучаем модель
trainer.train()

# Оцениваем модель на валидационном наборе
eval_results = trainer.evaluate()
print("Final evaluation results:")
print(eval_results)

  super().__init__(*args, **kwargs)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,Composite F1,Precision,Recall
1,0.7868,0.813079,0.362034,0.18561,0.245405,0.367775,0.35647
2,0.7696,0.81251,0.37011,0.189225,0.250419,0.374429,0.36589
3,0.7764,0.811741,0.374842,0.196537,0.257868,0.38208,0.367873
4,0.7768,0.809622,0.407891,0.218808,0.284825,0.40579,0.410015
5,0.7646,0.800886,0.42365,0.251271,0.315447,0.41763,0.429846
6,0.7568,0.800409,0.382979,0.219834,0.27933,0.377896,0.3882
7,0.7411,0.799118,0.4415,0.283633,0.345382,0.445285,0.437779
8,0.7474,0.79459,0.387081,0.239695,0.296059,0.384993,0.389192
9,0.7443,0.79462,0.442569,0.2546,0.323244,0.490646,0.403074
10,0.7446,0.777861,0.447205,0.326764,0.377613,0.519344,0.392662




Final evaluation results:
{'eval_micro_f1': 0.44720496894409945, 'eval_macro_f1': 0.32676359266905364, 'eval_composite_f1': 0.37761301830413546, 'eval_precision': 0.519344262295082, 'eval_recall': 0.3926623698562221, 'eval_loss': 0.7778606414794922, 'eval_runtime': 2.8767, 'eval_samples_per_second': 50.753, 'eval_steps_per_second': 1.043, 'epoch': 10.0}


# Ручное сохранение лучшей модели (которая загружена в trainer.model) и токенизатора
best_model_save_path = r"C:\Users\HP\Desktop\Хакатоны\ГазпромНефть_сервис_для_HR\Дообученная_модель"
trainer.save_model(best_model_save_path)
tokenizer.save_pretrained(best_model_save_path)
print(f"Best model saved to {best_model_save_path}")

# Архивирование папки с моделью в zip-файл для скачивания (Kaggle Output)
!zip -r best_model.zip "{best_model_save_path}"
print("best_model.zip created. Download it from the Output tab in Kaggle.")