In [None]:
import ast
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, precision_score, recall_score
import optuna
import os

## 1. Загрузка и подготовка данных

In [None]:
raw_dataset = load_dataset("csv", data_files={"data": "/kaggle/input/resume/resume_dataset.csv"})

def process_labels(example):
    example["labels"] = ast.literal_eval(example["labels"])
    example["labels"] = [float(x) for x in example["labels"]]
    return example

dataset = raw_dataset["data"].map(process_labels)
print("Dataset size:", len(dataset))

split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

## 2. Токенизация

In [None]:
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

## 3. Вычисление весов классов

In [None]:
def compute_class_weights(dataset):
    total = len(dataset)
    num_labels = len(dataset[0]["labels"])
    label_sums = np.zeros(num_labels)
    for example in dataset:
        label_sums += np.array(example["labels"])
    epsilon = 1e-8
    pos_weight = (total - label_sums) / (label_sums + epsilon)
    return torch.tensor(pos_weight, dtype=torch.float)

pos_weight = compute_class_weights(tokenized_datasets["train"])
print("pos_weight:", pos_weight)

## 4. Инициализация модели

In [None]:
def model_init():
    config = AutoConfig.from_pretrained(model_checkpoint, num_labels=34, problem_type="multi_label_classification")
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, config=config)

# Глобальная переменная порога (будет обновляться в Optuna)
THRESHOLD = 0.45

## 5. Функция вычисления метрик

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))
    predictions = (probs > THRESHOLD).astype(int)
    micro_f1 = f1_score(labels, predictions, average="micro", zero_division=0)
    macro_f1 = f1_score(labels, predictions, average="macro", zero_division=0)
    # Вычисляем F1 по заданной формуле (используем eval_precision и eval_recall)
    precision = precision_score(labels, predictions, average="micro", zero_division=0)
    recall = recall_score(labels, predictions, average="micro", zero_division=0)
    if precision + recall > 0:
        f1_custom = 2 * (precision * recall) / (precision + recall)
    else:
        f1_custom = 0
    return {
         "eval_micro_f1": micro_f1,
         "eval_macro_f1": macro_f1,
         "eval_f1": f1_custom,  # Основной оптимизируемый показатель
         "eval_precision": precision,
         "eval_recall": recall
    }

## 6. Кастомный Trainer (без сохранения промежуточных чекпойнтов)

In [None]:
class MyTrainer(Trainer):
    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        if "labels" in inputs:
            inputs["labels"] = inputs["labels"].float()
        outputs = model(**inputs)
        logits = outputs.logits
        pos_weight_device = self.pos_weight.to(logits.device) if self.pos_weight is not None else None
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_device)
        loss = loss_fct(logits, inputs["labels"])
        return (loss, outputs) if return_outputs else loss

## 7. Оптимизация с помощью Optuna (1 trial, эпохи от 10 до 14)

In [None]:
def objective(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 1.5e-05, 2.5e-05)
    weight_decay = trial.suggest_loguniform("weight_decay", 0.001, 0.002)
    batch_size = trial.suggest_categorical("batch_size", [16])
    num_train_epochs = trial.suggest_int("num_train_epochs", 10, 14)
    threshold = trial.suggest_uniform("threshold", 0.46, 0.48)
    
    global THRESHOLD
    THRESHOLD = threshold
    
    training_args_trial = TrainingArguments(
        output_dir="./results_trial",
        eval_strategy="epoch",
        save_strategy="no",  # Не сохраняем промежуточные чекпойнты
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        warmup_steps=500,
        logging_steps=10,
        fp16=True,
        report_to="none"
    )
    
    model = model_init()
    
    trainer_trial = MyTrainer(
        model=model,
        args=training_args_trial,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        pos_weight=pos_weight
    )
    
    trainer_trial.train()
    eval_results = trainer_trial.evaluate()
    f1_custom = eval_results["eval_f1"]
    print(f"Trial finished: f1: {f1_custom:.4f}, params: {trial.params}")
    return f1_custom

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Best trial:")
best_trial = study.best_trial
print("  Value: ", best_trial.value)
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

## 8. Обучение лучшей модели с найденными гиперпараметрами и сохранение только лучшей модели

In [None]:
best_lr = best_trial.params["learning_rate"]
best_wd = best_trial.params["weight_decay"]
best_bs = best_trial.params["batch_size"]
best_epochs = best_trial.params["num_train_epochs"]
best_thresh = best_trial.params["threshold"]

THRESHOLD = best_thresh

training_args_best = TrainingArguments(
    output_dir="./best_model_results",
    eval_strategy="epoch",
    save_strategy="epoch",  # Сохраняем чекпойнт после каждой эпохи
    save_total_limit=1,     # Храним только лучший чекпойнт
    learning_rate=best_lr,
    per_device_train_batch_size=best_bs,
    per_device_eval_batch_size=best_bs,
    num_train_epochs=best_epochs,
    weight_decay=best_wd,
    warmup_steps=500,
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,  # После обучения загружается лучшая модель по метрике eval_f1
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    report_to="none"
)

trainer_best = MyTrainer(
    model_init=model_init,
    args=training_args_best,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    pos_weight=pos_weight
)

trainer_best.train()
final_eval = trainer_best.evaluate()
print("Final evaluation results:", final_eval)

# Сохраняем только лучшую модель и токенизатор в папку "Rock"
best_model_save_path = "./Rock"
trainer_best.save_model(best_model_save_path)
tokenizer.save_pretrained(best_model_save_path)
print(f"Best model saved to {best_model_save_path}")

# Архивируем модель в zip-файл "Rock.zip"
!zip -r Rock.zip "{best_model_save_path}"
print("Rock.zip created. Download it from the Output tab in Kaggle.")

Generating data split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/727 [00:00<?, ? examples/s]

Dataset size: 727


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/581 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

[I 2025-04-07 21:19:21,082] A new study created in memory with name: no-name-0b553d53-fcda-4d8e-8819-b7109790cc4c


pos_weight: tensor([9.0056e-02, 3.0142e-02, 2.2833e-01, 2.6667e+01, 3.5391e+00, 9.3667e-01,
        0.0000e+00, 2.8950e+02, 1.1520e+02, 9.5833e+01, 1.1946e-01, 1.2261e+00,
        4.7462e-01, 6.3544e+00, 1.5507e-01, 2.3977e+00, 3.4542e-03, 6.0000e+00,
        7.5441e+00, 7.1831e+00, 1.9267e+02, 7.8030e+00, 2.6541e+00, 2.3200e+00,
        1.2379e-01, 5.3298e-01, 1.5600e+01, 1.2195e-02, 1.0857e+01, 5.0633e-02,
        9.3750e+00, 1.6606e+01, 3.4180e-01, 5.8100e+10])


  learning_rate = trial.suggest_loguniform("learning_rate", 1.5e-05, 2.5e-05)
  weight_decay = trial.suggest_loguniform("weight_decay", 0.001, 0.002)
  threshold = trial.suggest_uniform("threshold", 0.46, 0.48)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,F1,Precision,Recall
1,0.7749,0.80336,0.529515,0.397882,0.529515,0.390883,0.820526
2,0.8201,0.802669,0.529685,0.398018,0.529685,0.391068,0.820526
3,0.7511,0.806021,0.593634,0.408965,0.593634,0.473575,0.79524
4,0.771,0.798266,0.580411,0.415773,0.580411,0.445387,0.83292
5,0.7883,0.799872,0.60398,0.429615,0.60398,0.497909,0.767476
6,0.7555,0.799897,0.609932,0.426891,0.609932,0.49371,0.797719
7,0.7525,0.814325,0.614356,0.435988,0.614356,0.509126,0.774417
8,0.77,0.809771,0.645767,0.454893,0.645767,0.540622,0.801686
9,0.7886,0.804306,0.678496,0.462163,0.678496,0.591093,0.796232
10,0.7552,0.791959,0.67947,0.46204,0.67947,0.612415,0.763014




[I 2025-04-07 21:23:53,181] Trial 0 finished with value: 0.6954938552571689 and parameters: {'learning_rate': 1.9768352923720516e-05, 'weight_decay': 0.0013309036353077938, 'batch_size': 16, 'num_train_epochs': 12, 'threshold': 0.46269254347612143}. Best is trial 0 with value: 0.6954938552571689.


Trial finished: f1: 0.6955, params: {'learning_rate': 1.9768352923720516e-05, 'weight_decay': 0.0013309036353077938, 'batch_size': 16, 'num_train_epochs': 12, 'threshold': 0.46269254347612143}


  learning_rate = trial.suggest_loguniform("learning_rate", 1.5e-05, 2.5e-05)
  weight_decay = trial.suggest_loguniform("weight_decay", 0.001, 0.002)
  threshold = trial.suggest_uniform("threshold", 0.46, 0.48)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,F1,Precision,Recall
1,0.7789,0.792225,0.443621,0.305038,0.443621,0.344384,0.623203
2,0.8266,0.791836,0.443621,0.305038,0.443621,0.344384,0.623203
3,0.7585,0.795948,0.482553,0.368795,0.482553,0.362035,0.723352
4,0.7801,0.838335,0.583957,0.404741,0.583957,0.455886,0.812097
5,0.7958,0.838281,0.61479,0.418979,0.61479,0.488739,0.828458
6,0.7662,0.826467,0.630265,0.429782,0.630265,0.504927,0.838374
7,0.7602,0.824663,0.602195,0.422562,0.602195,0.481845,0.802677
8,0.7565,0.811195,0.629459,0.415227,0.629459,0.513294,0.813585
9,0.7951,0.822581,0.642353,0.425836,0.642353,0.541073,0.790283
10,0.7879,0.807964,0.676682,0.450097,0.676682,0.617683,0.748141




[I 2025-04-07 21:29:06,655] Trial 1 finished with value: 0.680952380952381 and parameters: {'learning_rate': 1.935522882724049e-05, 'weight_decay': 0.0012100309698722834, 'batch_size': 16, 'num_train_epochs': 14, 'threshold': 0.47022289494356106}. Best is trial 0 with value: 0.6954938552571689.


Trial finished: f1: 0.6810, params: {'learning_rate': 1.935522882724049e-05, 'weight_decay': 0.0012100309698722834, 'batch_size': 16, 'num_train_epochs': 14, 'threshold': 0.47022289494356106}


  learning_rate = trial.suggest_loguniform("learning_rate", 1.5e-05, 2.5e-05)
  weight_decay = trial.suggest_loguniform("weight_decay", 0.001, 0.002)
  threshold = trial.suggest_uniform("threshold", 0.46, 0.48)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,F1,Precision,Recall
1,0.7687,0.811689,0.536068,0.372062,0.536068,0.412741,0.764502
2,0.8243,0.811129,0.536602,0.372183,0.536602,0.41323,0.764998
3,0.7536,0.810476,0.536416,0.372062,0.536416,0.413009,0.764998
4,0.7826,0.805453,0.552078,0.381051,0.552078,0.437609,0.747645
5,0.8001,0.797942,0.623083,0.412506,0.623083,0.525349,0.765493
6,0.7655,0.802421,0.646513,0.450257,0.646513,0.516444,0.864155
7,0.7529,0.804853,0.649579,0.450671,0.649579,0.514949,0.879524
8,0.7636,0.794651,0.715002,0.465661,0.715002,0.617913,0.84829
9,0.7887,0.792752,0.709064,0.470259,0.709064,0.61782,0.831929
10,0.7581,0.781533,0.695452,0.45662,0.695452,0.647839,0.75062




[I 2025-04-07 21:32:51,181] Trial 2 finished with value: 0.6954524575103354 and parameters: {'learning_rate': 1.984973147517844e-05, 'weight_decay': 0.0011163000423725223, 'batch_size': 16, 'num_train_epochs': 10, 'threshold': 0.4795055992267837}. Best is trial 0 with value: 0.6954938552571689.


Trial finished: f1: 0.6955, params: {'learning_rate': 1.984973147517844e-05, 'weight_decay': 0.0011163000423725223, 'batch_size': 16, 'num_train_epochs': 10, 'threshold': 0.4795055992267837}


  learning_rate = trial.suggest_loguniform("learning_rate", 1.5e-05, 2.5e-05)
  weight_decay = trial.suggest_loguniform("weight_decay", 0.001, 0.002)
  threshold = trial.suggest_uniform("threshold", 0.46, 0.48)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,F1,Precision,Recall
1,0.7786,0.795593,0.512531,0.378553,0.512531,0.38523,0.765493
2,0.8248,0.795222,0.510785,0.379842,0.510785,0.382396,0.768964
3,0.7603,0.79456,0.502835,0.379025,0.502835,0.373436,0.76946
4,0.7816,0.79542,0.570731,0.389894,0.570731,0.459468,0.753099
5,0.7906,0.794237,0.605126,0.399301,0.605126,0.507558,0.749132
6,0.7534,0.790742,0.579825,0.392888,0.579825,0.483604,0.723847
7,0.7638,0.795398,0.618434,0.421323,0.618434,0.515192,0.773426
8,0.7558,0.79693,0.626327,0.421142,0.626327,0.547716,0.731284
9,0.79,0.793508,0.629007,0.411611,0.629007,0.574059,0.695588
10,0.7534,0.779116,0.677583,0.443494,0.677583,0.633463,0.728309




[I 2025-04-07 21:38:04,583] Trial 3 finished with value: 0.6792452830188679 and parameters: {'learning_rate': 1.775451565900352e-05, 'weight_decay': 0.0010565936969682268, 'batch_size': 16, 'num_train_epochs': 14, 'threshold': 0.4722450053273777}. Best is trial 0 with value: 0.6954938552571689.


Trial finished: f1: 0.6792, params: {'learning_rate': 1.775451565900352e-05, 'weight_decay': 0.0010565936969682268, 'batch_size': 16, 'num_train_epochs': 14, 'threshold': 0.4722450053273777}


  learning_rate = trial.suggest_loguniform("learning_rate", 1.5e-05, 2.5e-05)
  weight_decay = trial.suggest_loguniform("weight_decay", 0.001, 0.002)
  threshold = trial.suggest_uniform("threshold", 0.46, 0.48)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,F1,Precision,Recall
1,0.7687,0.811706,0.533379,0.371515,0.533379,0.409562,0.764502
2,0.8243,0.811241,0.534372,0.371669,0.534372,0.410591,0.764998
3,0.7539,0.810753,0.535671,0.371898,0.535671,0.412126,0.764998
4,0.7824,0.808455,0.532653,0.370293,0.532653,0.412937,0.750124
5,0.7992,0.798293,0.609328,0.403607,0.609328,0.513965,0.748141
6,0.7698,0.803784,0.638646,0.438038,0.638646,0.511012,0.851264
7,0.7547,0.807223,0.645405,0.449447,0.645405,0.506935,0.887952
8,0.7659,0.805119,0.705287,0.461114,0.705287,0.596233,0.863163
9,0.79,0.799479,0.700141,0.464979,0.700141,0.588454,0.864155
10,0.7705,0.797151,0.707658,0.448714,0.707658,0.64837,0.77888




[I 2025-04-07 21:43:17,845] Trial 4 finished with value: 0.6803519061583578 and parameters: {'learning_rate': 1.6466692666350026e-05, 'weight_decay': 0.0012022180401906955, 'batch_size': 16, 'num_train_epochs': 14, 'threshold': 0.4791430511285427}. Best is trial 0 with value: 0.6954938552571689.


Trial finished: f1: 0.6804, params: {'learning_rate': 1.6466692666350026e-05, 'weight_decay': 0.0012022180401906955, 'batch_size': 16, 'num_train_epochs': 14, 'threshold': 0.4791430511285427}
Best trial:
  Value:  0.6954938552571689
  Params: 
    learning_rate: 1.9768352923720516e-05
    weight_decay: 0.0013309036353077938
    batch_size: 16
    num_train_epochs: 12
    threshold: 0.46269254347612143


  super().__init__(*args, **kwargs)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,F1,Precision,Recall
1,0.7689,0.813504,0.494814,0.363229,0.494814,0.373391,0.733267
2,0.8224,0.813217,0.496225,0.363229,0.496225,0.375,0.733267
3,0.7498,0.815925,0.516815,0.380462,0.516815,0.395172,0.746653
4,0.7735,0.819981,0.548018,0.397034,0.548018,0.425096,0.770947
5,0.7983,0.81658,0.574476,0.41043,0.574476,0.452063,0.787804
6,0.7547,0.815677,0.56686,0.417941,0.56686,0.447376,0.773426
7,0.7487,0.819297,0.590204,0.441485,0.590204,0.457775,0.830441
8,0.7593,0.815938,0.666923,0.462913,0.666923,0.544371,0.860684
9,0.7928,0.809999,0.694073,0.489557,0.694073,0.580828,0.862172
10,0.7548,0.791652,0.691312,0.487775,0.691312,0.590112,0.834408




Final evaluation results: {'eval_micro_f1': 0.6940730393135103, 'eval_macro_f1': 0.4895570593968406, 'eval_f1': 0.6940730393135103, 'eval_precision': 0.5808283233132933, 'eval_recall': 0.8621715418939019, 'eval_loss': 0.8099992275238037, 'eval_runtime': 1.6517, 'eval_samples_per_second': 88.393, 'eval_steps_per_second': 3.027, 'epoch': 12.0}
Best model saved to ./Rock
  adding: Rock/ (stored 0%)
  adding: Rock/config.json (deflated 67%)
  adding: Rock/sentencepiece.bpe.model (deflated 49%)
  adding: Rock/training_args.bin (deflated 51%)
  adding: Rock/special_tokens_map.json (deflated 52%)
  adding: Rock/model.safetensors (deflated 30%)
  adding: Rock/tokenizer.json (deflated 76%)
  adding: Rock/tokenizer_config.json (deflated 76%)
Rock.zip created. Download it from the Output tab in Kaggle.


In [2]:
print("Current THRESHOLD:", THRESHOLD)

Current THRESHOLD: 0.46269254347612143
