In [2]:
import os
# Switch off Weights & Biases reporting before transformers is imported.
# NOTE: the library itself reports this variable as deprecated (to be removed in v5).
os.environ.update({"WANDB_DISABLED": "true"})

In [3]:
# train_classifier.py
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# ==== 1. Load the data ====

MODEL_NAME = "DeepPavlov/rubert-base-cased"
DATA_PATH = "/content/letters.csv"                # path to the dataset CSV
OUTPUT_DIR = "./rubert-letter-classifier-final"   # the trained model is saved here

# The file is read as semicolon-separated; switch to sep="," for a comma-delimited CSV.
df = pd.read_csv(DATA_PATH, sep=";")

# Everything below relies on a "text" column and a "label" column being present.
assert {"text", "label"}.issubset(df.columns), "Нужны колонки 'text' и 'label'"

# ==== 2. Class codes ====

# Sorting the distinct labels makes the label -> id assignment deterministic.
unique_labels = sorted(df["label"].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

# Integer target column used for stratification and training.
df["label_id"] = df["label"].map(label2id)

print("Классы:")
for class_name, class_id in label2id.items():
    print(f"{class_id}: {class_name}")

# ==== 3. Train/validation split ====

# Stratified 80/20 split on the integer class id; the fixed random_state makes
# the split reproducible between runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label_id"],
    random_state=42,
)

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column, so no post-hoc column removal is needed.
train_ds = Dataset.from_pandas(train_df[["text", "label_id"]], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[["text", "label_id"]], preserve_index=False)

# ==== 4. Tokenization ====

# Load the tokenizer for the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize the "text" entry of a batch with the module-level tokenizer.

    Every example is truncated to, and padded up to, exactly 256 tokens
    (``padding="max_length"``), so all encoded sequences have the same length.

    Args:
        batch: mapping with a "text" entry (presumably a list of strings, since
            this function is used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Tokenize each split in batches, then rename the target column to "labels".
train_ds = train_ds.map(tokenize_batch, batched=True).rename_column("label_id", "labels")
val_ds = val_ds.map(tokenize_batch, batched=True).rename_column("label_id", "labels")

# Expose only the listed columns, formatted as torch tensors (applied in place).
for split_ds in (train_ds, val_ds):
    split_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ==== 5. Model ====

# One output class per distinct label; both id <-> label mappings are handed to
# from_pretrained together with the class count.
num_labels = len(unique_labels)
label_config = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)

# ==== 6. Metrics ====

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples whose
        highest-scoring class (argmax over the last axis) equals the label.
    """
    logits, labels = eval_pred
    predicted_ids = np.asarray(logits).argmax(axis=-1)
    return {"accuracy": (predicted_ids == labels).mean()}

# ==== 7. Training arguments (simplified) ====

training_args = TrainingArguments(
    output_dir="./rubert-letter-classifier-checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=10,
    # Disable all logging integrations (W&B etc.) explicitly: the WANDB_DISABLED
    # environment variable is deprecated and slated for removal in transformers v5.
    report_to="none",
    # evaluation_strategy/save_strategy/... are deliberately left unset because
    # older transformers versions may not recognise them.
)

# ==== 8. Trainer ====

# Arguments that are spelled the same way in every transformers version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

# Recent transformers releases deprecate Trainer's `tokenizer` argument in favour
# of `processing_class`. Prefer the new name; an older release rejects the unknown
# keyword with TypeError, in which case the legacy name is used instead.
try:
    trainer = Trainer(processing_class=tokenizer, **trainer_kwargs)
except TypeError:
    trainer = Trainer(tokenizer=tokenizer, **trainer_kwargs)

# ==== 9. Training ====

trainer.train()

# Optional: score the fine-tuned model on the validation split given to the Trainer.
metrics = trainer.evaluate()
print(f"Validation metrics: {metrics}")

# ==== 10. Save the model ====

# Make sure the target directory exists, then write the model and the tokenizer
# into that same directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(OUTPUT_DIR)

print(f"Модель сохранена в {OUTPUT_DIR}")
print(f"id2label: {id2label}")


Классы:
0: Запрос информации/документов
1: Запрос на согласование
2: Официальная жалоба или претензия
3: Партнёрское предложение
4: Регуляторный запрос
5: Уведомление или информирование


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
10,1.7742
20,1.5881
30,1.3107
40,1.0555
50,0.8081
60,0.6741
70,0.6166




Validation metrics: {'eval_loss': 0.5153608918190002, 'eval_accuracy': 0.98, 'eval_runtime': 130.0009, 'eval_samples_per_second': 1.154, 'eval_steps_per_second': 0.146, 'epoch': 1.0}
Модель сохранена в ./rubert-letter-classifier-final
id2label: {0: 'Запрос информации/документов', 1: 'Запрос на согласование', 2: 'Официальная жалоба или претензия', 3: 'Партнёрское предложение', 4: 'Регуляторный запрос', 5: 'Уведомление или информирование'}
