### В этой ЛР просто обучим классификатор над предобученной моделью-эмбедерром

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from src.logger import logger

  from .autonotebook import tqdm as notebook_tqdm
2026-01-11 12:15:09.636210: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(np, "object"):


### Загрузим токенизатор и модель, заморозим ее параметры кроме головы-классификатора

In [2]:
model_name = "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5
)

for param in model.base_model.parameters():
    param.requires_grad = False

logger.info(f"Количество обучаемых параметров: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2026-01-11 12:15:13,619 - src.logger - 11 - INFO - Количество обучаемых параметров: 594,437


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

### Подготовим данные

In [3]:
df = pd.read_csv('labs_data/lab3_5/Documents topics (Politics 0, Sport 1, Technology 2, Entertainment 3, Business 4).csv')
df = df.dropna()
df

texts = df['token'].tolist()
labels = df['label'].values

texts_temp, texts_test, y_temp, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)
texts_train, texts_val, y_train, y_val = train_test_split(texts_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors='pt')
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = TextDataset(texts_train, y_train, tokenizer)
val_dataset = TextDataset(texts_val, y_val, tokenizer)
test_dataset = TextDataset(texts_test, y_test, tokenizer)

### Обучим модель

In [6]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=True,
    learning_rate=1e-3,
    weight_decay=1e-5,
    lr_scheduler_type="reduce_lr_on_plateau",
    dataloader_pin_memory=True,
    disable_tqdm=False,
    logging_strategy="epoch",
    logging_steps=1,  
    seed=42,
    fp16=torch.cuda.is_available(),
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metrics = {'accuracy': accuracy_score(labels, predictions)}
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["f1"] = f1

    return metrics

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3,
                                    early_stopping_threshold=0000.1)]

)

logger.info("Обучение модели (только голова классификатора)...")
trainer.train()

2026-01-11 12:17:05,598 - src.logger - 44 - INFO - Обучение модели (только голова классификатора)...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0815,0.254529,0.925843,0.935028,0.925843,0.926469
2,0.1074,0.257168,0.932584,0.938957,0.932584,0.930395
3,0.0793,0.122527,0.970787,0.970957,0.970787,0.970822
4,0.0642,0.201327,0.941573,0.944462,0.941573,0.940801


TrainOutput(global_step=168, training_loss=0.08310828038624354, metrics={'train_runtime': 61.3213, 'train_samples_per_second': 653.118, 'train_steps_per_second': 20.548, 'total_flos': 1405050880757760.0, 'train_loss': 0.08310828038624354, 'epoch': 4.0})

### Оценим модель

In [12]:
test_results = trainer.evaluate(eval_dataset=test_dataset)
test_accuracy = test_results['eval_accuracy']
logger.info(f"Точность  на тестовой выборке: {test_accuracy:.4f}")
logger.info(f"F1-мера  на тестовой выборке: {test_results['eval_f1']:.4f}")

2026-01-11 12:20:35,249 - src.logger - 3 - INFO - Точность  на тестовой выборке: 0.9146
2026-01-11 12:20:35,249 - src.logger - 4 - INFO - F1-мера  на тестовой выборке: 0.9111


### По итогу модель уже после 1 эпохи имеет достаточное качество классификации благодаря знаниям, уже заложеным в базовую модель-эмбеддер. Так что тут большого обучения и не требуется. Это в очередной раз доказывает эффективность архитектуры трансформеров