## Импорты и загрузка модели


In [2]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer)
from sklearn.metrics import f1_score

2025-06-14 14:29:20.524150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749911360.712142      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749911360.764653      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Скачивание и подготовка данных

In [3]:
!mkdir -p ~/.kaggle
!mv /kaggle/input/kaggle_json/other/default/1/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mv: cannot remove '/kaggle/input/kaggle_json/other/default/1/kaggle.json': Read-only file system


In [4]:
!kaggle competitions download -c ml-dl-practice

In [5]:
!unzip -q ml-dl-practice.zip

In [6]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [7]:
#df_train.head()

In [7]:
from sklearn.model_selection import train_test_split

# разобьём train на train/val
df_train_short, df_val = train_test_split(df_train, test_size=0.2, stratify=df_train.label, random_state=42)

In [8]:
# Fine-tuning RuBERT (DeepPavlov/rubert-base-cased)

# Конвертация в HuggingFace Dataset
ds_train = Dataset.from_pandas(df_train_short[['text','label']], preserve_index=False)
ds_val   = Dataset.from_pandas(df_val[['text','label']], preserve_index=False)
ds_test  = Dataset.from_pandas(df_test[['text']], preserve_index=False)

ds = DatasetDict({
    'train': ds_train,
    'validation': ds_val,
    'test': ds_test
})

model_name = "deepvk/RuModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at deepvk/RuModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize_fn(batch):
    return tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=128
    )

ds = ds.map(
    tokenize_fn,
    batched=True
)

Map:   0%|          | 0/192127 [00:00<?, ? examples/s]

Map:   0%|          | 0/48032 [00:00<?, ? examples/s]

Map:   0%|          | 0/60040 [00:00<?, ? examples/s]

In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    labels = labels.numpy() if isinstance(labels, torch.Tensor) else labels
    f1 = f1_score(labels, preds)
    return {"f1": f1}

## Обучение и выгрузка модели

In [14]:
# Объединяем train и validation
full_train_dataset = concatenate_datasets([ds['train'], ds['validation']])

# Параметры тренировки с сохранением после каждой эпохи
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=3750,
    save_strategy="epoch",
    logging_dir='./logs',
    report_to="none",                   # без внешних логгеров
    output_dir="bad_words-RuModernBERT-base", 
    push_to_hub=True
)

# Trainer без валидации
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset
)

In [15]:
trainer.train()

Step,Training Loss
3750,0.0431
7500,0.0215
11250,0.0089


TrainOutput(global_step=11259, training_loss=0.02448954126343577, metrics={'train_runtime': 10822.0456, 'train_samples_per_second': 66.575, 'train_steps_per_second': 1.04, 'total_flos': 6.137704765601741e+16, 'train_loss': 0.02448954126343577, 'epoch': 3.0})

In [16]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/MesserMMP/bad_words-RuModernBERT-base/commit/18bb838f20ae9c57306f0337a26c6dc104fcae55', commit_message='End of training', commit_description='', oid='18bb838f20ae9c57306f0337a26c6dc104fcae55', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MesserMMP/bad_words-RuModernBERT-base', endpoint='https://huggingface.co', repo_type='model', repo_id='MesserMMP/bad_words-RuModernBERT-base'), pr_revision=None, pr_num=None)

In [17]:
tokenizer.push_to_hub("bad_words-RuModernBERT-base")

README.md:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MesserMMP/bad_words-RuModernBERT-base/commit/23669966e16ff6c0b8d8f2f5fa24897361fc312d', commit_message='Upload tokenizer', commit_description='', oid='23669966e16ff6c0b8d8f2f5fa24897361fc312d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MesserMMP/bad_words-RuModernBERT-base', endpoint='https://huggingface.co', repo_type='model', repo_id='MesserMMP/bad_words-RuModernBERT-base'), pr_revision=None, pr_num=None)

## Инференс и сохранение результатов

In [23]:
preds_output = trainer_rubert.predict(ds['test'])
preds = preds_output.predictions.argmax(axis=-1)

BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Found Tesla P100-PCIE-16GB which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability 6.0

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [19]:
df_test = pd.read_csv('test.csv')
df_test['label'] = preds
df_test[['ID', 'label']].to_csv("submission_RuModernBERT-base7406.csv", index=False)

In [22]:
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/bad_words-RuModernBERT-base/checkpoint-7506")

# Trainer без валидации
trainer_rubert = Trainer(
    model=model,
)

In [37]:
preds_output_2 = trainer.predict(ds['test'])

In [38]:
test_logits = [preds_output.predictions, preds_output_2.predictions]

In [40]:
import numpy as np

avg_logits = np.mean(test_logits, axis=0)
preds  = np.argmax(avg_logits, axis=-1)

In [11]:
from huggingface_hub import login

# Введите ваш токен API
token = "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"

# Вход в Hugging Face Hub
login(token)