In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("✅ DEVICE:", DEVICE)

✅ DEVICE: cuda


In [4]:
"""
Загрузка и подготовка данных:
    ALLOWED_LABELS -> список доступных классов (на тот случай если попадуться грязные метки)
    label_encoder -> кодировка дэйблов
    assert (...) -> метод для обнаружения лишних классов
"""

trainDf = pd.read_csv("/home/retro0/cyberspace/projects/full-stack/diplom-2025/mediaModeration/research/labeledProfanity.csv")
trainDf["labels"] = trainDf["labels"].str.lower().str.strip()

ALLOWED_LABELS = ["profanity", "non profanity"]
trainDf = trainDf[trainDf["labels"].isin(ALLOWED_LABELS)].copy()

print("✅ Чистые метки:", trainDf["labels"].unique())

label_encoder = LabelEncoder()
label_encoder.fit(ALLOWED_LABELS)
trainDf["labels"] = label_encoder.transform(trainDf["labels"])

assert set(trainDf["labels"].unique()) == {0, 1}, "❌ Обнаружены лишние классы!"

"""Разделение на train/val"""
train_data, val_data = train_test_split(trainDf, test_size=0.1, stratify=trainDf["labels"], random_state=42)
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

✅ Чистые метки: ['profanity' 'non profanity']


In [5]:
"""
Токенизация:
    modelRubert -> инициализация модели
    tokenizer -> загрузка предобученной модели
    tokenize_function -> метод токенизации
"""

modelRubert = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(modelRubert)

def tokenize_function(example):
    return tokenizer(
        example["words"],
        truncation=True,
        padding="max_length",
        max_length=16
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 103734/103734 [00:01<00:00, 58998.15 examples/s]
Map: 100%|██████████| 11527/11527 [00:00<00:00, 46015.14 examples/s]


In [6]:
"""
Изменение модели
    AutoModelForSequenceClassification -> переводим на задачу классификации текста (меняем голову на два класса)
"""
model = AutoModelForSequenceClassification.from_pretrained(
    modelRubert,
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
"""
Метрики
    computeMetrics -> вычисляем метрики нашей модели
"""
def computeMetrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

In [8]:
"""
Обучение
    training_args -> инициализация тренировочных аргументов
"""
training_args = TrainingArguments(
    output_dir="/home/retro0/cyberspace/projects/full-stack/diplom-2025/mediaModeration/research/model-bert/rubert-obscene-detector",
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_dir="/home/retro0/cyberspace/projects/full-stack/diplom-2025/mediaModeration/research/model-bert/logs",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    num_train_epochs=4,
    fp16=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True
)



In [9]:
"""
Обучение
    Trainer -> передача данных для обучения
"""
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=computeMetrics
)

  trainer = Trainer(


In [10]:
"""
Начало обучения
    trainer.train() -> начало обучения
    trainer.evaluate() -> расчёт метрик
"""
trainer.train()
metrics = trainer.evaluate()
print("\n📊 Final Validation Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0667,0.008809,0.998178,0.997919,0.998438,0.998179
2,0.011,0.005303,0.998872,0.998267,0.999479,0.998873
3,0.0053,0.004948,0.998699,0.997576,0.999826,0.9987
4,0.0028,0.004945,0.998785,0.997749,0.999826,0.998787


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


📊 Final Validation Metrics:
eval_loss: 0.0049
eval_accuracy: 0.9988
eval_precision: 0.9977
eval_recall: 0.9998
eval_f1: 0.9988
eval_runtime: 10.1209
eval_samples_per_second: 1138.9260
eval_steps_per_second: 142.3780
epoch: 4.0000


In [25]:
"""
Сохранение модели:

"""
saveBertPath = '/home/retro0/cyberspace/projects/full-stack/diplom-2025/mediaModeration/app/back/models/bert/model.pt'

torch.save({
    "model_state_dict": model.state_dict(),
    "model_class": model.__class__,
    "model_config": model.config,
    "tokenizer": tokenizer,
}, saveBertPath)

print(f"\n💾 Модель успешно сохранена по пути: {saveBertPath}")


💾 Модель успешно сохранена по пути: /home/retro0/cyberspace/projects/full-stack/diplom-2025/mediaModeration/app/back/models/bert/model.pt


In [15]:
print(trainDf['labels'].value_counts())

labels
0    57640
1    57621
Name: count, dtype: int64


In [16]:
modelEval = trainer.model
modelEval.eval()
modelEval.to(DEVICE)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [19]:
import torch.nn.functional as F

In [20]:
# tokenizer у тебя уже загружен как переменная `tokenizer`
id2label = {0: "nonprofanity", 1: "profanity"}

def predict(text: str):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=16
    )
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][pred_id].item()

    return {
        "label": id2label[pred_id],
        "confidence": round(confidence, 4)
    }


In [21]:
# 🧪 Пример:
word = "сололевел"
result = predict(word)
print(f"🧠 Класс: {result['label']}, уверенность: {result['confidence']}")

🧠 Класс: nonprofanity, уверенность: 1.0


In [30]:
import torch
from transformers import AutoModelForSequenceClassification

# Путь к .pt файлу
load_path = '/home/retro0/cyberspace/projects/full-stack/diplom-2025/mediaModeration/app/back/models/bert/model.pt'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Загружаем чекпоинт
checkpoint = torch.load(load_path, map_location=DEVICE)

# Воссоздаём модель и загружаем веса
model = AutoModelForSequenceClassification.from_config(checkpoint["model_config"])
model.load_state_dict(checkpoint["model_state_dict"])
model.to(DEVICE)
model.eval()

# Загружаем токенизатор
tokenizer = checkpoint["tokenizer"]

# Сопоставление id -> метка
id2label = {0: "nonprofanity", 1: "profanity"}

  checkpoint = torch.load(load_path, map_location=DEVICE)


In [None]:

        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=16
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=-1).item()

    return id2label[predicted_class_id]

text = "мамзель"
print(f"0- non, 1 - profanity: {text} is {predict_label(text)}")

0- non, 1 - profanity: мамзель is nonprofanity


In [None]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()