In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Загрузка данных из CSV
df = pd.read_csv("labeled_rutoxic.csv", delimiter=',', header=0, names=['sentence', 'label'])

In [6]:
print('В наборе предложений: {:,}\n'.format(df.shape[0]))

В наборе предложений: 14,412



In [7]:
# Пример
df.sample(10)

Unnamed: 0,sentence,label
14389,"Как-будто, одно исключает другое. А ненавидеть...",1.0
6570,Да. Не совком а лопатой. Или совковой лопатой.\n,1.0
3544,"может я дибил, но когда тебе говорят иди в жоп...",1.0
4080,"ну, кстати, у Нивы новой куча болячек есть. Вс...",0.0
3539,И все это нужно для вывода бабла из инвесторов...,0.0
1250,Может вас просто собаки бесят?) меня не собаки...,0.0
5851,А вот и отсылочки к фильмам для дегенератов по...,1.0
8907,"Вопрос не в том, откуда идет тепло (очевидно, ...",0.0
14280,Напомните при союзе какие налоги были?\n,0.0
7404,Там подвеска вроде с одним контактным проводом...,0.0


In [8]:
print('toxic:', df[df['label'] > 0]['label'].count())
print('not toxic:', df[df['label'] < 1]['label'].count())

toxic: 4826
not toxic: 9586


In [9]:
# Разделение на обучающую и тестовую выборки
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [10]:
print(train_df.shape)
train_df

(11529, 2)


Unnamed: 0,sentence,label
0,"Я последнюю серию специально не смотрю, для ме...",0.0
1,"Это тяжёлая работа, но кто-то должен её делать...",0.0
2,"Рецептурные. Вообще у них много названий, само...",0.0
3,горные обезьяны это типа киргизы что ли?\n,1.0
4,"У меня он в холодильнике хранился, не завонял....",0.0
...,...,...
11524,Ну не знаю. А откуда тогда машин столько у люд...,0.0
11525,А у меня кандидатка завалилась на электрохроми...,1.0
11526,Второй сезон полная хуета!\n,0.0
11527,"Мда... Хотел просто спасибо сказать, но и тут ...",1.0


In [11]:
test_df.shape

(2883, 2)

In [12]:
# Определение класса Dataset для загрузки данных
class ToxicCommentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [13]:
# Инициализация токенизатора и модели BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Создание DataLoader для обучающей и тестовой выборок
train_dataset = ToxicCommentDataset(train_df['sentence'], train_df['label'], tokenizer, max_length=128)
test_dataset = ToxicCommentDataset(test_df['sentence'], test_df['label'], tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [15]:
# Инициализация оптимизатора и функции потерь
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()



In [16]:
# Обучение модели
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Оценка модели на тестовой выборке
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Вывод метрик на каждой эпохе
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)
    print(f'Epoch {epoch + 1}/{num_epochs}, Accuracy: {accuracy:.4f}\n{report}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Accuracy: 0.8148
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      1944
           1       0.79      0.59      0.67       939

    accuracy                           0.81      2883
   macro avg       0.81      0.76      0.77      2883
weighted avg       0.81      0.81      0.81      2883

Epoch 2/3, Accuracy: 0.8644
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1944
           1       0.80      0.78      0.79       939

    accuracy                           0.86      2883
   macro avg       0.85      0.84      0.84      2883
weighted avg       0.86      0.86      0.86      2883

Epoch 3/3, Accuracy: 0.8668
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1944
           1       0.86      0.70      0.77       939

    accuracy                           0.87      2883
   macro avg       0.87      0.82      0.84 

In [None]:
tokenizer.save_pretrained('comment_classifier')
model.save_pretrained('comment_classifier')

# Проверка

In [None]:
tokenizer = BertTokenizer.from_pretrained('comment_classifier')
model = BertForSequenceClassification.from_pretrained('comment_classifier')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Пример текста для проверки
sample_text = "Отлично выглядишь"

# Токенизация и подготовка текста для модели
encoded_text = tokenizer(sample_text, return_tensors='pt', truncation=True, padding=True, max_length=128)
input_ids = encoded_text['input_ids'].to(device)
attention_mask = encoded_text['attention_mask'].to(device)

# Предсказание
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)

# Получение предсказания
logits = output.logits
predicted_class = torch.argmax(logits, dim=1).item()

# Вывод результата
if predicted_class == 1:
    print("Текст токсичен.")
else:
    print("Текст не токсичен.")

Текст не токсичен.


In [None]:
df["toxic"].value_counts()

toxic
0.0    9586
1.0    4826
Name: count, dtype: int64