In [159]:
import pandas as pd
import numpy as np
import random
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import classification_report, f1_score

# 1. Загрузка модели и токенизатора RuBERT

model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=6).to("cuda")
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [160]:
from sklearn.model_selection import train_test_split
import os

# 2. Функция для разбиения текста на чанки по 512 токенов
def split_to_chunks(text, tokenizer, max_length=512):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk = tokens[i:i + max_length]
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk_text)
    return chunks

# 3. Загрузка данных и разбиение на чанки
train_path = "authors"  # Путь к файлам с авторами
texts, labels = [], []

# Чтение файлов и разделение на чанки
for filename in os.listdir(train_path):
    if filename.endswith(".txt"):
        author = filename.replace(".txt", "")
        with open(os.path.join(train_path, filename), "r", encoding="utf-8") as f:
            text = f.read().strip()
        chunks = split_to_chunks(text, tokenizer)
        for chunk in chunks:
            texts.append(chunk)
            labels.append(author)

# Преобразование данных в DataFrame
train_data = pd.DataFrame({"text": texts, "author": labels})

# 4. Преобразуем текст в формат для модели
train_data['labels'] = train_data['author'].factorize()[0]  # Преобразуем метки авторов в числа
train_data = train_data.drop('author', axis=1)



# Разбиение на train/test
train_texts, val_texts, train_labels, val_labels = train_test_split(train_data['text'], train_data['labels'], test_size=0.1)

# Преобразование в формат Hugging Face Datasets
train_dataset = Dataset.from_dict({"text": train_texts, "labels": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "labels": val_labels})

# 5. Токенизация
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4896 [00:00<?, ? examples/s]

Map:   0%|          | 0/545 [00:00<?, ? examples/s]

In [None]:
# 6. Определение аргументов для обучения
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    dataloader_num_workers=2,
    fp16=True
)

# 7. Создание Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# 8. Обучение модели
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1805,0.286812


In [None]:
def predict_author(text, tokenizer, model):
    chunks = split_to_chunks(text, tokenizer)
    predictions = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = inputs.to("cuda")
        with torch.no_grad():
            outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=1).item()
            predictions.append(pred)
    final_pred = max(set(predictions), key=predictions.count)  # голосование
    return final_pred

# Применяем модель к файлам без меток авторов
test_path = "train"  # Путь к файлам без авторов
results = []

for filename in os.listdir(test_path):
    if filename.endswith(".txt"):
        with open(os.path.join(test_path, filename), "r", encoding="utf-8") as f:
            text = f.read().strip()
        predicted_label = predict_author(text, tokenizer, model)
        results.append({"filename": filename, "predicted_author": predicted_label})

# Записываем результаты в CSV
results_df = pd.DataFrame(results)
results_df.to_csv("predictions.csv", index=False)

tensor([[-1.3721,  7.8867, -1.9844, -2.1152, -1.7207, -1.9756]],
       device='cuda:0')
tensor([[-1.4316,  7.8828, -2.0117, -2.0781, -1.7285, -1.9150]],
       device='cuda:0')
tensor([[ 1.1855, -3.1348, -0.1514,  7.1953, -1.4062, -2.8750]],
       device='cuda:0')
tensor([[-1.1885,  7.8906, -1.8213, -2.1699, -1.7549, -2.1641]],
       device='cuda:0')
tensor([[ 0.3420,  7.6133, -2.3789, -2.3027, -1.6885, -2.6211]],
       device='cuda:0')
tensor([[-1.8291, -1.7988, -1.4121, -2.1406,  8.7812, -1.9473]],
       device='cuda:0')
tensor([[-1.8154, -1.9219, -1.4590, -2.0781,  8.7578, -1.8330]],
       device='cuda:0')
tensor([[-1.8359, -1.7979, -1.4561, -2.1777,  8.7734, -1.8516]],
       device='cuda:0')
tensor([[-1.8438, -1.7920, -1.4160, -2.1641,  8.7734, -1.9102]],
       device='cuda:0')
tensor([[-1.7812, -1.8184, -1.4092, -2.1504,  8.7891, -1.9561]],
       device='cuda:0')
tensor([[-1.8262, -1.8154, -1.4658, -2.1699,  8.7656, -1.8428]],
       device='cuda:0')
tensor([[-1.9014, -1.

In [158]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# === 1. Загрузка данных ===

pred_df = pd.read_csv("predicted_authors_2.csv")
true_df = pd.read_csv("corrects.csv")  # Твой файл с filename и author

# Объединим по имени файла
merged = pd.merge(pred_df, true_df, on='filename')
merged.columns = ['filename', 'predicted_author', 'true_author']

y_true = merged['true_author']
y_pred = merged['predicted_author']

# === 2. Метрики ===

print("🔎 Accuracy:", accuracy_score(y_true, y_pred))
print("🔎 F1-score (macro):", f1_score(y_true, y_pred, average='macro'))

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred))

# === 3. Матрица ошибок ===

labels = sorted(y_true.unique())

cm = confusion_matrix(y_true, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.title("Матрица ошибок")
plt.tight_layout()
plt.show()

# === 4. Бар-чарты для F1-score по каждому классу ===

report = classification_report(y_true, y_pred, output_dict=True)
f1_scores = {label: report[label]['f1-score'] for label in labels}

plt.figure(figsize=(10, 6))
sns.barplot(x=list(f1_scores.keys()), y=list(f1_scores.values()), palette='viridis')
plt.ylabel('F1-score')
plt.title('F1-score по авторам')
plt.ylim(0, 1.05)
plt.tight_layout()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'predicted_authors_2.csv'

In [None]:
print(train_data['labels'])

0       0
1       0
2       0
3       0
4       0
       ..
5436    5
5437    5
5438    5
5439    5
5440    5
Name: labels, Length: 5441, dtype: int64


-----------------------
second version
-----------------------

In [None]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
# 1. Подготовка данных

# Папки с текстами
authors_dir = "authors"
test_dir = "train"

# Считываем данные из файлов в папке 'authors'
author_files = os.listdir(authors_dir)
author_texts = []
labels = []

for idx, author_file in enumerate(author_files):
    with open(os.path.join(authors_dir, author_file), "r", encoding="utf-8") as f:
        text = f.read()
        author_texts.append(text)
        labels.append(author_file.split(".")[0])  # Фамилия автора

# Создаем DataFrame для обучения
train_data = pd.DataFrame({
    "text": author_texts,
    "label": labels
})

In [None]:
# Считываем данные из файлов в папке 'test'
test_files = os.listdir(test_dir)
test_texts = []

for test_file in test_files:
    with open(os.path.join(test_dir, test_file), "r", encoding="utf-8") as f:
        text = f.read()
        test_texts.append(text)

In [None]:
# 2. Преобразуем в формат, понятный модели
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

# Токенизируем текст
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=5000)

# Преобразуем данные в формат, который использует Hugging Face datasets
train_dataset = Dataset.from_pandas(train_data)
train_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# 3. Преобразуем метки в числа (для классификации)
label_map = {label: idx for idx, label in enumerate(train_data['label'].unique())}
train_dataset = train_dataset.map(lambda x: {'label': label_map[str(x['label'])]}, batched=True)

# 4. Создаем модель
model = BertForSequenceClassification.from_pretrained('ruBert-base-cased', num_labels=len(label_map)).to("cuda")

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

KeyError: "['Genri', 'Simak', 'Strugatskie', 'Bulgakov', 'Fry', 'Bradbury']"