In [16]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import os


# Проверка доступности CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Загрузка данных
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')
df_test.drop(columns=['Unnamed: 0'], inplace=True)

# Токенизация
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Преобразование классов
class_values = {cls: index for index, cls in enumerate(df_train['class'].unique())}
df_train['class_num'] = df_train['class'].map(class_values)

# Класс для создания датасета
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(device)
        return item

    def __len__(self):
        return len(self.labels)

# Создание датасета
tokenized_data = tokenizer(df_train['text'].tolist(), padding='max_length', truncation=True, max_length=128, return_tensors='pt')
dataset = TextDataset(tokenized_data, df_train['class_num'].tolist())

# Разделение данных
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Вычисление весов классов
class_weights = compute_class_weight('balanced', classes=np.unique(df_train['class_num']), y=df_train['class_num'])
class_weights_dict = {i: torch.tensor(class_weights[i]).to(device) for i in range(len(class_values))}

# Определение модели с кастомной функцией потерь
def model_init():
    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(class_values))
    
    # Переопределение функции потерь модели
    def custom_loss(logits, labels):
        loss_fct = CrossEntropyLoss(weight=torch.tensor(list(class_weights_dict.values())).to(device))
        return loss_fct(logits.view(-1, model.num_labels), labels.view(-1))

    model.loss_fct = custom_loss
    return model.to(device)

# Определение функции вычисления метрик
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1}


# Создание директории для логирования, если она не существует
log_dir = './logs'

# Проверяем, не существует ли файла с таким же именем
if os.path.exists(log_dir) and not os.path.isdir(log_dir):
    raise Exception(f"A file named 'logs' exists in the current directory. Please remove or rename this file.")

# Создаем директорию, если она еще не существует
if not os.path.exists(log_dir):
    os.makedirs(log_dir, exist_ok=True)

# Тренировочные аргументы с обновленной директорией для логирования
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    logging_dir=log_dir,  # Обновленный путь
    load_best_model_at_end=True,
    save_strategy='epoch',
)

# Тренер
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Обучение
trainer.train()

# Оценка
trainer.evaluate()

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FailedPreconditionError: ./logs is not a directory