In [1]:
import pandas as pd
import os
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline

# Дополнительные библиотеки (по желанию)
from sklearn.model_selection import train_test_split  # Для разделения данных на обучающую и тестовую выборки
import matplotlib.pyplot as plt  # Для визуализации
import seaborn as sns  # Для более красивой визуализации (по желанию)

from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, Trainer, TrainingArguments, BertForMaskedLM
from datasets import Dataset as HFDataset

from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding





In [2]:
# Resolve the local "model" directory to an absolute path
model_path = os.path.abspath("model")

# Load the pretrained weights and the matching tokenizer from that directory
model = RobertaForMaskedLM.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Dataset pairing comma-free input texts with their comma-restored targets
class TextDataset(Dataset):
    """Torch dataset that tokenizes (input, output) text pairs for masked-LM training.

    Args:
        inputs: Source texts without commas. Either a pandas object indexed
            positionally through ``.iloc`` or any plain sequence (e.g. a list).
        outputs: Target texts with commas, parallel to ``inputs``.
        tokenizer: Callable tokenizer; it is invoked with ``return_tensors='pt'``
            and must return ``input_ids`` and ``attention_mask`` tensors whose
            leading batch dimension has size 1.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, inputs, outputs, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.inputs = inputs  # texts without commas (model input)
        self.outputs = outputs  # texts with commas (training target)
        self.max_length = max_length

    def __len__(self):
        """Return the number of (input, output) pairs."""
        return len(self.inputs)

    @staticmethod
    def _text_at(texts, idx):
        """Return the idx-th text positionally, for pandas objects and plain sequences alike."""
        return texts.iloc[idx] if hasattr(texts, 'iloc') else texts[idx]

    def __getitem__(self, idx):
        """Tokenize the idx-th pair.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (from the
            input text) and ``labels`` (token ids of the output text, with every
            padding position replaced by -100).
        """
        input_text = self._text_at(self.inputs, idx)
        output_text = self._text_at(self.outputs, idx)

        inputs = self.tokenizer(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        outputs = self.tokenizer(output_text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)

        # Padding positions of the target must not contribute to the loss:
        # -100 is the label index the Hugging Face cross-entropy loss ignores.
        target_mask = outputs['attention_mask'].squeeze(0)
        labels = outputs['input_ids'].squeeze(0).masked_fill(target_mask == 0, -100)

        # squeeze(0) drops the batch dimension added by return_tensors='pt'
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels
        }

# Read the labelled pairs from CSV and renumber the index from zero
labeled_dataset = pd.read_csv('labeled_dataset.csv', sep=',').reset_index(drop=True)

# First cut: 80% for training, the remaining 20% is held out for validation + test
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    labeled_dataset['input'], labeled_dataset['output'], test_size=0.2, random_state=42,
)

# Second cut: halve the hold-out, i.e. 10% validation and 10% test overall
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42,
)

# Wrap every split in a TextDataset that shares the same tokenizer
train_dataset, val_dataset, test_dataset = (
    TextDataset(texts, targets, tokenizer)
    for texts, targets in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Show one training sample as a sanity check
example = train_dataset[0]
for caption, field in (
    ("Input IDs (Train):", 'input_ids'),
    ("Attention Mask (Train):", 'attention_mask'),
    ("Labels (Train):", 'labels'),
):
    print(caption, example[field])

# Collator instance handed to the Trainer for batching
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Input IDs (Train): tensor([    1,   349,    12,    21,    13,  6860,    12,    22,    13,  1118,
           12,    23,    13,  2511,   266,    12,    24,    13,   488,    12,
           25,    13, 17753,    12,    26,    13,   843,    12,    27,    13,
         7747,   275,    12,    28,    13, 32460,    12,    29,    13, 25311,
        29758,    12,  2878,    13,  2869,    12,  4290,    13,   324,    12,
         3189,    13, 24029,    12,  3784,    13, 37199,   748,    12,  3448,
           13, 28368,    12,  2975,    13,   488,    12,  3535,    13,   470,
           12,  3192,    13, 42822,    12,  2504,    13,  2444,    12,  1800,
           13,  1561,  4486,  1381,    18,    12,  2222,    13,     2,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   

In [3]:
import os
import pandas as pd
import optuna
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

# Optuna objective: deliberately few epochs and large batches to keep each trial short
def train_model(trial):
    """Fine-tune a fresh model with trial-suggested hyperparameters and return its validation loss.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log-uniform,
            1e-5..5e-4), ``per_device_train_batch_size`` (32..128) and
            ``num_train_epochs`` (1..3).

    Returns:
        The ``eval_loss`` reported by ``Trainer.evaluate()`` on ``val_dataset``.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    per_device_train_batch_size = trial.suggest_int("per_device_train_batch_size", 32, 128)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)

    # Reload the pretrained weights so no trial starts from a previous trial's state
    model = RobertaForMaskedLM.from_pretrained(model_path)

    training_args = TrainingArguments(
        # Per-trial directories so trials do not overwrite each other's checkpoints and logs
        output_dir=f'./results/trial_{trial.number}',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=64,
        # A fixed 500-step warm-up can exceed the total step count of a short trial
        # (large batch, one epoch), so the sampled learning rate would never be
        # reached; a ratio scales the warm-up with the trial length instead.
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir=f'./logs/trial_{trial.number}',
        logging_steps=10,
        # Enable fp16 mixed precision only when a CUDA device is actually present
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=1,
        learning_rate=learning_rate
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )

    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_loss"]


# Create the Optuna study; the objective is the validation loss, so it is minimised.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable,
# in line with the fixed random_state used for the data splits.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(train_model, n_trials=20)  # number of trials to run

# Print the best hyperparameters found
print("Best hyperparameters: ", study.best_params)

[I 2024-10-23 15:27:23,780] A new study created in memory with name: no-name-d4cb5c7d-2d48-4a62-92ec-435f89473d4b


  0%|          | 0/455 [00:00<?, ?it/s]

[W 2024-10-23 15:33:17,742] Trial 0 failed with parameters: {'learning_rate': 0.00044938747575382244, 'per_device_train_batch_size': 119, 'num_train_epochs': 1} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Redmi\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Redmi\AppData\Local\Temp\ipykernel_4352\3324051158.py", line 40, in train_model
    trainer.train()
  File "c:\Users\Redmi\anaconda3\Lib\site-packages\transformers\trainer.py", line 2052, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Redmi\anaconda3\Lib\site-packages\transformers\trainer.py", line 2388, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Redmi\anaconda3\Lib\site-packages\transformers\trainer.py", line 34

KeyboardInterrupt: 

In [4]:
# Report the versions of the core libraries this notebook depends on
import pandas as pd
import torch
import transformers

for _lib in (pd, torch, transformers):
    print(_lib.__version__)


1.5.3
2.3.0+cpu
4.45.2
