In [1]:
%pip install accelerate -U
%pip install datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

In [3]:
dataset = pd.read_excel('dataset.xlsx', index_col=0)

In [4]:
import re

def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Удаление ссылок
    text = re.sub(r'<.*?>', '', text)    # Удаление HTML-тегов
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9\s]', '', text)  # Удаление специальных символов
    return text.strip()
dataset['model_answer'] = dataset['model_answer'].apply(clean_text)
dataset['web_results'] = dataset['web_results'].apply(clean_text)

In [5]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

train_dataset, test_dataset = train_test_split(dataset, test_size=0.3, random_state=42)
test_dataset, val_dataset = train_test_split(dataset, test_size=0.5, random_state=42)
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)
val_dataset = Dataset.from_pandas(val_dataset)

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Alternate-Tokenizer")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_and_prepare(examples):
    inputs = tokenizer(examples['question'], truncation=True, padding='max_length', max_length=512)
    targets = tokenizer(examples['model_answer'], truncation=True, padding='max_length', max_length=512)
    inputs['labels'] = targets['input_ids']
    return inputs
train_dataset = train_dataset.map(tokenize_and_prepare, batched=True)
test_dataset = test_dataset.map(tokenize_and_prepare, batched=True)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

In [7]:
from transformers import TrainingArguments
from transformers import Trainer, AutoModelForCausalLM
import torch
training_args = TrainingArguments(
    output_dir='./finetuned-llama',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    eval_strategy='steps',
    eval_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)
MODEL_NAME = "IlyaGusev/saiga_llama3_8b"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, 
    torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
from transformers import TrainerCallback
class LoggingCallback(TrainerCallback):
    def __init__(self):
        self.logs = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            self.logs.append(logs)

logging_callback = LoggingCallback()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[logging_callback],
)

# Запуск обучения
trainer.train()


In [None]:
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

trainer.evaluate()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

logs_df = pd.DataFrame(logging_callback.logs)

epochs = logs_df['epoch']
eval_loss = logs_df['eval_loss']

plt.figure(figsize=(10, 6))
plt.plot(epochs, eval_loss, marker='o', linestyle='-', color='b', label='Eval Loss')

plt.title('Evaluation Loss vs. Epoch')
plt.xlabel('Epoch')
plt.ylabel('Eval Loss')
plt.legend()
plt.grid(True)
plt.show()