In [1]:
import json

# unsloth recommends being imported before transformers/peft so its patches apply
from unsloth import FastLanguageModel

import bitsandbytes as bnb
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm

In [2]:


# Configuration
MAX_SEQ_LENGTH = 1024 # Cap the sequence length to save VRAM on a T4
DTYPE = None # Auto detection
LOAD_IN_4BIT = True # Required on a T4

class ABSADataset(Dataset):
    """Instruction-tuning dataset backed by a JSONL file of Alpaca-style records.

    Every record must provide the keys ``instruction``, ``input`` and ``output``.
    Records are tokenized lazily in ``__getitem__`` and returned as tensors of
    length ``max_length`` whose ``labels`` cover only the response tokens.

    Args:
        file_path: Path to a UTF-8 JSONL file with one JSON object per line.
        tokenizer: Hugging Face style callable tokenizer. It must expose
            ``eos_token`` and return ``input_ids`` / ``attention_mask`` tensors.
        max_length: Length every sample is padded or truncated to.
    """

    # Label value skipped by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, file_path, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load the records; blank lines (e.g. a trailing newline) are skipped
        # instead of crashing json.loads.
        with open(file_path, 'r', encoding='utf-8') as f:
            self.data = [json.loads(line) for line in f if line.strip()]

        # Prompt template (Alpaca format)
        self.prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and build loss labels for its response.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``. ``labels`` equals ``input_ids`` on the response tokens
            (including the end-of-sequence token) and ``IGNORE_INDEX`` on the
            prompt and on padding.
        """
        item = self.data[idx]

        # 1. Build the full text (prompt + answer + EOS)
        full_prompt = self.prompt_template.format(
            instruction=item['instruction'],
            input=item['input']
        )
        full_text = full_prompt + item['output'] + self.tokenizer.eos_token

        # 2. Tokenize the full text to a fixed length
        encoded = self.tokenizer(
            full_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoded["input_ids"][0]
        attention_mask = encoded["attention_mask"][0]

        # 3. Tokenize the prompt alone to learn how many tokens it occupies
        prompt_len = self.tokenizer(
            full_prompt,
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )["input_ids"][0].shape[0]

        # 4. Labels start as a copy of the inputs
        labels = input_ids.clone()

        # Mask padding by position rather than by token id: when the pad token
        # is the EOS token, an id comparison would also hide the genuine EOS
        # that terminates the answer, and the model would never learn to stop.
        labels[attention_mask == 0] = self.IGNORE_INDEX

        # With left padding the prompt starts after the pad tokens, so the
        # prompt mask has to be shifted by the number of padded positions.
        if getattr(self.tokenizer, "padding_side", "right") == "left":
            prompt_start = int((attention_mask == 0).sum())
        else:
            prompt_start = 0

        # Exclude the prompt (instruction + input) from the loss
        labels[prompt_start:prompt_start + prompt_len] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [None]:
# The dataset is built in the training cell, after the tokenizer has been
# loaded. Constructing it in this cell referenced `tokenizer` before it was
# defined, which raises NameError on Restart Kernel -> Run All.
# NOTE(review): this cell read "../notebooks/train_dataset.jsonl" while the
# training cell reads "train_dataset.jsonl" - confirm which path is intended.

In [None]:
# 1. Load the base model together with its tokenizer
base_model_kwargs = dict(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# Padding setup: reuse the EOS token as the pad token
# (per the original note, Llama-3 has no pad token by default)
tokenizer.pad_token = tokenizer.eos_token

# 2. Attach the LoRA adapters to the attention and MLP projections
lora_kwargs = dict(
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,  # saves VRAM
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

# Report how many parameters are trainable
model.print_trainable_parameters()

In [None]:
# Hyperparameters
BATCH_SIZE = 2      # Small physical batch for a T4
GRAD_ACCUM_STEPS = 4 # Effective batch = 2 * 4 = 8
EPOCHS = 3
LEARNING_RATE = 2e-4

# Data preparation
dataset = ABSADataset("train_dataset.jsonl", tokenizer, MAX_SEQ_LENGTH)

# Seeded generator so the split and the shuffling order are reproducible
# across kernel restarts.
DATA_SEED = 42
data_rng = torch.Generator().manual_seed(DATA_SEED)

# Train/validation split (90/10)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=data_rng,
)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=data_rng,
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer: 8-bit AdamW to save memory. Only parameters that require
# gradients are handed over; the rest never receive a gradient.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = bnb.optim.AdamW8bit(trainable_params, lr=LEARNING_RATE)

# Per-epoch loss log
history = {'train_loss': [], 'val_loss': []}


def _forward_loss(batch):
    """Move one batch to the GPU, run the model and return its loss tensor."""
    outputs = model(
        batch['input_ids'].to("cuda"),
        attention_mask=batch['attention_mask'].to("cuda"),
        labels=batch['labels'].to("cuda"),
    )
    return outputs.loss


print("Начинаем обучение...")

# Batches per epoch, and where the incomplete accumulation group (if any) starts
num_train_batches = len(train_loader)
tail_size = num_train_batches % GRAD_ACCUM_STEPS
tail_start = num_train_batches - tail_size

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]")

    # --- TRAINING PHASE ---
    for step, batch in enumerate(progress_bar):
        batch_loss = _forward_loss(batch)

        # Scale by the size of the accumulation group this batch belongs to,
        # so the last (shorter) group is averaged over its real batch count.
        group_size = GRAD_ACCUM_STEPS if step < tail_start else tail_size
        (batch_loss / group_size).backward()

        # Step after every full group and also after the final batch of the
        # epoch. Otherwise the tail gradients leak into the next epoch and
        # are silently dropped after the last one.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % GRAD_ACCUM_STEPS == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        batch_loss_value = batch_loss.item()
        total_train_loss += batch_loss_value
        progress_bar.set_postfix({'loss': batch_loss_value})

    avg_train_loss = total_train_loss / num_train_batches
    history['train_loss'].append(avg_train_loss)

    # --- VALIDATION PHASE ---
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            total_val_loss += _forward_loss(batch).item()

    avg_val_loss = total_val_loss / len(val_loader)
    history['val_loss'].append(avg_val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

# Save the adapters and the tokenizer side by side in one directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model")
print("Обучение завершено и адаптеры сохранены.")

In [None]:
def plot_training_history(history):
    """Plot the per-epoch training and validation loss on a single figure.

    Args:
        history: Mapping with the keys 'train_loss' and 'val_loss', each a
            sequence holding one loss value per epoch.
    """
    epoch_axis = range(1, len(history['train_loss']) + 1)
    curves = (
        ('train_loss', 'b-o', 'Training Loss'),
        ('val_loss', 'r-o', 'Validation Loss'),
    )

    _, ax = plt.subplots(figsize=(10, 6))
    for key, line_format, legend_label in curves:
        ax.plot(epoch_axis, history[key], line_format, label=legend_label)

    ax.set_title('Динамика обучения (CrossEntropy Loss)')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)
    plt.show()

plot_training_history(history)