# ShohnomaLLM - Обучение модели

Fine-tuning Qwen2.5-1.5B-Instruct (LoRA-адаптеры поверх модели, загруженной в 4-bit) для генерации таджикских стихов.

**Требования:**
- GPU: T4 (15GB VRAM) или лучше
- Данные в Google Drive (из notebook 01)

**Время обучения:** ~2-4 часа на T4

In [None]:
# Check the GPU: run nvidia-smi through the notebook's shell escape
!nvidia-smi

In [None]:
# Установка Unsloth (оптимизированный fine-tuning)
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps trl peft accelerate bitsandbytes

In [None]:
# Mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Project layout on the mounted Drive: datasets under data/, saved models under models/
PROJECT_DIR = "/content/drive/MyDrive/ShohnomaLLM"
DATA_DIR = f"{PROJECT_DIR}/data"
MODEL_DIR = f"{PROJECT_DIR}/models"

## 1. Подготовка данных

In [None]:
import json
import random
from pathlib import Path

# System prompt (in Tajik). Roughly: "You are a Tajik poet. You can write
# classical poems (rubai, ghazal, qasida) and free verse."
SYSTEM_PROMPT = """Ту шоири тоҷикӣ ҳастӣ. Ту метавонӣ шеърҳои классикӣ (рубоӣ, ғазал, қасида) ва шеърҳои озод бинависӣ."""

# User prompts per poetic form. format_example picks one at random for the
# poem's 'form' value and falls back to the 'other' list for unknown forms.
PROMPTS = {
    'rubaiyat': ['Рубоӣ бинавис', 'Як рубоӣ эҷод кун', 'Чор мисраъ бинавис'],
    'ghazal': ['Ғазал бинавис', 'Ғазали ошиқона эҷод кун'],
    'qasida': ['Қасида бинавис'],
    'masnavi': ['Маснавӣ бинавис'],
    'other': ['Шеър бинавис', 'Шеъри зебо эҷод кун'],
}

def format_example(poem):
    """Render one poem record as a single ChatML training string.

    Args:
        poem: Mapping with the poem body under 'text_tajik' (preferred) or
            'text', and optionally a 'form' key that selects the prompt list.

    Returns:
        A dict with one key, 'text', holding the system, user and assistant
        turns wrapped in <|im_start|> / <|im_end|> markers.
    """
    body = poem.get('text_tajik') or poem.get('text', '')
    candidates = PROMPTS.get(poem.get('form', 'other'), PROMPTS['other'])
    instruction = random.choice(candidates)

    # One (role, content) pair per ChatML turn, in conversation order.
    turns = (
        ('system', SYSTEM_PROMPT),
        ('user', instruction),
        ('assistant', body),
    )
    rendered = '\n'.join(
        f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in turns
    )
    return {'text': rendered}

# Load and format the data
def load_and_format(input_path):
    """Read a JSONL file of poems and turn them into training examples.

    Args:
        input_path: Path to a UTF-8 JSON Lines file, one poem object per line.

    Returns:
        A list with one format_example() result for every poem whose text
        ('text_tajik', falling back to 'text') is longer than 20 characters.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing newline at the end of the
            # file) are not records; json.loads would raise on them.
            if not line.strip():
                continue
            poem = json.loads(line)
            # The trailing `or ''` covers records whose text field is present
            # but null, which would otherwise break len() below.
            text = poem.get('text_tajik') or poem.get('text') or ''
            if len(text) > 20:
                examples.append(format_example(poem))
    return examples

# Load the data.
# Seed first: format_example picks each user prompt with random.choice, so
# without a fixed seed the generated dataset would differ between runs.
random.seed(42)
data_file = f"{DATA_DIR}/raw/ganjoor/all_classical.jsonl"
examples = load_and_format(data_file)

print(f"Загружено примеров: {len(examples)}")
# Stop with a clear message instead of an IndexError on examples[0] below.
if not examples:
    raise ValueError(f"Не найдено ни одного подходящего примера в {data_file}")
print(f"\nПример:\n{examples[0]['text'][:500]}")

In [None]:
# Train/validation split: shuffle with a fixed seed, hold out the first 10%
random.seed(42)
random.shuffle(examples)

val_size = int(len(examples) * 0.1)
train_examples = examples[val_size:]
val_examples = examples[:val_size]

print(f"Train: {len(train_examples)}")
print(f"Val: {len(val_examples)}")


def _write_jsonl(path, records):
    """Write records to path as JSON Lines, one object per line."""
    # Explicit UTF-8: ensure_ascii=False writes the Cyrillic text as-is, which
    # a non-UTF-8 platform default encoding could fail to encode. This also
    # matches the encoding load_and_format uses when reading.
    with open(path, 'w', encoding='utf-8') as f:
        for record in records:
            json.dump(record, f, ensure_ascii=False)
            f.write('\n')


# Save both splits
Path(f"{DATA_DIR}/training").mkdir(parents=True, exist_ok=True)
_write_jsonl(f"{DATA_DIR}/training/train.jsonl", train_examples)
_write_jsonl(f"{DATA_DIR}/training/val.jsonl", val_examples)

## 2. Загрузка модели

In [None]:
from unsloth import FastLanguageModel
import torch

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# Sequence length limit; the same value is passed to SFTTrainer as max_seq_length
MAX_SEQ_LENGTH = 512

# Load the model with 4-bit quantization
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # Auto-detect
    load_in_4bit=True,
)

print(f"Модель загружена: {MODEL_NAME}")

In [None]:
# Attach LoRA adapters to the loaded model
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # LoRA rank
    # Names of the modules that receive adapters
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=64,  # twice the rank r=32
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,  # same seed value as the data split and the trainer
)

print("LoRA адаптеры добавлены")
model.print_trainable_parameters()

## 3. Обучение

In [None]:
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Load the train/validation JSONL files written by the split cell
dataset = load_dataset(
    'json',
    data_files={
        'train': f"{DATA_DIR}/training/train.jsonl",
        'validation': f"{DATA_DIR}/training/val.jsonl",
    }
)

print(f"Train: {len(dataset['train'])}")
print(f"Val: {len(dataset['validation'])}")

In [None]:
# Training setup
import torch  # also imported in the model-loading cell; repeated so this cell runs on its own

# bfloat16 needs native hardware support (compute capability 8.0 or newer).
# The T4 this notebook targets is older, so hard-coding bf16=True fails there;
# pick the mixed-precision mode from the actual device instead.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    dataset_text_field="text",  # key produced by format_example
    max_seq_length=MAX_SEQ_LENGTH,
    args=TrainingArguments(
        output_dir="./outputs",

        # Batch size: 4 per device x 8 accumulation steps = 32 sequences per update
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,

        # Learning rate
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,

        # Epochs
        num_train_epochs=3,

        # Optimization: exactly one of fp16/bf16 is enabled
        fp16=not use_bf16,
        bf16=use_bf16,
        optim="adamw_8bit",
        weight_decay=0.01,
        max_grad_norm=1.0,

        # Logging
        logging_steps=10,
        eval_steps=100,
        # NOTE(review): newer transformers releases rename this argument to
        # eval_strategy; confirm against the installed version.
        evaluation_strategy="steps",

        # Saving: load_best_model_at_end requires the save strategy to match
        # the evaluation strategy and save_steps to be a multiple of
        # eval_steps, so checkpoints follow the same 100-step schedule.
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        load_best_model_at_end=True,

        # Other
        seed=42,
        report_to="none",
    ),
)

In [None]:
# Start training with the trainer configured in the previous cell
print("Начало обучения...")
trainer.train()

## 4. Сохранение модели

In [None]:
# Save the LoRA adapters and the tokenizer to the models directory on Drive
lora_path = f"{MODEL_DIR}/tajik-poetry-lora"
model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)
print(f"LoRA сохранён: {lora_path}")

In [None]:
# Merge the LoRA weights into the base model and save the result
merged_path = f"{MODEL_DIR}/tajik-poetry-1.5b"

model.save_pretrained_merged(
    merged_path,
    tokenizer,
    save_method="merged_16bit",
)
print(f"Merged модель сохранена: {merged_path}")

## 5. Тестирование

In [None]:
# Switch the model to inference mode before generating
FastLanguageModel.for_inference(model)

def generate_poem(prompt, max_tokens=256, temperature=0.8):
    """Generate a poem for a user prompt with the fine-tuned model.

    Args:
        prompt: User request, sent together with SYSTEM_PROMPT.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.

    Returns:
        The decoded completion only: the prompt tokens are cut off and
        special tokens are skipped.
    """
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    rendered = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer(rendered, return_tensors="pt").to(model.device)

    sampling = dict(
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
    )
    generated = model.generate(**encoded, **sampling)

    # The output sequence starts with the prompt tokens; keep only what
    # follows them.
    prompt_len = encoded.input_ids.shape[1]
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

In [None]:
# Generate a sample poem for each test prompt
prompts = [
    "Рубоӣ бинавис",
    "Ғазали ошиқона эҷод кун",
    "Шеър дар бораи баҳор бинавис",
]

# Build the divider once instead of on every print
separator = '=' * 50

for prompt in prompts:
    print("\n" + separator)
    print(f"Запрос: {prompt}")
    print(separator)
    poem = generate_poem(prompt)
    print(poem)

## 6. Экспорт в GGUF (опционально)

Для запуска модели на CPU через llama.cpp.

In [None]:
# Export to GGUF (4-bit quantization)
gguf_path = f"{MODEL_DIR}/tajik-poetry-1.5b-q4_k_m.gguf"

# NOTE(review): the save call receives gguf_path with the '.gguf' suffix
# removed, while the message below prints the path with the suffix. If
# save_pretrained_gguf treats its first argument as an output directory, the
# printed path is not where the file actually lands; confirm and adjust.
model.save_pretrained_gguf(
    gguf_path.replace('.gguf', ''),
    tokenizer,
    quantization_method="q4_k_m",
)
print(f"GGUF сохранён: {gguf_path}")

In [None]:
# Closing summary: where the models were saved and how to use them
banner = "=" * 50
summary_lines = (
    "\n" + banner,
    "Обучение завершено!",
    banner,
    f"\nМодели сохранены в: {MODEL_DIR}",
    "\nДля использования:",
    "1. Скачайте папку models/ из Google Drive",
    "2. Используйте inference/generator.py",
)
for summary_line in summary_lines:
    print(summary_line)