# Phase 2 - Training with Unsloth + QLoRA (Offline Version)

**Objective**: Fine-tune Qwen2.5-Coder-0.5B-Instruct with QLoRA on a fill-in-the-middle (FIM) dataset.

**Environment**: Offline GPU machine

In [None]:
import torch
import numpy as np

# Report library versions and whether a CUDA device is visible before any
# heavy work starts.
print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")

has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")

if not has_cuda:
    print("WARNING: No GPU detected!")
else:
    # Device 0 is the only device inspected here.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 so the figure is shown in decimal GB.
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

In [None]:
# === CONFIG: Update paths as needed ===
MODEL_PATH = '/app/models/Qwen2.5-Coder-0.5B-Instruct'  # Pre-downloaded model
TRAIN_PATH = './split_data/train.jsonl'
VAL_PATH = './split_data/val.jsonl'
OUTPUT_DIR = './outputs'
FINAL_MODEL_DIR = './final_model'

import os

# Fail fast if an input file is missing. An explicit raise is used instead of
# `assert` because assertions are removed when Python runs with -O, which
# would silently skip this check.
for _label, _path in (("Train", TRAIN_PATH), ("Val", VAL_PATH)):
    if not os.path.exists(_path):
        raise FileNotFoundError(f"{_label} file not found: {_path}")
print("✓ Data files found")

In [None]:
from unsloth import FastLanguageModel

# Loading settings. max_seq_length is also passed to the trainer later on.
max_seq_length = 2048
# NOTE(review): dtype=None presumably lets Unsloth choose the dtype itself —
# confirm against the Unsloth documentation.
dtype = None
load_in_4bit = True

print(f"Loading model from {MODEL_PATH}...")

# Collect the loader arguments in one place, then load model and tokenizer
# from the local MODEL_PATH.
load_kwargs = {
    "model_name": MODEL_PATH,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

print("✓ Model loaded!")

In [None]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 16, alpha 16, dropout 0.05,
# no bias terms) and rebind `model` to the wrapped result.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

print("✓ LoRA configured!")

In [None]:
from datasets import load_dataset

print("Loading datasets...")

# Load the train file first, then the validation file, with identical loader
# arguments.
# NOTE(review): split='train' is requested for the validation file too;
# presumably a bare data_files path is always exposed under 'train' — confirm
# against the datasets documentation.
train_full, val_ds = (
    load_dataset('json', data_files=jsonl_path, split='train')
    for jsonl_path in (TRAIN_PATH, VAL_PATH)
)

# Use all training data
train_ds = train_full

for _split_label, _split in (("Train", train_ds), ("Val", val_ds)):
    print(f"{_split_label}: {len(_split):,} samples")

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Query bf16 support once; exactly one of fp16 / bf16 is enabled below.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    seed=42,
    report_to="none",
    # Schedule: 3 epochs; per-device batch of 4 with 2 accumulation steps.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Optimizer and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Mixed precision.
    bf16=use_bf16,
    fp16=not use_bf16,
    # Logging, checkpointing, evaluation.
    logging_steps=50,
    save_strategy="no",
    eval_strategy="no",
    # Data loading.
    # NOTE(review): group_by_length is combined with packing=True below;
    # it may have no effect on packed sequences — confirm.
    dataloader_num_workers=4,
    group_by_length=True,
)

# NOTE(review): eval_dataset is supplied although eval_strategy is "no", so
# val_ds is presumably not evaluated during training — confirm this is
# intended.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=True,
)

print("✓ Trainer initialized!")

In [None]:
# Run fine-tuning with the trainer configured above. The value returned by
# train() is kept in trainer_stats for later inspection.
print("Starting training...")
trainer_stats = trainer.train()
print("✓ Training complete!")

In [None]:
import os
import shutil

# Write the model and tokenizer into FINAL_MODEL_DIR, creating it if needed.
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably saves only the LoRA adapter weights — confirm.
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)
print(f"✓ Model saved to {FINAL_MODEL_DIR}")

# Zip for backup. The archive is named after FINAL_MODEL_DIR (rather than a
# hard-coded 'final_model') so the zip and the message below stay correct if
# the config path changes. It is still written to the working directory.
archive_base = os.path.basename(os.path.normpath(FINAL_MODEL_DIR)) or 'final_model'
archive_path = shutil.make_archive(archive_base, 'zip', FINAL_MODEL_DIR)
# make_archive returns the path of the archive it created; only the file name
# is printed.
print(f"✓ Created {os.path.basename(archive_path)}")