# Supervised Fine-Tuning (SFT) with LoRA/QLoRA

This notebook covers:
- Loading base model and dataset
- Configuring LoRA/QLoRA for efficient training
- Training with the Hugging Face Trainer API
- Evaluation and metrics
- Saving and merging adapters

In [None]:
import os
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
import wandb
import matplotlib.pyplot as plt

# Report the accelerator this session will run on.
# The per-device queries are guarded: on a CPU-only machine
# torch.cuda.get_device_properties(0) raises instead of returning a value.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_memory_gb:.2f} GB")
else:
    print("GPU: None")
    print("GPU Memory: N/A")

## Configuration

In [None]:
# ---------------------------------------------------------------------------
# Paths and base checkpoint
# ---------------------------------------------------------------------------
# Alternative base checkpoints: "Qwen/Qwen2.5-14B", "meta-llama/Llama-3.1-8B"
BASE_MODEL = "deepseek-ai/DeepSeek-V3-Base"
DATASET_PATH = "../data/processed/sft_dataset"
OUTPUT_DIR = "../models/sft_lora"

# ---------------------------------------------------------------------------
# LoRA adapter settings
# ---------------------------------------------------------------------------
LORA_R = 16       # adapter rank
LORA_ALPHA = 32   # scaling factor; set to 2x the rank here
LORA_DROPOUT = 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# ---------------------------------------------------------------------------
# Training hyperparameters
# ---------------------------------------------------------------------------
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
MAX_SEQ_LENGTH = 2048
WARMUP_RATIO = 0.03
WEIGHT_DECAY = 0.01

# ---------------------------------------------------------------------------
# Precision / memory switches
# ---------------------------------------------------------------------------
# The model-loading cell checks USE_4BIT first, so it wins if both are True.
USE_8BIT = False  # 8-bit quantization
USE_4BIT = True   # 4-bit quantization (the path the loader labels "QLoRA")
USE_GRADIENT_CHECKPOINTING = True
USE_FP16 = not USE_4BIT  # fp16 mixed precision is switched off in 4-bit mode
USE_BF16 = False  # enable on GPUs with BF16 support (e.g. A100, H100)

# Make sure the output location exists before anything tries to write to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Load Dataset

In [None]:
# Read the preprocessed dataset from disk and pull out the two splits that
# the rest of the notebook trains and evaluates on.
dataset = load_from_disk(DATASET_PATH)
train_dataset, val_dataset = dataset["train"], dataset["validation"]

# Quick sanity report: split sizes and the column schema of the training split.
for split_label, split in (("Training", train_dataset), ("Validation", val_dataset)):
    print(f"{split_label} samples: {len(split)}")
print(f"\nDataset features: {train_dataset.features}")

## Load Base Model and Tokenizer

In [None]:
# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# When the checkpoint defines no padding token, reuse EOS (both the token
# string and its id) so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token, tokenizer.pad_token_id = (
        tokenizer.eos_token,
        tokenizer.eos_token_id,
    )

print(f"Tokenizer vocab size: {len(tokenizer)}")
for token_label, token_value in (
    ("Pad", tokenizer.pad_token),
    ("EOS", tokenizer.eos_token),
):
    print(f"{token_label} token: {token_value}")

In [None]:
# Load the base model, optionally quantized through bitsandbytes.
# device_map="auto" lets accelerate decide where the weights are placed.
model_kwargs = {"device_map": "auto"}

if USE_4BIT or USE_8BIT:
    from transformers import BitsAndBytesConfig

if USE_4BIT:
    # NF4 with double quantization; compute dtype follows the BF16 switch.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if USE_BF16 else torch.float16,
        bnb_4bit_use_double_quant=True
    )
    model_kwargs["quantization_config"] = bnb_config  # type: ignore
    print("Using 4-bit quantization (QLoRA)")

elif USE_8BIT:
    # Pass 8-bit settings through a quantization config as well; the bare
    # `load_in_8bit=True` keyword to from_pretrained is deprecated.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    model_kwargs["quantization_config"] = bnb_config  # type: ignore
    print("Using 8-bit quantization")

# Load model
# NOTE: trust_remote_code=True executes Python code shipped with the model
# repository; only use it with checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    **model_kwargs,
    trust_remote_code=True
)

# Prepare model for k-bit training if using quantization. The helper enables
# gradient checkpointing by default, so forward the notebook's own switch to
# keep USE_GRADIENT_CHECKPOINTING authoritative.
if USE_4BIT or USE_8BIT:
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing=USE_GRADIENT_CHECKPOINTING,
    )

print(f"\nModel loaded: {BASE_MODEL}")
# The count is divided by 1e9, so label the figure as billions.
print(f"Model parameters: {model.num_parameters() / 1e9:.2f}B")

## Configure LoRA

In [None]:
# Describe the LoRA adapters using the constants from the configuration cell.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the base model with the adapters described above.
model = get_peft_model(model, lora_config)

# Count trainable vs. total parameters for the summary below.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
trainable_percent = 100 * trainable_params / total_params

# These must be f-strings; without the prefix the placeholders are printed
# literally (e.g. "Rank: {LORA_R}") instead of the values.
print("\nLoRA Configuration:")
print(f"Rank: {LORA_R}")
print(f"Alpha: {LORA_ALPHA}")
print(f"Dropout: {LORA_DROPOUT}")
print(f"Target modules: {LORA_TARGET_MODULES}")
print(f"\nTrainable parameters: {trainable_params:,} ({trainable_percent:.2f}%)")
print(f"Total parameters: {total_params:,}")

# PEFT's own summary of the same counts.
model.print_trainable_parameters()

## Training Configuration

In [None]:
# Training arguments, assembled from the constants in the configuration cell.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,

    # Optimization
    fp16=USE_FP16,
    bf16=USE_BF16,
    gradient_checkpointing=USE_GRADIENT_CHECKPOINTING,
    optim="paged_adamw_8bit" if USE_4BIT else "adamw_torch",

    # Logging and saving (save and eval share the same 500-step cadence)
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    save_total_limit=3,

    # Evaluation: keep the checkpoint with the lowest eval loss
    eval_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Other
    report_to="none",  # Change to "wandb" if using Weights & Biases
    remove_unused_columns=False,
    ddp_find_unused_parameters=False if USE_GRADIENT_CHECKPOINTING else None,
)

# Per-device effective batch size and a rough step estimate. Floor division
# ignores a partial final batch, so treat the step count as an approximation.
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
estimated_total_steps = len(train_dataset) // effective_batch_size * NUM_EPOCHS

print(f"\nEffective batch size: {effective_batch_size}")
print(f"Total training steps: {estimated_total_steps}")


In [None]:
# Early stopping: patience of 3 evaluations, minimum improvement of 0.001.
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=3,
)

# Batch builder for causal language modeling (mlm=False turns off the
# masked-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Initialize Trainer

In [None]:
# Collect everything the Trainer needs, then construct it in a single call.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "data_collator": data_collator,
    "callbacks": [early_stopping],
}
trainer = Trainer(**trainer_kwargs)

print("Trainer initialized successfully")

## Start Training

In [None]:
# Run the fine-tuning loop; trainer.train() returns an object whose
# `.metrics` dict is summarised below.
print("Starting training...\n")
train_result = trainer.train()
train_metrics = train_result.metrics

# Summary banner followed by the headline numbers.
banner = "=" * 50
print(f"\n{banner}")
print("Training completed!")
print(banner)
print(f"\nTraining time: {train_metrics['train_runtime']:.2f} seconds")
print(f"Training loss: {train_metrics['train_loss']:.4f}")
print(f"Training samples/second: {train_metrics['train_samples_per_second']:.2f}")

## Evaluation

In [None]:
# Score the model on the validation split passed to the Trainer.
eval_results = trainer.evaluate()

# Print every reported metric, formatted to four decimal places.
print("\nEvaluation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {format(metric_value, '.4f')}")

## Save Model and Adapters

In [None]:
import json

# Write the adapter weights and the tokenizer files into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

print(f"\nLoRA adapters saved to: {OUTPUT_DIR}")

# Record the headline results next to the run's key hyperparameters.
run_metrics = train_result.metrics
metrics = {
    "train_loss": run_metrics['train_loss'],
    "train_runtime": run_metrics['train_runtime'],
    "eval_loss": eval_results['eval_loss'],
    "base_model": BASE_MODEL,
    "lora_r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "learning_rate": LEARNING_RATE,
    "num_epochs": NUM_EPOCHS,
    # Stored as the effective batch size (per-device batch x accumulation).
    "batch_size": BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
}

metrics_path = os.path.join(OUTPUT_DIR, 'training_metrics.json')
with open(metrics_path, 'w') as f:
    f.write(json.dumps(metrics, indent=2))

print("Training metrics saved")

## Merge and Save Full Model (Optional)

In [None]:
# Optional: fold the LoRA weights into the base model for adapter-free inference.
# NOTE(review): when the base model was loaded quantized (USE_4BIT/USE_8BIT),
# merging happens against quantized weights. Confirm this is acceptable, or
# reload the base model unquantized before merging.
MERGE_AND_SAVE = False  # flip to True to write a merged checkpoint

if not MERGE_AND_SAVE:
    print("Skipping model merging (use LoRA adapters for inference)")
else:
    print("Merging LoRA adapters with base model...")

    # `model` is rebound to the merged model from here on.
    merged_output_dir = f"{OUTPUT_DIR}_merged"
    model = model.merge_and_unload()

    # Save the merged weights together with the tokenizer files.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(merged_output_dir)

    print(f"Merged model saved to: {merged_output_dir}")

## Quick Inference Test

In [None]:
# Smoke-test the fine-tuned model on a few prompts.
from transformers import pipeline

# Switch to inference mode so dropout (including LoRA dropout) is disabled
# while sampling.
model.eval()

# Create text generation pipeline.
# No `device=` argument: the model was loaded with device_map="auto", so
# accelerate already placed it, and asking the pipeline to move it conflicts
# with that placement.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Test prompts
test_prompts = [
    "Tell me about Elio's first day at the Communiverse.",
    "What makes Glordon such a mysterious character?",
    "Describe the relationship between Elio and Ambassador Questa."
]

print("\nInference Tests:")
print("="*60)

for i, prompt in enumerate(test_prompts, 1):
    print(f"\nTest {i}:")
    print(f"Prompt: {prompt}")
    print("Response: ", end="")

    # max_new_tokens bounds only the continuation; max_length would also
    # count the prompt tokens and shrink (or eliminate) the response for
    # long prompts.
    result = generator(
        prompt,
        max_new_tokens=200,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True
    )

    # The pipeline returns prompt + continuation; strip the prompt prefix.
    print(result[0]['generated_text'][len(prompt):])
    print("-" * 60)