# Supervised Fine-Tuning (SFT) with LoRA/QLoRA - FIXED VERSION

This notebook covers:
- Loading base model and dataset from your datasets folder
- Configuring LoRA/QLoRA for efficient training
- Training with the Hugging Face `Trainer` API
- Evaluation and metrics
- Saving and merging adapters

**Updated to work with your environment and datasets**

In [None]:
import os
import sys
import torch
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
import json
from pathlib import Path

# Report whether CUDA is usable and, if so, which device 0 is and how much memory it has
_cuda_available = torch.cuda.is_available()
print(f"CUDA available: {_cuda_available}")
if not _cuda_available:
    print("WARNING: No GPU detected! Training will be very slow.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; shown here in decimal gigabytes
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## Configuration

In [None]:
# ========== PATHS ==========
# Shared dataset location
DATASETS_DIR = "/mnt/c/AI_LLM_projects/ai_warehouse/datasets"

# Bot project paths (for Communiverse data)
BOT_DATA_DIR = "/mnt/c/web-projects/elioverse-bot/data"
TRAINING_DATA_DIR = os.path.join(BOT_DATA_DIR, "training-datasets")

# Model paths
BASE_MODEL = "deepseek-ai/deepseek-llm-7b-base"  # Adjust based on what you downloaded
OUTPUT_DIR = "../models/sft_lora_communiverse"

# ========== DATASET SELECTION ==========
# Choose which datasets to use for training
USE_DATASETS = {
    'oasst2': True,          # OpenAssistant high-quality dialogues
    'ultrachat': False,      # Large scale conversations (set False to save time)
    'alpaca': True,          # Instruction following
    'firefly': True,         # Chinese + English mix
    'communiverse': True,    # Your custom Communiverse data
}

# ========== LORA CONFIGURATION ==========
LORA_R = 16  # Rank (8, 16, 32 are common)
LORA_ALPHA = 32  # Scaling factor (typically 2x rank)
LORA_DROPOUT = 0.05
LORA_TARGET_MODULES = [
    "q_proj", "v_proj", "k_proj", "o_proj",  # Attention
    "gate_proj", "up_proj", "down_proj"       # FFN
]

# ========== TRAINING HYPERPARAMETERS ==========
BATCH_SIZE = 2                    # Per device batch size
GRADIENT_ACCUMULATION_STEPS = 8   # Effective batch = 2 * 8 = 16
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
MAX_SEQ_LENGTH = 2048
WARMUP_RATIO = 0.03
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0

# ========== OPTIMIZATION SETTINGS ==========
USE_4BIT = True          # QLoRA (recommended for 24GB VRAM)
USE_8BIT = False         # Alternative for larger VRAM (only consulted when USE_4BIT is False)
USE_GRADIENT_CHECKPOINTING = True
USE_BF16 = False         # Use if you have A100/H100
# FP16 mixed precision is switched off for 4-bit runs and whenever BF16 is
# selected: TrainingArguments refuses fp16=True together with bf16=True, so the
# two flags must never both end up True.
USE_FP16 = not USE_4BIT and not USE_BF16

# ========== DATA LIMITS (for testing) ==========
MAX_TRAIN_SAMPLES = None  # Set to small number (e.g., 1000) for testing
MAX_VAL_SAMPLES = None    # Set to small number (e.g., 100) for testing

# Create the output directory up front so later cells can write into it
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"\nOutput directory: {OUTPUT_DIR}")
print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

## Helper Functions for Data Loading

In [None]:
def _load_local_or_hub(local_dir_name, hub_id):
    """Load a dataset from DATASETS_DIR if present, otherwise from the Hub.

    Args:
        local_dir_name: Directory name expected under DATASETS_DIR.
        hub_id: Hugging Face Hub dataset id used when the directory is absent.

    Returns:
        Whatever `load_from_disk` / `load_dataset` returns for that source.
    """
    dataset_path = os.path.join(DATASETS_DIR, local_dir_name)

    if os.path.exists(dataset_path):
        # NOTE(review): load_from_disk reads the `save_to_disk` layout; confirm
        # the local folders were written that way (any error propagates to the caller).
        return load_from_disk(dataset_path)

    print("  Downloading from HuggingFace...")
    return load_dataset(hub_id)


def load_oasst2_dataset():
    """Load the OpenAssistant OASST2 dataset (local copy first, Hub otherwise)."""
    print("Loading OASST2...")
    return _load_local_or_hub("OpenAssistant___oasst2", "OpenAssistant/oasst2")


def load_alpaca_dataset():
    """Load the Alpaca dataset (local copy first, Hub otherwise)."""
    print("Loading Alpaca...")
    return _load_local_or_hub("tatsu-lab___alpaca", "tatsu-lab/alpaca")


def load_firefly_dataset():
    """Load the Firefly Chinese dataset (local copy first, Hub otherwise)."""
    print("Loading Firefly...")
    return _load_local_or_hub("YeungNLP___firefly-train-1.1_m", "YeungNLP/firefly-train-1.1_m")

def load_communiverse_dataset():
    """Load the custom Communiverse training data from TRAINING_DATA_DIR.

    Looks for `communiverse_training.jsonl` first, then
    `communiverse_training.json`.

    Returns:
        The 'train' split produced by the JSON loader, or None when neither
        file exists (a warning with the expected path is printed).
    """
    print("Loading Communiverse custom data...")

    jsonl_path = os.path.join(TRAINING_DATA_DIR, "communiverse_training.jsonl")
    json_path = os.path.join(TRAINING_DATA_DIR, "communiverse_training.json")

    # First existing candidate wins; the JSON loader exposes the rows as 'train'
    for candidate in (jsonl_path, json_path):
        if os.path.exists(candidate):
            return load_dataset('json', data_files=candidate)['train']

    print(f"  WARNING: Communiverse data not found!")
    print(f"  Expected at: {jsonl_path}")
    print(f"  Run: node scripts/generate-training-data.js first!")
    return None

def format_instruction_dataset(example, dataset_name=""):
    """Convert one record from any supported dataset into a single text field.

    Recognised layouts, checked in this order:
      1. Alpaca: 'instruction' + 'output' (optional 'input')
      2. OASST2: 'text' when dataset_name == 'oasst2'
      3. Firefly: 'input' + 'target'
      4. Communiverse: 'persona' + 'dialogue'
      5. Pre-formatted: 'text'
      6. Fallback: str(example)

    A field only counts as present when its value is not None. Loading records
    with mixed schemas from one JSON file fills the missing fields with None,
    and matching on key presence alone would render those rows as the literal
    string "None".

    Args:
        example: Mapping holding one dataset row.
        dataset_name: Source name; only 'oasst2' changes the outcome.

    Returns:
        {"text": "<formatted instruction-response pair>"}
    """

    def present(*keys):
        # True when every key exists and carries an actual value
        return all(example.get(key) is not None for key in keys)

    # Alpaca format
    if present('instruction', 'output'):
        prompt = f"### Instruction:\n{example['instruction']}"
        input_text = example.get('input', '')
        if input_text:
            # The optional context block is emitted only when non-empty
            prompt += f"\n\n### Input:\n{input_text}"
        text = f"{prompt}\n\n### Response:\n{example['output']}"

    # OASST2 format: the 'text' field is used as-is
    elif present('text') and dataset_name == 'oasst2':
        text = example['text']

    # Firefly format
    elif present('input', 'target'):
        text = f"### Instruction:\n{example['input']}\n\n### Response:\n{example['target']}"

    # Communiverse format (from generate-training-data.js)
    elif present('persona', 'dialogue'):
        text = f"### Character: {example['persona']}\n\n{example['dialogue']}"

    # Already formatted
    elif present('text'):
        text = example['text']

    else:
        # Fallback: combine all fields
        text = str(example)

    return {"text": text}


print("Helper functions defined")

## Load and Combine Datasets

In [None]:
# Load selected datasets
all_datasets = []
dataset_stats = {}

# Firefly is large, so only its first rows are used
FIREFLY_MAX_SAMPLES = 50000

# (USE_DATASETS key, display name, loader, optional row cap)
_DATASET_SOURCES = [
    ('oasst2', 'OASST2', load_oasst2_dataset, None),
    ('alpaca', 'Alpaca', load_alpaca_dataset, None),
    ('firefly', 'Firefly', load_firefly_dataset, FIREFLY_MAX_SAMPLES),
    ('communiverse', 'Communiverse', load_communiverse_dataset, None),
]

for key, label, loader, row_cap in _DATASET_SOURCES:
    if not USE_DATASETS.get(key):
        continue
    # Best-effort: one failing source is reported and skipped, not fatal
    try:
        loaded = loader()
        if loaded is None:
            # The loader already printed why nothing was found
            continue

        # Loaders may hand back a DatasetDict (use its train split) or a bare Dataset
        train_ds = loaded['train'] if isinstance(loaded, DatasetDict) else loaded

        # Subset before formatting so rows that get discarded are never mapped
        if row_cap is not None and len(train_ds) > row_cap:
            train_ds = train_ds.select(range(row_cap))

        # Keep ONLY the unified "text" column: concatenate_datasets requires
        # identical features, and every source ships different raw columns.
        # A "text" value returned by the function survives remove_columns.
        train_ds = train_ds.map(
            lambda x, name=key: format_instruction_dataset(x, name),
            remove_columns=train_ds.column_names,
        )

        all_datasets.append(train_ds)
        dataset_stats[key] = len(train_ds)
        print(f"  ✓ {label}: {len(train_ds):,} samples")
    except Exception as e:
        print(f"  ✗ Failed to load {label}: {e}")

# Combine all datasets
if not all_datasets:
    raise ValueError("No datasets loaded! Please check your configuration.")

print(f"\nCombining {len(all_datasets)} datasets...")
combined_dataset = concatenate_datasets(all_datasets)

# Shuffle (fixed seed for reproducibility)
combined_dataset = combined_dataset.shuffle(seed=42)

# Split into train/validation
split_dataset = combined_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

# Apply limits if testing
if MAX_TRAIN_SAMPLES:
    train_dataset = train_dataset.select(range(min(MAX_TRAIN_SAMPLES, len(train_dataset))))
if MAX_VAL_SAMPLES:
    val_dataset = val_dataset.select(range(min(MAX_VAL_SAMPLES, len(val_dataset))))

print(f"\n{'='*60}")
print("DATASET SUMMARY")
print(f"{'='*60}")
for name, count in dataset_stats.items():
    print(f"{name:20s}: {count:>10,} samples")
print(f"{'='*60}")
print(f"{'Total':20s}: {len(combined_dataset):>10,} samples")
print(f"{'Training':20s}: {len(train_dataset):>10,} samples (95%)")
print(f"{'Validation':20s}: {len(val_dataset):>10,} samples (5%)")
print(f"{'='*60}")

# Show example (truncated to 500 characters)
print("\nExample training sample:")
sample_text = train_dataset[0]['text']
print(sample_text[:500] + "..." if len(sample_text) > 500 else sample_text)

## Load Tokenizer and Tokenize Dataset

In [None]:
# Load tokenizer
print(f"Loading tokenizer from {BASE_MODEL}...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Reuse EOS for padding when the tokenizer ships without a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Summarise the tokenizer settings that matter for training
for summary_line in (
    f"Tokenizer vocab size: {len(tokenizer):,}",
    f"Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})",
    f"EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})",
):
    print(summary_line)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of texts for causal language modeling.

    Args:
        examples: Batch mapping with a 'text' list (as passed by
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output (input_ids, attention_mask, ...), truncated to
        MAX_SEQ_LENGTH and left unpadded.

    No 'labels' column is produced on purpose: DataCollatorForLanguageModeling
    (mlm=False) derives labels from the padded input_ids at batch time. A
    pre-computed, unpadded 'labels' column is not padded by the tokenizer, so
    batches with sequences of different lengths cannot be converted to tensors.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding=False,  # Will be done by data collator
        return_tensors=None,
    )

# Tokenize datasets
print("\nTokenizing datasets... (this may take a while)")

# Both splits get the same treatment: batched tokenization with the raw
# columns dropped, so only the tokenizer outputs remain.
tokenized_train, tokenized_val = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
        desc=progress_label,
    )
    for split, progress_label in (
        (train_dataset, "Tokenizing training set"),
        (val_dataset, "Tokenizing validation set"),
    )
)

print(f"\nTokenization complete!")
print(f"Training samples: {len(tokenized_train):,}")
print(f"Validation samples: {len(tokenized_val):,}")

## Load Base Model with Quantization

In [None]:
print(f"Loading model: {BASE_MODEL}")
print(f"Quantization: {'4-bit (QLoRA)' if USE_4BIT else '8-bit' if USE_8BIT else 'None (Full precision)'}")

model_kwargs = {"device_map": "auto", "trust_remote_code": True}

# Both quantized modes are expressed through BitsAndBytesConfig; passing a bare
# load_in_8bit=True keyword to from_pretrained is the deprecated route.
# 4-bit takes precedence when both flags are set.
if USE_4BIT or USE_8BIT:
    from transformers import BitsAndBytesConfig

    if USE_4BIT:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16 if USE_BF16 else torch.float16,
            bnb_4bit_use_double_quant=True
        )
    else:
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    model_kwargs["quantization_config"] = bnb_config

# Load model
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **model_kwargs)

# Prepare for k-bit training if quantized
if USE_4BIT or USE_8BIT:
    # Pass the flag explicitly: the helper enables gradient checkpointing by
    # default, which would silently override USE_GRADIENT_CHECKPOINTING=False.
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing=USE_GRADIENT_CHECKPOINTING,
    )
    print("Model prepared for quantized training")

print(f"Model loaded successfully")
print(f"Total parameters: {model.num_parameters() / 1e9:.2f}B")

## Configure and Apply LoRA

In [None]:
# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
    bias="none",
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, lora_config)

# Count parameters in one pass: everything, and the subset that will be trained
trainable_params = 0
total_params = 0
for param in model.parameters():
    size = param.numel()
    total_params += size
    if param.requires_grad:
        trainable_params += size
trainable_percent = 100 * trainable_params / total_params

banner = '=' * 60
print(f"\n{banner}")
print("LORA CONFIGURATION")
print(banner)
print(f"Rank (r):           {LORA_R}")
print(f"Alpha:              {LORA_ALPHA}")
print(f"Dropout:            {LORA_DROPOUT}")
print(f"Target modules:     {', '.join(LORA_TARGET_MODULES)}")
print(f"\nTrainable params:   {trainable_params:,} ({trainable_percent:.2f}%)")
print(f"Total params:       {total_params:,}")
print(banner)

# PEFT's own summary of the same numbers
model.print_trainable_parameters()

## Training Configuration

In [None]:
# Samples consumed per optimizer update on one device
effective_batch = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS

# Estimated optimizer steps over the whole run (used for reporting only)
total_steps = (len(tokenized_train) // effective_batch) * NUM_EPOCHS

# Paged 8-bit AdamW for QLoRA runs, plain torch AdamW otherwise
optimizer_name = "paged_adamw_8bit" if USE_4BIT else "adamw_torch"

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,

    # Learning-rate schedule
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type="cosine",

    # Regularization
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=MAX_GRAD_NORM,

    # Precision / memory
    fp16=USE_FP16,
    bf16=USE_BF16,
    gradient_checkpointing=USE_GRADIENT_CHECKPOINTING,
    optim=optimizer_name,

    # Logging
    logging_steps=50,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),

    # Checkpointing: every 500 steps, keep at most 3 checkpoints
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,

    # Evaluation on the same cadence; the lowest eval_loss checkpoint is restored at the end
    eval_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Other
    report_to="none",
    remove_unused_columns=False,
    ddp_find_unused_parameters=False if USE_GRADIENT_CHECKPOINTING else None,
    push_to_hub=False,
)

# Label for the precision actually requested from the Trainer
if USE_BF16:
    precision_label = 'BF16'
elif USE_FP16:
    precision_label = 'FP16'
else:
    precision_label = 'FP32'

banner = '=' * 60
print(f"\n{banner}")
print("TRAINING CONFIGURATION")
print(banner)
print(f"Epochs:             {NUM_EPOCHS}")
print(f"Batch size:         {BATCH_SIZE} x {GRADIENT_ACCUMULATION_STEPS} = {effective_batch}")
print(f"Learning rate:      {LEARNING_RATE}")
print(f"Total steps:        {total_steps:,}")
print(f"Warmup steps:       {int(total_steps * WARMUP_RATIO):,}")
print(f"Max seq length:     {MAX_SEQ_LENGTH}")
print(f"Precision:          {precision_label}")
print(f"Gradient checkpoint: {USE_GRADIENT_CHECKPOINTING}")
print(banner)

In [None]:
# Data collator: mlm=False selects the causal-LM (next-token) objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Early stopping: patience of 3 evaluations, improvement threshold of 0.001
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=3,
)

print("Data collator and callbacks configured")

## Initialize Trainer and Start Training

In [None]:
# Create Trainer from the model, arguments, tokenized splits, collator and callbacks
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    callbacks=[early_stopping]
)

print("Trainer initialized successfully")
# The emoji was stored as mojibake (UTF-8 bytes read as cp1252); restored here
print(f"\n🚀 Starting training...")
# Rough guess that assumes about 2 seconds per optimizer step
print(f"This will take approximately {total_steps * 2 / 3600:.1f} hours (estimate)\n")

In [None]:
# Run the training loop; the returned object carries the aggregate metrics
train_result = trainer.train()
train_metrics = train_result.metrics

# Print results
banner = '=' * 60
print(f"\n{banner}")
print("TRAINING COMPLETED")
print(banner)
# train_runtime is divided by 3600 to report hours
print(f"Training time:      {train_metrics['train_runtime'] / 3600:.2f} hours")
print(f"Training loss:      {train_metrics['train_loss']:.4f}")
print(f"Samples/second:     {train_metrics['train_samples_per_second']:.2f}")
print(banner)

## Evaluation

In [None]:
# Final evaluation on the validation split held by the trainer
eval_results = trainer.evaluate()

banner = '=' * 60
print(f"\n{banner}")
print("EVALUATION RESULTS")
print(banner)
# One line per reported metric, four decimals each
for metric_name in eval_results:
    print(f"{metric_name:25s}: {eval_results[metric_name]:.4f}")
print(banner)

## Save Model and Metadata

In [None]:
# Save LoRA adapters and the tokenizer side by side
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# The check marks below were stored as mojibake (UTF-8 read as cp1252); restored
print(f"✓ LoRA adapters saved to: {OUTPUT_DIR}")

# Save training metadata: data sizes, LoRA/training settings and final metrics
metadata = {
    "base_model": BASE_MODEL,
    "datasets": dataset_stats,
    "total_samples": len(train_dataset) + len(val_dataset),
    "training_samples": len(train_dataset),
    "validation_samples": len(val_dataset),
    "lora_config": {
        "r": LORA_R,
        "alpha": LORA_ALPHA,
        "dropout": LORA_DROPOUT,
        "target_modules": LORA_TARGET_MODULES
    },
    "training_config": {
        "learning_rate": LEARNING_RATE,
        "num_epochs": NUM_EPOCHS,
        # Effective batch size (per-device batch x accumulation steps)
        "batch_size": BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
        "max_seq_length": MAX_SEQ_LENGTH,
        "quantization": "4-bit" if USE_4BIT else "8-bit" if USE_8BIT else "none"
    },
    "results": {
        # float() turns the metric values into plain JSON-serialisable numbers
        "train_loss": float(train_result.metrics['train_loss']),
        "eval_loss": float(eval_results['eval_loss']),
        "train_runtime_hours": float(train_result.metrics['train_runtime'] / 3600)
    }
}

# Explicit encoding so the file does not depend on the platform default
with open(os.path.join(OUTPUT_DIR, 'training_metadata.json'), 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"✓ Training metadata saved")
print(f"\nAll outputs saved to: {OUTPUT_DIR}")

## Quick Inference Test

In [None]:
# Test the model
print("Testing fine-tuned model...\n")

test_prompts = [
    "### Instruction:\nDescribe Elio's personality.\n\n### Response:\n",
    "### Instruction:\nWhat is the Communiverse?\n\n### Response:\n",
    "### Character: Glordon\n\nHuman: What do you think about potatoes?\n\nGlordon:"
]

# Inference mode: turns off dropout (including LoRA dropout) for generation
model.eval()

for i, prompt in enumerate(test_prompts, 1):
    print(f"{'='*60}")
    print(f"Test {i}/{len(test_prompts)}")
    print(f"{'='*60}")
    print(f"PROMPT:\n{prompt}")
    print(f"\nRESPONSE:")

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt by token count, not by character count: decoded text is
    # not guaranteed to reproduce the prompt string exactly (whitespace,
    # special tokens), so slicing by len(prompt) can cut the answer wrongly.
    generated_tokens = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(response)
    print()

# The emoji was stored as mojibake (UTF-8 read as cp1252); restored here
print(f"\n✅ Training and testing complete!")