In [None]:
# Unsloth LLM Finetuning for ServiceNow QA Dataset
# 
# This notebook performs finetuning of Qwen2.5-7B on the ServiceNow QA dataset
# using Unsloth for faster training.

import os
import sys
import yaml
import json
import boto3
from pathlib import Path

# Resolve the project root: the parent of the working directory when
# "notebooks" appears among its path components, otherwise the working
# directory itself. It is appended to sys.path so project modules import.
_working_dir = Path.cwd()
if "notebooks" in _working_dir.parts:
    project_root = _working_dir.parent
else:
    project_root = _working_dir
sys.path.append(str(project_root))

# Install required packages
!pip install -q unsloth boto3 peft
!pip install -q --upgrade --no-cache-dir git+https://github.com/unslothai/unsloth.git

# Import project modules
from src.cloud.auth import get_s3_client
from src.cloud.storage import download_from_s3, upload_to_s3

# Load configuration
def load_config(config_path):
    """Load a YAML configuration file and return its parsed content.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as config_file:
        return yaml.safe_load(config_file)

# Read the data-processing settings
data_config_path = os.path.join(project_root, 'configs/data/data_processing.yaml')
data_config = load_config(data_config_path)

# Bucket and region both come from the "s3" section of that file
_s3_settings = data_config['s3']
s3_bucket = _s3_settings['default_bucket']
s3_region = _s3_settings['region']

# Build the S3 client for the configured region
s3_client = get_s3_client(region=s3_region)

# Download the ServiceNow QA dataset from S3
local_data_path = "servicenow-qa_converted.json"
s3_data_path = "data/processed/servicenow-qa_converted.json"

print(f"Downloading dataset from s3://{s3_bucket}/{s3_data_path}")
download_from_s3(s3_client, s3_bucket, s3_data_path, local_data_path)

# Load the ServiceNow QA dataset. The file is decoded explicitly as UTF-8 so
# non-ASCII text is read the same way regardless of the platform's locale.
with open(local_data_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print(f"Loaded dataset with {len(dataset)} examples")

# Set up Unsloth finetuning
from unsloth import FastLanguageModel
import torch

# Load the training config, or fall back to built-in model settings when the
# file does not exist
training_config_path = os.path.join(project_root, 'configs/training/llm_finetuning.yaml')
if not os.path.exists(training_config_path):
    print(f"Warning: Training config not found at {training_config_path}, using defaults")
    training_config = {
        "model": {
            "name": "unsloth/Qwen2.5-7B",
            "max_seq_length": 2048,
            "load_in_4bit": True
        }
    }
else:
    training_config = load_config(training_config_path)

# Pull the model parameters out of the "model" section
_model_settings = training_config['model']
model_name = _model_settings['name']
max_seq_length = _model_settings['max_seq_length']
load_in_4bit = _model_settings['load_in_4bit']
dtype = None  # Auto detection: Float16 for Tesla T4, V100, Bfloat16 for Ampere+

# Report the configuration that will be used
print(f"Using model configuration:")
for _label, _value in (
    ("Model", model_name),
    ("Max Sequence Length", max_seq_length),
    ("Load in 4-bit", load_in_4bit),
):
    print(f"  - {_label}: {_value}")

# Load the base model and its tokenizer through Unsloth
print(f"Loading {model_name} with Unsloth optimization...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Built-in LoRA settings; any key missing from the config falls back to these
_default_lora_config = {
    "r": 16,
    "alpha": 16,
    "dropout": 0,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    "use_rslora": False,
}

# Merge instead of replacing, so a partial (or null) "lora" section still
# yields every key -- in particular "target_modules", which previously became
# None whenever the section existed without it.
lora_config = {**_default_lora_config, **(training_config.get('lora') or {})}

# Log LoRA configuration
print("Using LoRA configuration:")
print(f"  - Rank: {lora_config['r']}")
print(f"  - Alpha: {lora_config['alpha']}")
print(f"  - Dropout: {lora_config['dropout']}")
print(f"  - Target modules: {lora_config['target_modules']}")
print(f"  - Using RS-LoRA: {lora_config['use_rslora']}")

# Add LoRA adapters
print("Adding LoRA adapters...")
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_config['r'],
    target_modules=lora_config['target_modules'],
    lora_alpha=lora_config['alpha'],
    lora_dropout=lora_config['dropout'],
    bias="none",  # "none" is optimized
    use_gradient_checkpointing="unsloth",  # "unsloth" for very long context
    random_state=3407,
    use_rslora=lora_config['use_rslora'],
    loftq_config=None,
)

# Format the ServiceNow QA dataset for training
from datasets import Dataset

# Format dataset for training
def format_servicenow_qa(examples, eos_token=None):
    """Render chat examples as ChatML training text.

    Each example is a mapping with a "messages" list whose entries carry a
    "role" and a "content". One system, one user and one assistant turn are
    rendered per example: when a role occurs more than once the last message
    of that role is used, and a role that never occurs renders as empty text.
    Messages with any other role, or without a role, are ignored.

    Args:
        examples: Iterable of example mappings.
        eos_token: End-of-sequence string appended after the assistant turn.
            Defaults to ``tokenizer.eos_token`` of the module-level tokenizer.

    Returns:
        A list of ``{"text": <formatted string>}`` dicts, one per example.
    """
    im_end = "<|im_end|>"
    if eos_token is None:
        eos_token = tokenizer.eos_token

    # ChatML closes a turn with <|im_end|> directly after the content, so the
    # EOS token goes after it rather than in front of it. It is not repeated
    # when EOS already is <|im_end|>, and it is skipped when there is no EOS.
    if eos_token in (None, "", im_end):
        assistant_terminator = im_end
    else:
        assistant_terminator = im_end + eos_token

    formatted_examples = []
    for example in examples:
        contents = {"system": "", "user": "", "assistant": ""}
        for message in example.get("messages") or []:
            role = message.get("role")
            if role in contents:
                contents[role] = message.get("content", "")

        formatted_text = (
            f"<|im_start|>system\n{contents['system']}{im_end}\n"
            f"<|im_start|>user\n{contents['user']}{im_end}\n"
            f"<|im_start|>assistant\n{contents['assistant']}{assistant_terminator}"
        )
        formatted_examples.append({"text": formatted_text})

    return formatted_examples

# Turn the formatted records into a Hugging Face dataset
formatted_data = format_servicenow_qa(dataset)
train_dataset = Dataset.from_list(formatted_data)

print(f"Formatted dataset with {len(train_dataset)} examples")

# Show the first 500 characters of the first record as a formatting check
print("\nSample formatted example:")
_sample_text = train_dataset[0]["text"]
print(_sample_text[:500] + "...\n")

# Set up TRL SFTTrainer
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Built-in trainer hyperparameters; any key missing from the config's
# "training" section falls back to these
_default_train_config = {
    "batch_size": 2,
    "gradient_accumulation_steps": 4,
    "warmup_steps": 5,
    "num_train_epochs": 3,
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "lr_scheduler_type": "linear",
    "seed": 3407,
    "logging_steps": 10,
    "optim": "adamw_8bit",
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "save_total_limit": 3,
}

# Merge so a partial or null "training" section still resolves every key
train_config = {**_default_train_config, **(training_config.get('training') or {})}

# Get output settings from config; a missing or null "output" section is
# replaced by the built-in one
output_config = training_config.get('output')
if output_config is None:
    output_config = {"dir": "models/finetuned"}

# NOTE(review): the fallback is "models/finetuned" when the whole section is
# absent but "finetuned_model" when the section exists without a "dir" key --
# confirm whether the two are meant to differ.
output_dir = output_config.get("dir", "finetuned_model")

# Report the main training settings, one "label: value" line each
print(f"Using training configuration:")
_training_summary = (
    ("Batch size", train_config.get('batch_size', 2)),
    ("Gradient accumulation steps", train_config.get('gradient_accumulation_steps', 4)),
    ("Number of epochs", train_config.get('num_train_epochs', 3)),
    ("Learning rate", train_config.get('learning_rate', 2e-4)),
    ("Output directory", output_dir),
)
for _label, _value in _training_summary:
    print(f"  - {_label}: {_value}")

# Create training arguments

# Query bf16 support once; fp16 is used exactly when bf16 is not available
_bf16_ok = is_bfloat16_supported()

# Newer transformers releases name this option `eval_strategy`, while older
# ones only accept `evaluation_strategy`. Use whichever field the installed
# TrainingArguments dataclass declares so the call works with both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    per_device_train_batch_size=train_config.get('batch_size', 2),
    gradient_accumulation_steps=train_config.get('gradient_accumulation_steps', 4),
    warmup_steps=train_config.get('warmup_steps', 5),
    num_train_epochs=train_config.get('num_train_epochs', 3),
    learning_rate=train_config.get('learning_rate', 2e-4),
    fp16=not _bf16_ok,
    bf16=_bf16_ok,
    logging_steps=train_config.get('logging_steps', 10),
    optim=train_config.get('optim', "adamw_8bit"),
    weight_decay=train_config.get('weight_decay', 0.01),
    lr_scheduler_type=train_config.get('lr_scheduler_type', "linear"),
    seed=train_config.get('seed', 3407),
    output_dir=output_dir,
    save_strategy=train_config.get('save_strategy', "epoch"),
    save_total_limit=train_config.get('save_total_limit', 3),
    report_to="none",  # Set to "wandb" if using Weights & Biases
    # The config file key stays "evaluation_strategy"; only the keyword
    # passed to TrainingArguments is adapted.
    **{_eval_strategy_key: train_config.get('evaluation_strategy', "no")},
)

# Build the supervised fine-tuning trainer over the formatted "text" column
trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    # Packing is disabled here; per the original note, enabling it can make
    # training 5x faster for short sequences.
    packing=False,
    dataset_num_proc=2,
)

# Report the GPU in use and how much of its memory is already allocated
_BYTES_PER_GIB = 1024**3
print("\nTraining Memory Stats:")
total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / _BYTES_PER_GIB
allocated_gpu_memory = torch.cuda.memory_allocated() / _BYTES_PER_GIB
_gpu_name = torch.cuda.get_device_name(0)
print(f"GPU: {_gpu_name}")
print(f"Total GPU Memory: {total_gpu_memory:.3f} GB")
print(f"Currently Allocated: {allocated_gpu_memory:.3f} GB")

# Run the fine-tuning loop
print("\nStarting training...")
trainer_stats = trainer.train()

# Derive runtime and peak-memory figures from the run
training_time_seconds = trainer_stats.metrics.get("train_runtime", 0)
training_time_minutes = training_time_seconds / 60
peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
_peak_memory_pct = (peak_memory_gb / total_gpu_memory) * 100

print(f"\n{training_time_seconds:.4f} seconds used for training.")
print(f"{training_time_minutes:.2f} minutes used for training.")
print(f"Peak allocated memory = {peak_memory_gb:.3f} GB.")
print(f"Peak allocated memory % of max memory = {_peak_memory_pct:.3f} %.")

# Write the model and then the tokenizer to output_dir
print("\nSaving model...")
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(output_dir)

# Switch the model into Unsloth's inference mode
FastLanguageModel.for_inference(model)

# Smoke-test the finetuned model on a single question
from transformers import TextStreamer

print("\nTesting the finetuned model:")
test_question = "How do I reset my ServiceNow password?"

# ChatML prompt that ends on an open assistant turn for the model to complete
test_prompt = f"""<|im_start|>system
You are a helpful AI assistant.<|im_end|>
<|im_start|>user
{test_question}<|im_end|>
<|im_start|>assistant
"""

# Tokenize, then move the tensors to the GPU
inputs = tokenizer(test_prompt, return_tensors="pt")
inputs = inputs.to("cuda")

# Stream the generated tokens to stdout as they are produced
text_streamer = TextStreamer(tokenizer)
print("\nModel response:")
_ = model.generate(max_new_tokens=200, streamer=text_streamer, **inputs)

# Default S3 prefix, derived from the last path component of the model name
_default_model_prefix = f"models/{model_name.split('/')[-1]}-servicenow-qa"

# Get save configuration from config. Built-in values are merged with the
# config's "s3" section so a partial or null section still resolves.
s3_config = {
    "bucket": s3_bucket,  # Use the one from data_config if not specified
    "model_prefix": _default_model_prefix,
    **(training_config.get('s3') or {}),
}

# Fall back to the default when "model_prefix" is missing or empty; otherwise
# the upload keys would start with "None/".
model_s3_path = s3_config.get("model_prefix") or _default_model_prefix

# NOTE(review): s3_config["bucket"] is not consulted below -- uploads use
# s3_bucket from the data config. Confirm whether a bucket set in the
# training config is meant to override it.

# Get output formats from config
output_formats = output_config.get("save_formats", ["lora", "gguf_q4_k_m"])

print("\nSaving model to S3:")
print(f"  - S3 bucket: {s3_bucket}")
print(f"  - S3 path: {model_s3_path}")
print(f"  - Output formats: {output_formats}")

# Save and upload the LoRA / merged exports that were requested.
# Each entry is (save_method, progress message); the order is fixed and does
# not depend on the order inside output_formats.
_merged_exports = (
    ("lora", "\nSaving LoRA adapters..."),
    ("merged_16bit", "\nSaving merged 16-bit model..."),
    ("merged_4bit", "\nSaving merged 4-bit model..."),
)

for _save_method, _banner in _merged_exports:
    if _save_method not in output_formats:
        continue
    print(_banner)
    # Give every format its own sub-directory. output_dir is also the
    # trainer's output directory and already holds the files written by the
    # earlier save_pretrained calls, so saving and uploading it directly would
    # mix checkpoints and adapter files into each export.
    _export_dir = os.path.join(output_dir, _save_method)
    model.save_pretrained_merged(_export_dir, tokenizer, save_method=_save_method)
    upload_to_s3(s3_client, s3_bucket, _export_dir, f"{model_s3_path}/{_save_method}")

# Save in GGUF format if specified
import glob

gguf_formats = [fmt for fmt in output_formats if fmt.startswith("gguf_")]
if gguf_formats:
    print("\nConverting to GGUF format(s)...")
    for gguf_format in gguf_formats:
        # Extract quantization method from format name (e.g., "gguf_q4_k_m" -> "q4_k_m")
        quant_method = gguf_format.replace("gguf_", "")
        print(f"  - Creating GGUF with quantization: {quant_method}")
        model.save_pretrained_gguf(output_dir, tokenizer, quantization_method=quant_method)

        # Locate the file that was written. The originally hard-coded path is
        # tried first; if it does not exist, look for a .gguf file carrying
        # the quantization tag next to or inside output_dir.
        # NOTE(review): the exact file name Unsloth produces appears to differ
        # between releases -- confirm against the installed version.
        _quant_tag = quant_method.upper()
        _gguf_path = f"{output_dir}-unsloth-{_quant_tag}.gguf"
        if not os.path.exists(_gguf_path):
            _escaped_dir = glob.escape(output_dir)
            _found = sorted(
                glob.glob(f"{_escaped_dir}*{_quant_tag}*.gguf")
                + glob.glob(os.path.join(_escaped_dir, f"*{_quant_tag}*.gguf"))
            )
            if not _found:
                raise FileNotFoundError(
                    f"No GGUF file for quantization '{quant_method}' found "
                    f"next to or inside '{output_dir}'"
                )
            _gguf_path = _found[0]

        upload_to_s3(
            s3_client,
            s3_bucket,
            _gguf_path,
            f"{model_s3_path}/gguf/model-{quant_method}.gguf"
        )

print(f"\nModel successfully saved to s3://{s3_bucket}/{model_s3_path}/")
print("Finetuning complete!")