In [None]:
import json
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import os
from datetime import datetime

# Configuration: base checkpoint plus every path this script reads or writes.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DATA_FILE = "./data/cli_qa.json"
OUTPUT_DIR = "./lora_adapter"
LOG_DIR = "./logs"

# Make sure both output locations exist; directories already present are fine.
for _required_dir in (OUTPUT_DIR, LOG_DIR):
    os.makedirs(_required_dir, exist_ok=True)

print("🚀 Starting TinyLlama fine-tuning...")
print(f"📅 Start time: {datetime.now()}")

# Step 1: Load and prepare data
print("\n📊 Loading Q&A data...")
# Decode explicitly as UTF-8 so the parsed text does not depend on the
# platform's default locale encoding.
with open(DATA_FILE, "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# The formatting step iterates these records and indexes each one by
# 'question' / 'answer'; fail early with a clear message if the file does not
# hold a non-empty list instead of failing later in a less obvious place.
if not isinstance(qa_data, list) or not qa_data:
    raise ValueError(f"{DATA_FILE} must contain a non-empty JSON list of Q&A records")

print(f"📈 Loaded {len(qa_data)} Q&A pairs")

# Step 2: Format data for training
def format_prompt(
    question,
    answer,
    system_prompt="You are a helpful command-line assistant.",
    end_token="<|end|>",
):
    """Format a Q&A pair as a single chat-style training string.

    The result is a system turn, a user turn and an assistant turn, each
    introduced by its ``<|role|>`` tag on its own line, with ``end_token``
    appended directly after the answer.

    Args:
        question: Text placed in the user turn.
        answer: Text placed in the assistant turn.
        system_prompt: Text placed in the system turn. The default keeps the
            original hard-coded instruction.
        end_token: Marker appended right after the answer. The default keeps
            the original ``<|end|>`` so existing callers get identical output.
            NOTE(review): TinyLlama-Chat's published chat template appears to
            close turns with the tokenizer's EOS token (``</s>``) rather than
            ``<|end|>`` -- confirm, and pass ``end_token=tokenizer.eos_token``
            if the model should learn its native stop token.

    Returns:
        The formatted prompt string.
    """
    return (
        f"<|system|>\n{system_prompt}\n"
        f"<|user|>\n{question}\n"
        f"<|assistant|>\n{answer}{end_token}"
    )

# Wrap every formatted prompt in a {"text": ...} record, preserving input order.
formatted_data = [
    {"text": format_prompt(pair['question'], pair['answer'])}
    for pair in qa_data
]

print(f"✅ Formatted {len(formatted_data)} training examples")

# Step 3: Load tokenizer and model
print("\n🤖 Loading TinyLlama model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token and pad on the right when batching.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# trust_remote_code is deliberately NOT enabled: it allows the Hub repository
# to execute its own Python while loading, and nothing in this script needs it.
# NOTE(review): assumes the checkpoint uses an architecture built into
# transformers (TinyLlama is published as a Llama-architecture model) -- confirm.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # half-precision weights
    device_map="auto",          # let the library choose device placement
)

print(f"📊 Model loaded. Parameters: {model.num_parameters():,}")

# Step 4: Configure LoRA
print("\n⚙ Configuring LoRA...")
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,                    # Low rank
    lora_alpha=32,          # Scaling factor
    lora_dropout=0.1,       # Dropout
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Target attention layers
    bias="none"
)

model = get_peft_model(model, lora_config)

# The base model is loaded in float16 and training runs with fp16 mixed
# precision. The gradient scaler refuses to unscale gradients that belong to
# float16 parameters ("Attempting to unscale FP16 gradients"), so every
# trainable (adapter) weight is kept in float32. Parameters that are already
# float32 are left unchanged by .float().
for param in model.parameters():
    if param.requires_grad:
        param.data = param.data.float()

# Report how small the trainable subset is relative to the whole model.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"🎯 Trainable parameters: {trainable_params:,} ({trainable_params/model.num_parameters()*100:.2f}%)")

# Step 5: Prepare dataset
def tokenize_function(examples):
    """Tokenize a batch of formatted prompts.

    Args:
        examples: Batch mapping whose "text" entry holds the prompt strings.

    Returns:
        The tokenizer output for the batch: sequences truncated to at most
        512 tokens, left unpadded, returned as plain Python lists.
    """
    # No "labels" column is produced here on purpose. Sequences are unpadded,
    # and the padding step the LM collator relies on only pads the tokenizer's
    # own fields, so a per-example "labels" list of varying length cannot be
    # stacked into a batch tensor. With mlm=False the collator derives labels
    # from the padded input_ids itself.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

# Create dataset
# Build the raw dataset from the formatted records, then tokenize it in
# batches; the raw "text" column is dropped once it has been tokenized.
dataset = Dataset.from_list(formatted_data)
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

print(f"🔢 Dataset size: {tokenized_dataset.num_rows}")

# Step 6: Training configuration
print("\n🏋 Setting up training...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,              # 1 epoch as required
    per_device_train_batch_size=4,   # Small batch for memory efficiency
    gradient_accumulation_steps=4,   # Effective batch size = 16
    # NOTE(review): with a small dataset and one epoch the run may contain
    # fewer than 50 optimizer steps, so warmup may never finish -- confirm.
    warmup_steps=50,
    learning_rate=2e-4,
    fp16=True,                       # Mixed precision
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    logging_dir=LOG_DIR,
    # The string "none" is the documented way to switch off wandb and every
    # other reporting integration. Python None does NOT disable them: it
    # falls back to the library default, which on many releases is "all".
    report_to="none",
    load_best_model_at_end=False,
)

# Data collator
# mlm=False selects causal-LM batching (not masked LM); batches are padded to
# a multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
)

# Step 7: Initialize trainer
# Gather the trainer's inputs in one place, then construct it.
_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**_trainer_kwargs)

# Step 8: Start training
print("\n🔥 Starting training...")
start_time = datetime.now()

try:
    # Only the call that can actually fail is guarded.
    trainer.train()
except Exception as e:
    # Surface the failure in the console, then propagate it unchanged so the
    # original traceback is preserved and the script stops here.
    print(f"❌ Training failed: {e}")
    raise
else:
    # Success path, kept outside the guarded region.
    print("✅ Training completed successfully!")

# Wall-clock duration of the training call.
end_time = datetime.now()
training_duration = end_time - start_time
print(f"⏱ Training duration: {training_duration}")

# Step 9: Save the model
print("\n💾 Saving LoRA adapter...")
# The model and the tokenizer are both written to the same output directory,
# model first.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(OUTPUT_DIR)

# Step 10: Create training log
# LoraConfig may hold target_modules as a set (PEFT normalizes a list into a
# set), and json cannot serialize sets. Convert to a sorted list so the dump
# succeeds and the output is deterministic; a plain string or None is kept.
_target_modules = lora_config.target_modules
if _target_modules is not None and not isinstance(_target_modules, str):
    _target_modules = sorted(_target_modules)

log_data = {
    "model_name": MODEL_NAME,
    "training_start": start_time.isoformat(),
    "training_end": end_time.isoformat(),
    "training_duration": str(training_duration),
    "num_examples": len(formatted_data),
    # Read the epoch count from the actual training configuration so the log
    # cannot drift from what was run.
    "num_epochs": training_args.num_train_epochs,
    "trainable_parameters": trainable_params,
    "lora_config": {
        "r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "lora_dropout": lora_config.lora_dropout,
        "target_modules": _target_modules
    }
}

# Explicit UTF-8 keeps the log readable regardless of the platform locale.
with open(os.path.join(LOG_DIR, "training_log.json"), 'w', encoding="utf-8") as f:
    json.dump(log_data, f, indent=2)

print("\n🎉 Fine-tuning complete!")
print(f"📁 LoRA adapter saved to: {OUTPUT_DIR}")
print(f"📝 Training log saved to: {LOG_DIR}/training_log.json")

# Step 11: Test the model quickly
print("\n🧪 Quick test of fine-tuned model...")
test_prompt = "How do I list all files in a directory?"
prompt_text = f"<|system|>\nYou are a helpful command-line assistant.\n<|user|>\n{test_prompt}\n<|assistant|>\n"

# Tokenize into input_ids plus attention_mask and move both onto the device
# holding the model's weights; the model was placed by device_map="auto", so
# a tensor left on the CPU would not match it.
inputs = tokenizer(prompt_text, return_tensors="pt").to(next(model.parameters()).device)

# Leave training mode so the LoRA dropout is inactive during generation.
model.eval()
with torch.no_grad():
    # Inputs are passed by keyword, and the attention mask is given explicitly
    # because the pad token is the same as EOS and cannot be inferred from it.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# The decoded text contains the prompt followed by the generated continuation.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Test response:\n{response}")

print("\n✨ All done! Ready for Phase 3.")