# In [None]:  (Jupyter cell prompt left over from the notebook export)
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import os
from datetime import datetime

# Configuration: base checkpoint and the Colab paths used for data, adapter and logs.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "/content/lora_adapter"
DATA_FILE = "/content/cli_qa.json"
LOG_DIR = "/content/logs"

# Ensure the adapter and log directories exist (no error if already present).
for _required_dir in (OUTPUT_DIR, LOG_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Startup banner: wall-clock start time and CUDA status.
print("🚀 Starting TinyLlama fine-tuning...")
print(f"📅 Start time: {datetime.now()}")
_cuda_ready = torch.cuda.is_available()
print(f"🔥 CUDA available: {_cuda_ready}")
if _cuda_ready:
    # Report the name and total memory (in GB) of device 0.
    print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"💾 GPU Memory: {_gpu_props.total_memory / 1e9:.1f} GB")

# Step 1: Load and prepare data
# Read the Q&A pairs from DATA_FILE; if the file is missing or unreadable,
# fall back to an interactive Colab upload of `cli_qa.json`.
print("\n📊 Loading Q&A data...")
try:
    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        qa_data = json.load(f)
    print(f"📈 Successfully loaded {len(qa_data)} Q&A pairs")
except (OSError, ValueError) as e:
    # OSError covers a missing/unreadable file; ValueError covers malformed
    # JSON (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
    print(f"❌ ERROR loading data: {e}")
    print("💡 Please upload your cli_qa.json file to Colab first")
    from google.colab import files
    uploaded = files.upload()
    if 'cli_qa.json' not in uploaded:
        print("❌ No file uploaded. Exiting.")
        # Raise SystemExit instead of calling exit(): `exit` is an interactive
        # helper and is not guaranteed to stop execution inside a notebook
        # kernel, which would let later steps run with `qa_data` undefined.
        raise SystemExit(1)
    with open(DATA_FILE, 'wb') as f:
        f.write(uploaded['cli_qa.json'])
    print("✅ File uploaded successfully!")
    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        qa_data = json.load(f)
    print(f"📈 Successfully loaded {len(qa_data)} Q&A pairs")

# Step 2: Format data for training
def format_prompt(question, answer, eos_token="</s>"):
    """Format a Q&A pair as a single chat-style training string for TinyLlama.

    Args:
        question: The user's question text.
        answer: The assistant's answer text.
        eos_token: Marker appended after the answer. Defaults to ``</s>``,
            the Llama end-of-sequence token, so the example ends on a real
            stop token rather than on literal text the tokenizer would split
            into ordinary word pieces.

    Returns:
        The prompt string: fixed system line, user turn, assistant turn,
        then ``eos_token``.
    """
    return (
        "<|system|>\nYou are a helpful command-line assistant.\n"
        f"<|user|>\n{question}\n"
        f"<|assistant|>\n{answer}{eos_token}"
    )

# Build one {"text": ...} training example per raw Q&A record.
formatted_data = [
    {"text": format_prompt(record['question'], record['answer'])}
    for record in qa_data
]

print(f"✅ Formatted {len(formatted_data)} training examples")

# Step 3: Load tokenizer and model
print("\n🤖 Loading TinyLlama model and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Reuse EOS as the padding token and pad on the right so every padded
    # sequence keeps its real tokens at the front.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print("✅ Tokenizer loaded and configured")

    # trust_remote_code is intentionally not set: TinyLlama uses the Llama
    # architecture that ships with transformers, so no repository-provided
    # Python code needs to be executed to load it.
    # NOTE(review): weights are loaded in float16 and training later enables
    # fp16 mixed precision; with some PEFT versions this combination fails with
    # "Attempting to unscale FP16 gradients" — confirm with installed versions.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    print(f"✅ Model loaded. Parameters: {model.num_parameters():,}")
except Exception as e:
    print(f"❌ ERROR loading model: {e}")
    print("💡 Try running: !pip install --upgrade transformers torch peft")
    # SystemExit reliably stops the script/cell; the interactive exit() helper
    # is not guaranteed to halt execution inside a notebook kernel.
    raise SystemExit(1) from e

# Step 4: Configure LoRA
print("\n⚙ Configuring LoRA...")
# Adapter hyper-parameters: rank 16, alpha 32, dropout 0.1, applied to the
# four attention projection modules; bias terms are left untouched.
_lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
)
lora_config = LoraConfig(**_lora_settings)

# Wrap the base model with the adapters, then count parameters that still
# require gradients.
model = get_peft_model(model, lora_config)
trainable_params = 0
for _param in model.parameters():
    if _param.requires_grad:
        trainable_params += _param.numel()
_trainable_pct = trainable_params / model.num_parameters() * 100
print(f"🎯 Trainable parameters: {trainable_params:,} ({_trainable_pct:.2f}%)")

# Step 5: Prepare dataset
def tokenize_function(examples):
    """Tokenize formatted prompts and build causal-LM labels.

    Args:
        examples: Mapping with a "text" entry holding either one string or a
            list of strings (the batched form produced by ``Dataset.map``).

    Returns:
        The tokenizer output, truncated/padded to 512 tokens, with an added
        "labels" entry: a copy of each ``input_ids`` row in which padding
        positions are replaced by -100 so they are ignored by the loss.
    """
    texts = examples["text"]
    if not isinstance(texts, list):
        texts = [texts]

    outputs = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_attention_mask=True,
        return_tensors=None,
    )

    # Mask by attention mask rather than by token id: the pad token is set to
    # the EOS token in this script, so comparing against pad_token_id would
    # also blank out genuine end-of-sequence tokens inside the text.
    outputs["labels"] = [
        [token if keep else -100 for token, keep in zip(input_ids, attention)]
        for input_ids, attention in zip(
            outputs["input_ids"], outputs["attention_mask"]
        )
    ]

    return outputs

print("🔄 Creating and tokenizing dataset...")
dataset = Dataset.from_list(formatted_data)
# Tokenize in batches of 10 and drop the raw "text" column afterwards.
_map_options = {
    "batched": True,
    "batch_size": 10,
    "remove_columns": ["text"],
    "desc": "Tokenizing",
}
tokenized_dataset = dataset.map(tokenize_function, **_map_options)

# Debug info: dataset size and the distinct tokenized sequence lengths.
lengths = list(map(lambda row: len(row['input_ids']), tokenized_dataset))
print(f"\n🔢 Dataset size: {len(tokenized_dataset)}")
_distinct_lengths = list(set(lengths))  # list form for display
print(f"📏 All input lengths: {_distinct_lengths}")

# Step 6: Training configuration
print("\n🏋 Setting up training...")
# Effective batch size is 4 (per device) x 4 (accumulation) = 16 per device.
# NOTE(review): with a single epoch, a dataset that yields fewer than 50
# optimizer steps would finish while still in warmup — confirm against the
# dataset size.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    remove_unused_columns=False,
    logging_dir=LOG_DIR,
    # The string "none" disables all reporting integrations; passing None
    # instead falls back to the library default, which can enable installed
    # integrations (e.g. prompt for a wandb login) instead of turning them off.
    report_to="none",
)

# Causal-LM collation (mlm=False): no masked-language-model token masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Step 7: Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Step 8: Start training
print("\n🔥 Starting training...")
start_time = datetime.now()
try:
    trainer.train()
    print("✅ Training completed successfully!")
except Exception as train_error:
    # Report the failure, then let the original exception propagate.
    print(f"❌ Training failed: {train_error}")
    raise

# Wall-clock duration of the training run.
end_time = datetime.now()
training_duration = end_time - start_time
print(f"⏱ Training duration: {training_duration}")

# Step 9: Save the model
print("\n💾 Saving LoRA adapter...")
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(OUTPUT_DIR)

# Optional: Save to Google Drive (best effort; any failure is only reported)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    drive_output_dir = "/content/drive/MyDrive/lora_adapter"
    os.makedirs(drive_output_dir, exist_ok=True)
    for _artifact in (model, tokenizer):
        _artifact.save_pretrained(drive_output_dir)
    print(f"💾 Also saved to Google Drive at: {drive_output_dir}")
except Exception as drive_error:
    print(f"⚠ Couldn't save to Google Drive: {drive_error}")

# Step 10: Create training log
# Sort the distinct lengths so the log content is deterministic across runs.
unique_lengths = sorted(set(lengths))

# target_modules may be held as an unordered collection; sort it for a stable,
# JSON-serializable list (a plain string pattern is logged unchanged).
_target_modules = lora_config.target_modules
if not isinstance(_target_modules, str):
    _target_modules = sorted(_target_modules)

log_data = {
    "model_name": MODEL_NAME,
    "training_start": start_time.isoformat(),
    "training_end": end_time.isoformat(),
    "training_duration": str(training_duration),
    "num_examples": len(formatted_data),
    # Read the epoch count from the training configuration so the log cannot
    # drift from the value actually used.
    "num_epochs": training_args.num_train_epochs,
    "trainable_parameters": trainable_params,
    "input_lengths": unique_lengths,
    "lora_config": {
        "r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "lora_dropout": lora_config.lora_dropout,
        "target_modules": _target_modules,
    },
}

log_path = os.path.join(LOG_DIR, "training_log.json")
with open(log_path, 'w', encoding='utf-8') as f:
    json.dump(log_data, f, indent=2)
print(f"\n📝 Training log saved to: {log_path}")

# Step 11: Test the model
print("\n🧪 Quick test of fine-tuned model...")
test_prompt = "How do I list all files in a directory?"
# Same system/user/assistant layout as the training prompts, left open at the
# assistant turn so the model generates the answer.
input_text = f"<|system|>\nYou are a helpful command-line assistant.\n<|user|>\n{test_prompt}\n<|assistant|>\n"
# Send the inputs to the device the model weights actually live on instead of
# assuming CUDA, so the test also works on CPU-only runtimes.
model_device = next(model.parameters()).device
inputs = tokenizer(input_text, return_tensors="pt").to(model_device)

# The model may still be in training mode after trainer.train(); switch to eval
# mode so dropout (including LoRA dropout) is disabled while generating.
model.eval()
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# The decoded text contains the prompt followed by the generated continuation.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Test response:\n{response}")

print("\n✨ All done! Ready for Phase 3.")