# Qwen2.5-Coder-7B Fine-tuning with LoRA (v2 - Fixed)

This notebook fine-tunes Qwen2.5-Coder-7B-Instruct on your custom coding dataset.

**Before running:**
1. Runtime > Change runtime type > Select **T4 GPU**
2. Have your HuggingFace token ready (it needs **write** access, because the notebook pushes the fine-tuned model to the Hub)

**Fixes in v2:**
- Let SFTTrainer handle PEFT setup (fixes gradient issues)
- Proper gradient checkpointing config
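- Inference now reloads the base model and attaches the LoRA adapter instead of merging it into the 4-bit weights (which corrupted the result)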

## 1. Install Dependencies

In [None]:
!pip install -q transformers>=4.45.0 datasets accelerate>=0.30.0 peft>=0.11.0 trl>=0.9.0 bitsandbytes huggingface_hub

## 2. Login to HuggingFace

In [None]:
# Authenticate this session with the Hugging Face Hub. login() is called with
# no arguments, so no token is hard-coded in the notebook.
from huggingface_hub import login
login()

## 3. Check GPU

In [None]:
import torch

# Report CUDA availability first so the status line is printed even when the
# check below aborts the notebook.
cuda_ready = torch.cuda.is_available()
print(f"GPU Available: {cuda_ready}")

# Fail fast: every later cell needs a GPU.
if not cuda_ready:
    raise RuntimeError("No GPU detected! Go to Runtime > Change runtime type > T4 GPU")

# Describe device 0: its name and total memory in (decimal) gigabytes.
print(f"GPU: {torch.cuda.get_device_name(0)}")
device_props = torch.cuda.get_device_properties(0)
print(f"Memory: {device_props.total_memory / 1e9:.1f} GB")

## 4. Load Dataset

In [None]:
from datasets import load_dataset

# Hub repository that holds both JSONL files.
DATASET_REPO = "goodknightleo/qwen-coder-training-data"


def _load_jsonl(filename):
    """Load one JSONL file from DATASET_REPO.

    split="train" is requested for both files, exactly as the file is
    addressed through `data_files` rather than through a named split.
    """
    return load_dataset(DATASET_REPO, data_files=filename, split="train")


train_dataset = _load_jsonl("train.jsonl")
eval_dataset = _load_jsonl("valid.jsonl")

# Quick sanity report: sizes of both sets and the column names of one example.
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")
print(f"\nSample keys: {list(train_dataset[0].keys())}")

## 5. Setup Model & Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

# Native bfloat16 math needs compute capability >= 8.0 (Ampere or newer).
# The T4 this notebook targets is 7.5, so fall back to float16 there instead
# of hard-coding bfloat16.
_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if _native_bf16 else torch.float16

# 4-bit NF4 quantization with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Only fall back to EOS when the tokenizer defines no pad token of its own.
# Unconditionally aliasing pad to EOS means any collator that masks padding by
# token id also masks the genuine end-of-turn token out of the loss, so the
# fine-tuned model may not learn when to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager",  # Avoid flash attention issues
)

print("Model loaded!")

## 6. Configure LoRA & Training

In [None]:
import torch
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# LoRA config: rank-16 adapters on every attention and MLP projection.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Pick the mixed-precision mode from the hardware instead of hard-coding bf16:
# native bfloat16 needs compute capability >= 8.0 (Ampere or newer), while the
# T4 this notebook targets is 7.5 and should train in fp16.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

# Training config
training_args = SFTConfig(
    output_dir="./qwen-coder-finetuned",

    # Hub: push a checkpoint every time one is saved locally
    push_to_hub=True,
    hub_model_id="goodknightleo/qwen-coder-7b-finetuned",
    hub_strategy="every_save",

    # Training (effective batch size = 1 * 8 accumulation steps)
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    max_seq_length=1024,

    # Memory optimization
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=use_bf16,
    fp16=not use_bf16,
    optim="paged_adamw_8bit",

    # Logging / checkpointing
    logging_steps=1,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,

    # Eval
    eval_strategy="steps",
    eval_steps=20,

    # Other
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    report_to="none",
    # NOTE(review): this assumes the "messages" column holds plain text. If it
    # holds chat-style role/content lists, confirm that the installed TRL
    # version accepts this setting -- TODO verify against the dataset.
    dataset_text_field="messages",
)

print("Configuration ready!")

## 7. Train

In [None]:
# Hand the raw model plus the LoRA config to SFTTrainer so that it performs
# the PEFT wrapping itself.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

# Sanity check: count parameters in one pass and make sure some are trainable.
trainable_params = 0
total_params = 0
for param in trainer.model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        trainable_params += n_elements

trainable_pct = 100*trainable_params/total_params
print(f"Trainable: {trainable_params:,} / {total_params:,} ({trainable_pct:.2f}%)")

if trainable_params == 0:
    raise RuntimeError("No trainable parameters! Something is wrong.")

print("\nStarting training (~20-40 min on T4)...")
print("-" * 50)

trainer.train()

## 8. Save to Hub

In [None]:
# Persist the final weights locally, then upload them to the Hub.
print("Saving model...")
trainer.save_model()

print("Pushing to Hub...")
trainer.push_to_hub()

# Completion banner with a link to the uploaded model.
banner = "=" * 50
print(f"\n{banner}")
print("TRAINING COMPLETE!")
print(banner)
print("\nModel: https://huggingface.co/goodknightleo/qwen-coder-7b-finetuned")

## 9. Test the Model

In [None]:
# Reload for clean inference (don't merge 4-bit)
import gc

from peft import PeftModel
import torch

# Clear memory.
# Names are popped from globals() instead of using `del`, so the cell can be
# re-run without a NameError; on a re-run this also drops the previously
# loaded inference model.
for _stale_name in ("trainer", "model", "base_model"):
    globals().pop(_stale_name, None)

# Objects caught in reference cycles are only released by the garbage
# collector, and empty_cache() can only return memory that is no longer
# referenced -- so collect first, then empty the CUDA cache.
gc.collect()
torch.cuda.empty_cache()

print("Loading base model for inference...")
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Attach the trained adapter saved under the training output directory.
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, "./qwen-coder-finetuned")
model.eval()

print("Ready for inference!")

In [None]:
# Smoke test: ask the fine-tuned model for a small coding task.
messages = [
    {"role": "system", "content": "You are an expert software engineer."},
    {"role": "user", "content": "Write a Python function to check if a string is a palindrome."},
]

# Render the chat as a prompt string that ends with the assistant header,
# then tokenize it and move the tensors to the model's device.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling settings for generation.
generation_kwargs = {
    "max_new_tokens": 256,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "pad_token_id": tokenizer.eos_token_id,
}

print("Generating...")
with torch.no_grad():
    outputs = model.generate(**inputs, **generation_kwargs)

# Strip the prompt tokens so only the newly generated text is decoded.
prompt_length = inputs['input_ids'].shape[1]
new_tokens = outputs[0][prompt_length:]
response = tokenizer.decode(new_tokens, skip_special_tokens=True)

banner = "=" * 50
print(f"\n{banner}")
print("Model Response:")
print(banner)
print(response)