# 🚀 CenQuery: Llama 3 8B LoRA Fine-Tuning
**Project:** Indian Census Text-to-SQL  
**Hardware:** Runs on T4 GPU (Free Colab Tier)

This notebook fine-tunes the `defog/llama-3-sqlcoder-8b` model on your custom Census dataset using QLoRA (4-bit quantization).

## 1. Install Dependencies
We need sufficiently recent versions of `peft`, `bitsandbytes`, and `transformers` (the minimum versions are given in the cell below) to run 4-bit training.

In [None]:
# Version specifiers are quoted on purpose: unquoted, the shell treats `>` as
# an output redirection, so `pkg>=1.0` would install an unconstrained `pkg`
# and write pip's output to a file named `=1.0`.
# NOTE(review): the training cell passes `dataset_text_field`,
# `max_seq_length` and `tokenizer` straight to `SFTTrainer`; newer `trl`
# releases may no longer accept these -- confirm and add an upper bound if so.
!pip install -q -U torch==2.2.1 torchvision torchaudio
!pip install -q -U "transformers>=4.40.0"
!pip install -q -U "datasets>=2.19.0"
!pip install -q -U "peft>=0.10.0"
!pip install -q -U "bitsandbytes>=0.43.0"
!pip install -q -U "trl>=0.8.6"
!pip install -q -U "accelerate>=0.29.0"
!pip install -q scipy

## 2. Upload Training Data
Upload the **`consolidated_train.jsonl`** file that your team created.

In [None]:
from google.colab import files
import os

# Make sure the training file is on disk under the exact name the training
# cell expects. The upload is skipped when the file already exists.
if not os.path.exists('consolidated_train.jsonl'):
    print("ðŸ“‚ Please upload 'consolidated_train.jsonl'...")
    uploaded = files.upload()
    if not uploaded:
        # Fail here with a clear message instead of later inside training.
        raise RuntimeError(
            "No file was uploaded; 'consolidated_train.jsonl' is required."
        )
    # Only rename when the expected file is not already among the uploads;
    # otherwise a second uploaded file would overwrite the correct one.
    if 'consolidated_train.jsonl' not in uploaded:
        if len(uploaded) > 1:
            raise ValueError(
                "Several files were uploaded and none is named "
                "'consolidated_train.jsonl'; upload only the training file."
            )
        filename = next(iter(uploaded))
        os.rename(filename, 'consolidated_train.jsonl')
        print(f"   Renamed {filename} to consolidated_train.jsonl")
else:
    print("âœ… 'consolidated_train.jsonl' found.")

## 3. Run QLoRA Training
This script loads the 8B model in 4-bit mode and fine-tunes it on your data.

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer

# --- Configuration ---
# Base model identifier passed to `from_pretrained` for model and tokenizer.
MODEL_NAME = "defog/llama-3-sqlcoder-8b"
# Local directory the trained adapter and the tokenizer are saved to.
NEW_MODEL_NAME = "llama-3-8b-census-sql-adapter"
# Training file, loaded with `load_dataset("json", ...)`.
TRAIN_DATA_PATH = "consolidated_train.jsonl"
# Passed to `TrainingArguments` as `output_dir`.
OUTPUT_DIR = "./results"

# LoRA Params (forwarded to `LoraConfig` as `r`, `lora_alpha`, `lora_dropout`)
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.05

# Training Params (forwarded to `TrainingArguments`)
NUM_EPOCHS = 1
# Per-device batch size; with GRAD_ACCUMULATION = 4 this gives
# 2 * 4 = 8 examples per optimizer step on each device.
BATCH_SIZE = 2
GRAD_ACCUMULATION = 4
LEARNING_RATE = 2e-4

def train():
    """Fine-tune ``MODEL_NAME`` with QLoRA and save the resulting adapter.

    The training data is loaded and validated first, so a missing ``text``
    column or an empty file is reported before the base model is loaded.
    The model is then loaded 4-bit quantized, LoRA adapters are attached by
    ``SFTTrainer`` via ``peft_config``, and after training the adapter and
    tokenizer are written to the ``NEW_MODEL_NAME`` directory.

    Returns:
        The ``SFTTrainer`` instance after ``trainer.train()`` has finished.

    Raises:
        ValueError: If the dataset is empty or has no ``text`` column.
    """
    print(f"ðŸš€ Initializing Training for {MODEL_NAME}...")

    # 1. Load Dataset (done first so bad data fails fast, before model load)
    dataset = load_dataset("json", data_files=TRAIN_DATA_PATH, split="train")
    if len(dataset) == 0:
        raise ValueError(f"{TRAIN_DATA_PATH} contains no training examples.")
    if "text" not in dataset.column_names:
        # The trainer below reads its input from the "text" field.
        raise ValueError(
            f"{TRAIN_DATA_PATH} must contain a 'text' field; "
            f"found columns: {dataset.column_names}"
        )
    print(f"âœ… Loaded {len(dataset)} training examples.")

    # 2. Quantization Config: 4-bit NF4 weights, float16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    # 3. Load Base Model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False
    )
    model.config.pretraining_tp = 1

    # 4. Load Tokenizer; the EOS token is reused as the padding token.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 5. LoRA Config: adapters on the attention and MLP projection layers.
    peft_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    )

    # 6. Training Arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUMULATION,
        optim="paged_adamw_32bit",
        save_steps=25,
        logging_steps=5,
        learning_rate=LEARNING_RATE,
        weight_decay=0.001,
        fp16=False,
        bf16=False,
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        group_by_length=True,
        # The plain "constant" schedule ignores warmup, which would make
        # `warmup_ratio` above a no-op; this variant honors it.
        lr_scheduler_type="constant_with_warmup",
        report_to="none"
    )

    # 7. Trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=2048,
        tokenizer=tokenizer,
        args=training_args,
        packing=False,
    )

    # 8. Train
    print("ðŸ”¥ Starting Training...")
    trainer.train()

    # 9. Save Locally (adapter and tokenizer go to the same directory)
    print(f"ðŸ’¾ Saving adapter locally to {NEW_MODEL_NAME}...")
    trainer.model.save_pretrained(NEW_MODEL_NAME)
    tokenizer.save_pretrained(NEW_MODEL_NAME)
    return trainer

# Run fine-tuning now; the trainer returned by `train()` is kept at notebook
# scope in `trainer`.
trainer = train()

## 4. Save Adapter to Google Drive
Mount your Google Drive to save the trained model permanently. This way, if Colab disconnects, you don't lose your work.

In [None]:
# `os` is imported here as well so this cell also works when it is run on its
# own, e.g. after a runtime restart.
import os
import shutil

from google.colab import drive

# 1. Mount Drive
drive.mount('/content/drive')

# 2. Define Source and Destination
SOURCE_FOLDER = "llama-3-8b-census-sql-adapter"
DESTINATION_FOLDER = "/content/drive/MyDrive/CenQuery_Adapter"

# 3. Copy files
# Check the source BEFORE removing the old backup: otherwise a missing local
# adapter would delete the Drive copy and then fail, leaving nothing behind.
if not os.path.isdir(SOURCE_FOLDER):
    raise FileNotFoundError(
        f"Adapter directory '{SOURCE_FOLDER}' not found; run the training cell first."
    )

print(f"ðŸ’¾ Copying model files to {DESTINATION_FOLDER}...")
if os.path.exists(DESTINATION_FOLDER):
    # Replace any previous backup so no stale files are left in it.
    shutil.rmtree(DESTINATION_FOLDER)
shutil.copytree(SOURCE_FOLDER, DESTINATION_FOLDER)

print("âœ… Success! Adapter saved to Google Drive.")