## Section 1: Environment Setup & Repository Clone

In [None]:
# Check GPU availability: print the NVIDIA driver / GPU status table
!nvidia-smi
# Show the current working directory and list its contents
!pwd
!ls

In [None]:
# Clone the repository
!git clone https://github.com/Japyh/llm-based-dbms.git
%cd llm-based-dbms

# Verify data files exist
!ls -lh
!ls -lh data/

In [None]:
# Install base dependencies from requirements.txt
!pip install -q -r requirements.txt

# Install fine-tuning specific packages
!pip install -q transformers datasets accelerate bitsandbytes peft trl scikit-learn

# Verify installations
import transformers
import peft
import trl
print(f"Transformers: {transformers.__version__}")
print(f"PEFT: {peft.__version__}")
print(f"TRL: {trl.__version__}")

## Section 2: Load and Prepare Dataset

In [None]:
import os
import json
import sys
from pathlib import Path
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Add the current directory to sys.path so the project's `src` package is importable
sys.path.append(".")
# BASE_HF_NL2SQL_MODEL is the model identifier passed to `from_pretrained` below
from src.config import BASE_HF_NL2SQL_MODEL

print(f"Base model for fine-tuning: {BASE_HF_NL2SQL_MODEL}")

In [None]:
# Load the chat-style NL-to-SQL dataset (JSON Lines: one JSON object per line)
dataset_path = Path("data/nl2sql_train_chat_raw.jsonl")

examples = []
with open(dataset_path, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. an extra trailing newline) are not valid JSON; skip them.
            continue
        try:
            examples.append(json.loads(line))
        except json.JSONDecodeError as err:
            # Re-raise with file/line context so a malformed record is easy to locate.
            raise ValueError(f"{dataset_path}:{line_no}: invalid JSON ({err})") from err

# Fail with a clear message instead of an IndexError on `examples[0]` below.
if not examples:
    raise ValueError(f"No examples found in {dataset_path}")

print(f"Loaded {len(examples)} examples")
print(f"\nFirst example structure:")
print(json.dumps(examples[0], indent=2))

In [None]:
# Split every conversation into a prompt and a target:
#   - prompt_messages: every message except the last one
#   - response: the content of the last message (expected to be the assistant's SQL)
processed = [
    {
        "prompt_messages": ex["messages"][:-1],
        "response": ex["messages"][-1]["content"],
    }
    for ex in examples
]

first_processed = processed[0]
print(f"Processed {len(processed)} examples")
print(f"\nSample processed example:")
print(f"Prompt: {first_processed['prompt_messages']}")
print(f"Response: {first_processed['response']}")

In [None]:
# Hold out 20% of the processed examples for validation.
# The fixed random_state keeps the split identical across runs.
train_data, val_data = train_test_split(processed, test_size=0.2, random_state=42)

print(f"Train examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")

# Wrap each split as a Hugging Face Dataset and group them in a DatasetDict
train_dataset, val_dataset = (
    Dataset.from_list(rows) for rows in (train_data, val_data)
)
dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset})

print(f"\nDataset structure: {dataset_dict}")

## Section 3: Load Tokenizer and Base Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Optional: HF token for gated models (set in Kaggle Secrets as HF_TOKEN if needed)
hf_token = os.environ.get("HF_TOKEN", None)

print(f"Loading tokenizer from {BASE_HF_NL2SQL_MODEL}...")
# `token` is the current name of the Hub authentication argument;
# `use_auth_token` is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_HF_NL2SQL_MODEL,
    token=hf_token
)

# Ensure pad_token is set (use eos_token as fallback) so padded batches can be built
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set pad_token to eos_token")

print(f"Tokenizer loaded. Vocab size: {len(tokenizer)}")

In [None]:
# Configure 4-bit quantization (QLoRA): NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print(f"Loading base model {BASE_HF_NL2SQL_MODEL} with 4-bit quantization...")
print("This may take a few minutes...")

# `token` is the current name of the Hub authentication argument;
# `use_auth_token` is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    BASE_HF_NL2SQL_MODEL,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate place the layers on the available devices
    token=hf_token
)

print("âœ“ Model loaded successfully!")
print(f"Model device map: {model.hf_device_map}")

## Section 4: Format Dataset with Chat Template

In [None]:
def format_chat_example(example):
    """
    Render one example into a single training string.

    The prompt messages are passed through the tokenizer's chat template
    (untokenized, with the generation prompt appended) and the target
    response is concatenated directly after it.

    Args:
        example: mapping with "prompt_messages" (list of chat messages)
            and "response" (target text).

    Returns:
        dict with a single "text" key holding prompt + response.
    """
    rendered_prompt = tokenizer.apply_chat_template(
        example["prompt_messages"],
        tokenize=False,
        add_generation_prompt=True,
    )
    return {"text": rendered_prompt + example["response"]}

# Format every split; drop the original columns so only "text" remains
source_columns = dataset_dict["train"].column_names
formatted_ds = dataset_dict.map(format_chat_example, remove_columns=source_columns)

print(f"Formatted dataset: {formatted_ds}")
print(f"\nSample formatted text (first 500 chars):")
sample_text = formatted_ds["train"][0]["text"]
print(sample_text[:500])

## Section 5: Configure LoRA and Training Arguments

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments

# Prepare the quantized model for k-bit training (required for QLoRA)
model = prepare_model_for_kbit_training(model)

# LoRA hyperparameters, collected in one place before building the config
lora_settings = {
    "r": 64,                                  # LoRA rank
    "lora_alpha": 16,                         # LoRA alpha (scaling factor)
    "lora_dropout": 0.1,                      # dropout applied inside LoRA layers
    "bias": "none",                           # bias terms are not trained
    "task_type": "CAUSAL_LM",                 # causal language modeling
    "target_modules": ["q_proj", "v_proj"],   # attention projections that receive adapters
}
lora_config = LoraConfig(**lora_settings)

print("LoRA Configuration:")
print(lora_config)

In [None]:
# Training hyperparameters for supervised fine-tuning (TRL SFTConfig)
from trl import SFTConfig

training_args = SFTConfig(
    output_dir="./nl2sql-mistral-lora",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size: 2 x 4 = 8
    learning_rate=1e-4,
    num_train_epochs=3,
    bf16=True,                      # Use bfloat16 mixed precision
    logging_steps=10,
    eval_strategy="steps",          # Evaluate on a step schedule (replaces deprecated evaluation_strategy)
    eval_steps=50,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",               # No external logging
    warmup_steps=10,
    optim="paged_adamw_8bit",       # Memory-efficient optimizer
    dataset_text_field="text",      # Text field name in dataset
    max_length=1024,                # Maximum sequence length (SFTConfig name; formerly max_seq_length)
)

print("Training Arguments:")
print(training_args)

In [None]:
# Initialize SFTTrainer (Supervised Fine-Tuning Trainer from TRL).
# Only SFTConfig is imported earlier in the notebook, so SFTTrainer must be
# imported here; otherwise this cell raises NameError on a fresh kernel.
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=formatted_ds["train"],
    eval_dataset=formatted_ds["validation"],
    peft_config=lora_config,   # SFTTrainer attaches the LoRA adapter to the model
    args=training_args,
)

print("âœ“ Trainer initialized successfully!")
print(f"Number of training examples: {len(formatted_ds['train'])}")
print(f"Number of validation examples: {len(formatted_ds['validation'])}")
# Rough estimate only: optimizer steps per epoch (floor) times the number of epochs
print(f"\nEstimated training steps: {len(formatted_ds['train']) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")

## Section 6: Run Training

**Note**: This will take 30-60 minutes on Kaggle T4×2, depending on your dataset size and hyperparameters.

If you encounter **GPU OOM errors**, try:
- Reduce `max_length` (the sequence-length setting in `SFTConfig`) to 512 or 768
- Reduce `per_device_train_batch_size` to 1
- Increase `gradient_accumulation_steps` to maintain effective batch size

In [None]:
# Start training: runs the full fine-tuning loop configured in `training_args`
print("Starting fine-tuning...")
print("=" * 70)

# Long-running call; progress is logged every `logging_steps` steps
trainer.train()

print("=" * 70)
print("âœ“ Training complete!")

## Section 7: Save Fine-Tuned Adapter

In [None]:
# Persist the fine-tuned LoRA adapter together with the tokenizer
output_dir = Path("nl2sql-mistral-lora")
output_dir.mkdir(exist_ok=True)

print(f"Saving fine-tuned adapter to {output_dir}...")

# Adapter weights first, then the tokenizer (needed again at inference time)
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("âœ“ Adapter and tokenizer saved!")
print(f"\nOutput directory contents:")
bytes_per_mb = 1024 * 1024
for item in sorted(output_dir.iterdir()):
    size = item.stat().st_size / bytes_per_mb  # Size in MB
    print(f"  {item.name:40s} {size:8.2f} MB")

## Section 8: Quick Smoke Test (Optional)

Test the fine-tuned model with a sample question to verify it works before downloading.

In [None]:
# Prepare model for inference
print("Preparing model for inference...")

# `model` still refers to the object returned by prepare_model_for_kbit_training.
# The PEFT-wrapped model that SFTTrainer built from `peft_config` is
# `trainer.model`; rebind so the attribute walk below (and generation) operate
# on the actual PeftModel. Without this, the walk resolves to the inner decoder
# and never finds `lm_head`.
model = trainer.model
model.eval()

# Fix dtype mismatch: convert lm_head to bfloat16 to match the compute dtype.
# Best-effort: a failure here is reported but does not abort the notebook.
try:
    if hasattr(model, 'base_model') and hasattr(model.base_model, 'model'):
        # Structure: PeftModel.base_model (LoraModel).model (causal LM with lm_head)
        base = model.base_model.model
    elif hasattr(model, 'model'):
        # Alternative: direct .model access
        base = model.model
    else:
        # Fallback: model is already the base
        base = model

    # Convert lm_head to bfloat16
    if hasattr(base, 'lm_head'):
        base.lm_head = base.lm_head.to(torch.bfloat16)
        print(f"âœ“ Converted lm_head to bfloat16 (dtype: {base.lm_head.weight.dtype})")
    else:
        print("Warning: Could not find lm_head attribute")
except Exception as e:
    print(f"Warning: Could not convert lm_head dtype: {e}")

# Release cached, unused GPU memory left over from training.
# (Gradient tracking is disabled separately, via torch.no_grad() at generation time.)
torch.cuda.empty_cache()

print("âœ“ Model ready for inference")

In [None]:
def test_nl2sql(question: str):
    """
    Generate SQL for a natural-language question with the fine-tuned model.

    Builds a system + user conversation, renders it with the tokenizer's chat
    template, samples up to 256 new tokens, and decodes only the newly
    generated part.

    Args:
        question: natural-language question to translate.

    Returns:
        The decoded completion with surrounding whitespace stripped.
    """
    system_prompt = (
        "You are a Text-to-SQL assistant for our SQLite sales database. "
        "Return only a valid SQL SELECT query, with no explanation, no comments, "
        "and no natural language. Never modify data or schema."
    )
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    # Render the conversation to text, ending with the generation prompt
    prompt_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize and place the tensors on the model's device
    encoded = tokenizer(prompt_text, return_tensors="pt", return_attention_mask=True)
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # Low-temperature nucleus sampling
    generation_kwargs = dict(
        max_new_tokens=256,
        temperature=0.1,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_kwargs,
        )

    # Drop the prompt tokens so only the completion is decoded
    prompt_length = input_ids.shape[1]
    completion = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

    return completion.strip()

print("Testing fine-tuned model...\n")

In [None]:
# Smoke-test the model on a handful of representative questions
test_questions = [
    "Show me the top 10 customers by total sales.",
    "How many products are in each product line?",
    "What is the total revenue?",
    "List all customers from France."
]

separator = "-" * 70
for i, question in enumerate(test_questions, start=1):
    print(f"Test {i}: {question}")
    sql = test_nl2sql(question)
    print(f"SQL: {sql}")
    print(separator)

## Next Steps

1. **Download the adapter**: In Kaggle, go to Output → Download `nl2sql-mistral-lora/`
2. **Place in your repo**: Put the downloaded folder at `models/nl2sql-mistral-lora/`
3. **Implement LocalHFLLMProvider**: Complete the TODO sections in `src/llm/provider.py`
4. **Run evaluation**: Use `notebooks/04-kaggle-eval-nl2sql.ipynb` to validate performance
5. **Integrate**: Wire up `NL2SQLEngine` to use `LocalHFLLMProvider`

**Congratulations!** 🎉 You've successfully fine-tuned a specialized NL→SQL model!