In [None]:
import json
import os
import time

import evaluate
import numpy as np
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model
# import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Pick the best available accelerator instead of assuming Apple silicon:
# MPS first (the original target), then CUDA, and finally plain CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Hub identifier of the base checkpoint used for both the tokenizer and the model.
# The commented-out line is an alternative checkpoint.
# model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "meta-llama/Llama-3.2-1B-Instruct"

In [None]:
# Load the tokenizer for `model_id` with right-side padding; tokenize_fn masks the
# first `prompt_len` positions of each row, which relies on the prompt starting at index 0.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the pretrained causal-LM weights for `model_id` and place them on `device`.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map = device)

In [None]:
# LoRA adapter settings: rank-8 adapters (alpha 16, dropout 0.1) attached to the
# four attention projection layers named in `target_modules`.
lora_config = LoraConfig(
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapters, then report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [None]:
def dataset_creation(n: int) -> Dataset:
    """Load the text-to-SQL training split and keep its first ``n`` rows.

    Args:
        n: Maximum number of rows to keep. Values larger than the split are
            clamped to the split size instead of raising an index error.

    Returns:
        A ``Dataset`` holding only the ``sql_prompt``, ``sql_context`` and
        ``sql`` columns for the first ``min(n, len(split))`` training rows.
    """
    columns_to_keep = ['sql_prompt', 'sql_context', 'sql']

    # Only the train split is used, so request it directly rather than
    # materialising every split of the dataset.
    dataset = load_dataset("gretelai/synthetic_text_to_sql", split="train")

    # Drop every column the fine-tuning pipeline does not read.
    dataset = dataset.remove_columns(
        [col for col in dataset.column_names if col not in columns_to_keep]
    )

    # Clamp so that asking for more rows than exist returns the whole split.
    return dataset.select(range(min(n, len(dataset))))

In [None]:
def tokenize_fn(batch):
    """Tokenize a batch of text-to-SQL rows into causal-LM training features.

    Each row becomes ``chat prompt + JSON answer + EOS``, padded/truncated to
    512 tokens. Only the answer tokens contribute to the loss: prompt positions
    and padding positions are set to ``-100`` in ``labels``.

    Args:
        batch: Mapping with parallel lists under ``"sql_prompt"``,
            ``"sql_context"`` and ``"sql"`` (as supplied by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors,
        one row per input example.
    """
    max_length = 512
    prompts = []
    sql_outputs = []

    for sql_prompt, sql_context, sql in zip(batch["sql_prompt"], batch["sql_context"], batch["sql"]):
        # Build the prompt using chat format
        prompt = tokenizer.apply_chat_template(
            [
                {"role": "system", "content": "You are a helpful assistant that converts SQL prompts into SQL queries."},
                {"role": "user", "content": json.dumps({
                    "sql_prompt": sql_prompt,
                    "sql_context": sql_context
                })}
            ],
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)

        # Serialise the target with json.dumps so quotes, backslashes and newlines
        # inside the SQL are escaped and the target is always valid JSON.
        sql_outputs.append(json.dumps([{"sql": sql.strip()}]) + tokenizer.eos_token)

    # Full sequence = prompt + sql_output
    full_texts = [p + o for p, o in zip(prompts, sql_outputs)]

    # If the chat template already rendered the BOS token, do not let the
    # tokenizer prepend a second one; otherwise keep the default special tokens.
    bos_token = tokenizer.bos_token
    template_has_bos = bool(bos_token) and all(p.startswith(bos_token) for p in prompts)
    add_special_tokens = not template_has_bos

    # Tokenize full sequences
    full_encodings = tokenizer(
        full_texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        add_special_tokens=add_special_tokens,
        return_tensors="pt"
    )

    input_ids = full_encodings["input_ids"]
    attention_mask = full_encodings["attention_mask"]

    # Prompt lengths, tokenized with the same settings as the full sequences so
    # the prefix lengths line up with `input_ids`.
    prompt_lengths = [
        len(ids)
        for ids in tokenizer(
            prompts,
            truncation=True,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
        )["input_ids"]
    ]

    labels = input_ids.clone()
    for i, prompt_len in enumerate(prompt_lengths):
        labels[i, :prompt_len] = -100  # Mask prompt tokens

    # Mask padding as well. The pad token may be the EOS token, so the attention
    # mask (not the token id) is what separates the real EOS from padding.
    labels[attention_mask == 0] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


In [None]:
# Hold out 5% of the 5000 rows for evaluation. The seed is fixed (same value as the
# training seed) so the train/eval membership is identical on every run.
df = dataset_creation(5000).train_test_split(test_size=0.05, seed=42)

In [None]:
# Unpack the two halves of the split into their own names.
train_df, eval_df = df["train"], df["test"]

In [None]:
# NOTE(review): alias of the training split; no other visible cell reads `batch`,
# so this looks like leftover debugging state — confirm before removing.
batch = train_df

In [None]:
# Run the batched tokenizer over the training split first, then the eval split.
train_df, eval_df = (
    split.map(tokenize_fn, batched=True) for split in (train_df, eval_df)
)

Map:   0%|          | 0/4750 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# print(tokenizer.special_tokens_map)

In [None]:

import numpy as np
import evaluate

# Load BLEU metric once
bleu_metric = evaluate.load("bleu")


def preprocess_logits_for_metrics(logits, labels):
    """Reduce per-batch logits to greedy token ids.

    Intended to be passed as ``Trainer(preprocess_logits_for_metrics=...)`` so
    evaluation keeps ``(batch, seq)`` token ids instead of full
    ``(batch, seq, vocab)`` logits. ``compute_metrics`` accepts either form.

    Args:
        logits: Model logits tensor, or a tuple whose first item is the logits.
        labels: Unused; present because the Trainer hook passes it.

    Returns:
        Tensor of arg-max token ids over the last (vocabulary) axis.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute BLEU and exact match on the answer part of each sequence.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be raw
            logits ``(batch, seq, vocab)`` or already-argmaxed token ids
            ``(batch, seq)``. ``labels`` uses ``-100`` for ignored positions.

    Returns:
        Dict with ``"bleu"`` and ``"exact_match"``, both as percentages rounded
        to two decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Predicted token IDs (greedy); skipped when ids were already computed upstream.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # A causal LM's output at position t predicts token t + 1, so line the
    # prediction at t up with the label at t + 1 before comparing.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    # Score only real target positions: drop ignored labels (-100) and padding.
    # Pad tokens are skipped when decoding anyway, so nothing is lost on the
    # label side, and prompt/padding predictions no longer leak into the text.
    pad_id = tokenizer.pad_token_id
    target_mask = (labels != -100) & (labels != pad_id)
    predictions = np.where(target_mask, predictions, pad_id)
    labels = np.where(target_mask, labels, pad_id)

    # Decode and clean whitespace
    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # BLEU requires list-of-references format. The metric divides by the text
    # lengths, so an all-empty prediction or reference set is scored as 0.
    try:
        bleu_score = bleu_metric.compute(
            predictions=decoded_preds,
            references=[[label] for label in decoded_labels]
        )["bleu"]
    except ZeroDivisionError:
        bleu_score = 0.0

    # Exact Match
    exact_match = np.mean([
        int(pred == label)
        for pred, label in zip(decoded_preds, decoded_labels)
    ])

    return {
        "bleu": round(bleu_score * 100, 2),              # BLEU score %
        "exact_match": round(exact_match * 100, 2)       # EM score %
    }




In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="sql_generator_improved",
    seed=42,

    # Optimisation: batches of 2 per device, accumulated over 8 steps
    num_train_epochs=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=False,
    learning_rate=2e-4,
    warmup_steps=50,
    weight_decay=0.01,

    # Precision and data loading: both mixed-precision modes are switched off
    fp16=False,
    bf16=False,
    dataloader_pin_memory=False,

    # Evaluate and checkpoint on the same 50-step cadence, keeping 10 checkpoints
    eval_strategy="steps",
    eval_steps=50,
    per_device_eval_batch_size=2,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=10,
    label_names=["labels"],

    # Model selection: reload the checkpoint with the highest "exact_match",
    # one of the keys returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
    greater_is_better=True,

    # Logging: every 10 steps, written for TensorBoard under ./logs
    logging_strategy="steps",
    logging_steps=10,
    logging_dir="./logs",
    report_to="tensorboard",
)


In [None]:
# Collator that assembles tokenized rows into batches using the tokenizer's padding.
# NOTE(review): presumably this pads only tokenizer inputs, not "labels"; it appears to
# work here because tokenize_fn already pads every row to max_length=512 — confirm
# before switching tokenize_fn to dynamic padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Bring the LoRA-wrapped model, the tokenized splits, the collator and the metric
# function together in one Trainer.
# NOTE(review): no `preprocess_logits_for_metrics` hook is passed, so evaluation
# presumably gathers full (batch, seq, vocab) logits for the whole eval split before
# compute_metrics runs — a likely source of out-of-memory errors; confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_df,
    eval_dataset=eval_df,
    compute_metrics=compute_metrics,
)

In [None]:
# trainer.evaluate()

In [None]:
# trainer.train()

Step,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 25.74 GB, other allocations: 1.10 GB, max allowed: 45.90 GB). Tried to allocate 19.57 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Train with automatic restarts: a RuntimeError (e.g. the backend running out of
# memory) triggers a retry that resumes from the newest checkpoint, if one exists.
max_retries = 55
retry_delay_seconds = 10
# NOTE(review): absolute, user-specific path whose name mentions "tinyllama" although
# `model_id` selects a different checkpoint — consider a relative, configurable directory.
final_model_dir = "/Users/mann.kurani.ctr/Desktop/Personal_Code/SQL Finetune/models/finetuned_final_tinyllama"

for attempt in range(max_retries):
    try:
        # Resume only when a checkpoint is actually on disk. Passing
        # resume_from_checkpoint=True on a fresh run raises ValueError, which the
        # RuntimeError handler below would not catch.
        output_dir = trainer.args.output_dir
        last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
        trainer.train(resume_from_checkpoint=last_checkpoint)
        trainer.model.save_pretrained(final_model_dir)
        print("Training complete. Model saved.")
        break

    except RuntimeError as e:
        print(f"[Attempt {attempt + 1}] RuntimeError during training: {e}")
        print("Retrying after short delay...")
        # Release cached accelerator memory before waiting so the next attempt
        # starts from as clean a state as possible; skipped when MPS is absent.
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()
        time.sleep(retry_delay_seconds)
else:
    # Reached only when every attempt raised and the loop never hit `break`.
    print("Training failed after maximum retries.")