In [22]:
!pip install datasets trl bitsandbytes transformers peft

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, PeftModel
import torch

# Initialize model and tokenizer
checkpoint = "facebook/bart-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

def create_prompt(question, context):
    return f"""You are a SQL expert, given the Schema, generate a SQL query for the question asked
Schema: {context}
Question: {question}"""

# Test initial model
question = "How many heads of the departments are older than 56 ?"
context = "CREATE TABLE head (age INTEGER)"

# Tokenize input
inputs = tokenizer(
    create_prompt(question, context),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
).to(device)

# Generate output
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=128,
    temperature=0.2,
    top_p=0.9,
    do_sample=True
)

# Decode output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Initial model output:", response)



Initial model output: You are a SQL expert, given the Schema, generate a SQL query for the question askedSchema: CREATE TABLE head (age INTEGER)Question: How many heads of the departments are older than 56 ?


In [23]:
# Load and prepare dataset
dataset_id = "b-mc2/sql-create-context"
data = load_dataset(dataset_id, split="train")
df = data.to_pandas()

# Prepare training data
def prepare_training_features(examples):
    inputs = [create_prompt(q, c) for q, c in zip(examples["question"], examples["context"])]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True
    )

    # Tokenize targets
    labels = tokenizer(
        examples["answer"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert dataset to features
formatted_data = Dataset.from_pandas(df)
formatted_data = formatted_data.map(
    prepare_training_features,
    batched=True,
    remove_columns=formatted_data.column_names
)



Map:   0%|          | 0/78577 [00:00<?, ? examples/s]

In [24]:
# Configure LoRA
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj", "v_proj"]  # BART-specific attention modules
)

# Apply LoRA to model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="bart-base-sqllm-v1",
    per_device_train_batch_size=6,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=10,
    max_steps=500,
    fp16=True,
    predict_with_generate=True
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_data,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the final model
trainer.save_model("bart-base-sqllm-v1-final")

# Load and test the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
peft_model = PeftModel.from_pretrained(
    model,
    "bart-base-sqllm-v1-final",
    device_map="auto"
)
model = peft_model.merge_and_unload()



  trainer = Seq2SeqTrainer(
max_steps is given, it will override any value given in num_train_epochs


trainable params: 4,718,592 || all params: 411,010,048 || trainable%: 1.1480


Step,Training Loss
10,13.9859
20,11.7831
30,10.5761
40,9.5535
50,8.7485
60,7.7877
70,7.0724
80,6.69
90,6.466
100,6.3918


In [25]:
# List to store generated SQL queries
generated_sqls = []

# Test cases
test_cases = [
    {
        "question": "How many heads of the departments are older than 56 ?",
        "context": "CREATE TABLE head (age INTEGER)"
    },
    {
        "question": "List the name, born state and age of the heads of departments ordered by age.",
        "context": "CREATE TABLE head (name VARCHAR, born_state VARCHAR, age VARCHAR)"
    },
    {
        "question": "What are the themes of farm competitions sorted by year in ascending order?",
        "context": "CREATE TABLE farm_competition (Theme VARCHAR, YEAR VARCHAR)"
    },
    {
        "question": "What are the maximum and minimum number of cows across all farms.",
        "context": "CREATE TABLE farm (Cows INTEGER)"
    },
    {
        "question": "How many different statuses do cities have?",
        "context": "CREATE TABLE city (Status VARCHAR)"
    }
]

# Iterate through the test cases and generate SQL queries
for test in test_cases:
    prompt = create_prompt(test["question"], test["context"])
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=128)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Store the generated SQL queries in the list
    generated_sqls.append(result)

# Print the list of generated SQL queries
print(generated_sqls)


['SELECT head FROM head WHERE age > 56', 'SELECT name FROM head WHERE born_state = "state" AND age = "age"', 'SELECT SUM(Theme) FROM farm_competition WHERE YEAR = "2017"', 'SELECT MAX(Cows) FROM farm', 'SELECT status FROM city WHERE status = "city"']


In [26]:
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import accuracy_score

In [27]:
# Function to calculate BLEU score
def calculate_bleu_score(predictions, references):
    if not predictions or not references:
        return 0.0  # Return 0 if either list is empty
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        # Tokenize the predictions and references (split by spaces)
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        bleu_score = sentence_bleu([ref_tokens], pred_tokens)
        bleu_scores.append(bleu_score)
    return sum(bleu_scores) / len(bleu_scores)


In [28]:
reference_sqls = [
    "SELECT COUNT(*) FROM head WHERE age > 56",
    "SELECT name, born_state, age FROM head ORDER BY age",
    "SELECT Theme FROM farm_competition ORDER BY YEAR",
    "SELECT MAX(Cows), MIN(Cows) FROM farm",
    "SELECT COUNT(DISTINCT Status) FROM city"
]

# Now, calculate BLEU score and Exact Match
bleu = calculate_bleu_score(generated_sqls, reference_sqls)

print(f"BLEU score: {bleu:.4f}")

BLEU score: 0.1414


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
