In [None]:
from google.colab import drive
drive.mount('/content/drive')


MessageError: Error: credential propagation was unsuccessful

In [None]:
#model loading
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Path to your model folder in Google Drive
model_path = '/content/drive/MyDrive/TextToSQL_Project/t5-spider-finetuned-final'

# Load model and tokenizer from the saved folder
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=model_path, local_files_only=True)
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=model_path, local_files_only=True)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

The previous attempts to load the model using `from_pretrained` with a local path resulted in an error because the method still attempted to validate the path as a Hugging Face repository ID.

To reliably load the model and tokenizer from the local Google Drive folder, we will load the model configuration and state dictionary separately. This approach is more explicit for local loading and avoids the repository ID validation issue.

In [None]:
def generate_sql(question):
    """Translate an English question into SQL with the fine-tuned T5 model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        question: The natural-language question to translate.

    Returns:
        The first decoded beam-search result, with special tokens removed.
    """
    prompt = "translate English to SQL: " + question
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # Beam search with four beams; the generated sequence is capped at 256 tokens.
    generation_options = dict(max_length=256, num_beams=4, early_stopping=True)
    generated = model.generate(encoded.input_ids, **generation_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage
print(generate_sql("List all customers with orders above 1000"))


In [None]:
!pip install nltk rouge_score scikit-learn --quiet


In [None]:
from datasets import load_dataset
from tqdm import tqdm

# Load the Spider validation split
spider_dataset = load_dataset('spider')
val_questions = spider_dataset['validation']['question']
val_sql_references = spider_dataset['validation']['query']

# Generate predictions for entire validation set
predictions = []
for question in tqdm(val_questions, desc="Generating SQL predictions"):
    pred_sql = generate_sql(question)
    predictions.append(pred_sql)


In [None]:
# Explore Database Domains
# Analyze database diversity
from collections import Counter
def explore_database_domains(dataset_split):
    """Count examples per database and group the questions by database.

    Args:
        dataset_split: Iterable of examples, each providing ``'db_id'`` and
            ``'question'`` keys.

    Returns:
        A ``(db_domains, db_questions)`` tuple: a ``Counter`` mapping each
        ``db_id`` to its number of examples, and a dict mapping each ``db_id``
        to the list of its questions in iteration order.
    """
    questions_by_db = {}
    for row in dataset_split:
        # setdefault creates the per-database list the first time a db_id is seen.
        questions_by_db.setdefault(row['db_id'], []).append(row['question'])

    # One question is stored per example, so the list lengths are the counts.
    counts = Counter({db: len(qs) for db, qs in questions_by_db.items()})
    return counts, questions_by_db

db_stats, db_questions = explore_database_domains(spider_dataset['train'])

print(f"\n=== DATABASE DOMAIN ANALYSIS ===")
print(f"Total unique databases: {len(db_stats)}")
print(f"Average questions per database: {sum(db_stats.values()) / len(db_stats):.1f}")

# Show top 10 databases by question count
print("\nTop 10 databases by question count:")
for db_id, count in db_stats.most_common(10):
    print(f"- {db_id}: {count} questions")


In [None]:
#Examine Query Complexity Distribution
import pandas as pd
from collections import Counter

# Analyze SQL complexity patterns
def analyze_sql_complexity(dataset_split):
    """Count how many queries in a split use each SQL complexity feature.

    Keywords are matched case-insensitively as whole words, so identifiers
    that merely contain a keyword (``join_date``, ``reunion``) are not counted,
    and ``GROUP BY`` / ``ORDER BY`` match regardless of how much whitespace
    separates the two words. Keywords that appear inside string literals are
    still counted, because the query text is not parsed.

    Args:
        dataset_split: Iterable of examples, each providing a ``'query'`` key
            holding the SQL text.

    Returns:
        Dict with keys ``'joins'``, ``'nested_queries'``, ``'group_by'``,
        ``'order_by'``, ``'having'`` and ``'union'``, each mapped to the number
        of queries exhibiting that feature.
    """
    import re  # local import: the notebook has no single shared import cell

    complexity_analysis = {
        'joins': 0,
        'nested_queries': 0,
        'group_by': 0,
        'order_by': 0,
        'having': 0,
        'union': 0
    }

    # One whole-word pattern per feature that is detected by keyword presence.
    keyword_patterns = {
        'joins': re.compile(r'\bJOIN\b'),
        'group_by': re.compile(r'\bGROUP\s+BY\b'),
        'order_by': re.compile(r'\bORDER\s+BY\b'),
        'having': re.compile(r'\bHAVING\b'),
        'union': re.compile(r'\bUNION\b'),
    }
    select_pattern = re.compile(r'\bSELECT\b')

    for example in dataset_split:
        sql = example['query'].upper()

        for feature, pattern in keyword_patterns.items():
            if pattern.search(sql):
                complexity_analysis[feature] += 1

        # More than one SELECT is treated as nesting. This also counts set
        # operations (e.g. "SELECT ... UNION SELECT ..."), as the original did.
        if len(select_pattern.findall(sql)) > 1:
            complexity_analysis['nested_queries'] += 1

    return complexity_analysis

# Analyze training set complexity
train_complexity = analyze_sql_complexity(spider_dataset['train'])
print("\n=== SQL COMPLEXITY ANALYSIS ===")
for feature, count in train_complexity.items():
    percentage = (count / len(spider_dataset['train'])) * 100
    print(f"{feature.replace('_', ' ').title()}: {count} queries ({percentage:.1f}%)")


# Evaluate Model Performance Using *Metrics*

In [None]:
# Install necessary packages if not installed:
# !pip install nltk rouge_score scikit-learn --quiet

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# Prepare references and predictions
references = [sql.strip().lower() for sql in val_sql_references]
preds = [pred.strip().lower() for pred in predictions]

# BLEU score (sentence-level with smoothing)
smoothie = SmoothingFunction().method4
bleu_scores = [
    sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
    for ref, pred in zip(references, preds)
]
print(f"Average BLEU score: {sum(bleu_scores) / len(bleu_scores):.4f}")

# ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1 = rouge2 = rougeL = 0
for ref, pred in zip(references, preds):
    scores = scorer.score(ref, pred)
    rouge1 += scores['rouge1'].fmeasure
    rouge2 += scores['rouge2'].fmeasure
    rougeL += scores['rougeL'].fmeasure
n = len(preds)
print(f"Avg ROUGE-1 F1: {rouge1 / n:.4f}")
print(f"Avg ROUGE-2 F1: {rouge2 / n:.4f}")
print(f"Avg ROUGE-L F1: {rougeL / n:.4f}")

# Token-level F1 score (macro)
token_set = set()
for ref, pred in zip(references, preds):
    token_set.update(ref.split())
    token_set.update(pred.split())

mlb = MultiLabelBinarizer(classes=list(token_set))
refs_bin = mlb.fit_transform([set(r.split()) for r in references])
preds_bin = mlb.transform([set(p.split()) for p in preds])
f1 = f1_score(refs_bin, preds_bin, average='macro')
print(f"Macro token-level F1 score: {f1:.4f}")

# Exact Match Accuracy
exact_matches = [ref == pred for ref, pred in zip(references, preds)]
print(f"Exact Match Accuracy: {sum(exact_matches) / len(exact_matches):.4f}")


In [None]:
import pandas as pd

df_results = pd.DataFrame({
    'question': val_questions,
    'reference_sql': val_sql_references,
    'predicted_sql': predictions
})

df_results.to_csv('validation_predictions.csv', index=False)
print("Saved predictions to validation_predictions.csv")


Step 1: Setup and Prepare Your Data for Retrieval


In [None]:
# Example documents to index for retrieval
documents = [
    "Customer table contains id, name, address, and order history details.",
    "Orders table has order id, customer id, product id, quantity, and price.",
    "Products table lists product id, name, category, and stock level."
]


Step 2: Create Document Embeddings with Sentence Transformers


In [None]:
!pip install sentence-transformers faiss-cpu --quiet

from sentence_transformers import SentenceTransformer
import numpy as np

embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and accurate embedding model

# Compute embeddings for your knowledge base documents
doc_embeddings = embedder.encode(documents, convert_to_numpy=True)


Step 3: Build a FAISS Index for Efficient Vector Search

In [None]:
import faiss

embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance index
index.add(doc_embeddings)

print(f"Indexed {index.ntotal} documents")


Step 4: Define Retrieval Function

In [None]:
def retrieve(query, k=2):
    """Return up to ``k`` documents nearest to ``query`` in the FAISS index.

    Uses the notebook-level ``embedder``, ``index`` and ``documents``.

    Args:
        query: Text to embed and search for.
        k: Maximum number of documents to return.

    Returns:
        List of matching entries from ``documents``, nearest first. It holds
        fewer than ``k`` items when the index is smaller than ``k`` and is
        empty when ``k`` is not positive or the index is empty.
    """
    # FAISS fills missing neighbours with index -1, and documents[-1] would
    # silently return the last document. Cap k and drop any padding instead.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)
    return [documents[idx] for idx in indices[0] if idx >= 0]


Step 5: Load Your Fine-Tuned T5 Model and Tokenizer for Generation

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = '/content/drive/MyDrive/TextToSQL_Project/t5-spider-finetuned-final'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(model_path, local_files_only=True)


Step 6: Combine Retriever and Generator in RAG Pipeline

In [None]:
def rag_generate_sql(question, k=1, max_gen_len=150):
    """Generate SQL for ``question`` using retrieved documents as extra context.

    Uses the notebook-level ``retrieve``, ``tokenizer``, ``model`` and ``device``.

    Args:
        question: The natural-language question to translate.
        k: Number of documents to retrieve for the prompt.
        max_gen_len: Maximum length of the generated sequence, in tokens.

    Returns:
        The first decoded beam-search result, with special tokens removed.
    """
    context_docs = retrieve(question, k=k)
    print("Retrieved docs:", context_docs)

    context_text = " ; ".join(context_docs)
    input_text = f"translate English to SQL: {question} Context: {context_text} ###"

    # A single prompt needs no padding; padding it to 512 tokens only makes the
    # encoder process pad positions. Long prompts are still truncated at 512.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = model.generate(
        input_ids=inputs.input_ids,
        # Pass the mask explicitly instead of relying on generate() to infer it.
        attention_mask=inputs.attention_mask,
        max_length=max_gen_len,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )
    generated_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_sql


Step 7: Test Your RAG Pipeline

In [None]:
sample_question = "List all customers"
print("Generated SQL:")
print(rag_generate_sql(sample_question))


In [None]:
sample_question = "List all customers with orders above 1000"
print("Generated SQL:")
print(rag_generate_sql(sample_question))


In [None]:
sample_question = "Give me name of the department"
print("Generated SQL:")
print(rag_generate_sql(sample_question))


Step 8: Evaluate the RAG System

In [None]:
from datasets import load_dataset
from tqdm import tqdm

# Load validation data questions and references
spider_dataset = load_dataset('spider')
val_questions = spider_dataset['validation']['question']

val_references = spider_dataset['validation']['query']

predictions = []
for question in tqdm(val_questions, desc="Generating RAG SQL predictions"):
    pred_sql = rag_generate_sql(question)
    predictions.append(pred_sql)



Evaluation metrics for the RAG pipeline

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# Prepare references and predictions (lowercase and stripped)
references = [sql.strip().lower() for sql in val_references]
preds = [pred.strip().lower() for pred in predictions]

# BLEU Score with smoothing
smoothie = SmoothingFunction().method4
bleu_scores = [
    sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
    for ref, pred in zip(references, preds)
]
print(f"Average BLEU score: {sum(bleu_scores) / len(bleu_scores):.4f}")

# ROUGE Scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1 = rouge2 = rougeL = 0
for ref, pred in zip(references, preds):
    scores = scorer.score(ref, pred)
    rouge1 += scores['rouge1'].fmeasure
    rouge2 += scores['rouge2'].fmeasure
    rougeL += scores['rougeL'].fmeasure
n = len(preds)
print(f"Avg ROUGE-1 F1: {rouge1 / n:.4f}")
print(f"Avg ROUGE-2 F1: {rouge2 / n:.4f}")
print(f"Avg ROUGE-L F1: {rougeL / n:.4f}")

# Token-level F1 Score
all_ref_tokens = [ref.split() for ref in references]
all_pred_tokens = [pred.split() for pred in preds]
# Build the vocabulary with one set union; sum(list_of_lists, []) re-copies the
# growing list on every addition and is quadratic in the number of queries.
tokens = set().union(*all_ref_tokens, *all_pred_tokens)
# Sorted classes give the binarized matrices a reproducible column order.
mlb = MultiLabelBinarizer(classes=sorted(tokens))
refs_bin = mlb.fit_transform([set(r) for r in all_ref_tokens])
preds_bin = mlb.transform([set(p) for p in all_pred_tokens])
# Macro average: F1 is computed per token (column) and then averaged.
f1 = f1_score(refs_bin, preds_bin, average='macro')
print(f"Macro token-level F1 score: {f1:.4f}")

# Exact Match Accuracy
exact_matches = [ref == pred for ref, pred in zip(references, preds)]
print(f"Exact Match Accuracy: {sum(exact_matches) / len(exact_matches):.4f}")


In [None]:
import pandas as pd

df_results = pd.DataFrame({
    'question': val_questions,
    'reference_sql': val_references,
    'predicted_sql': predictions
})

df_results.to_csv('rag_validation_predictions.csv', index=False)
print("Saved validation predictions to rag_validation_predictions.csv")


# Install all required packages
!pip install transformers datasets sentence-transformers faiss-cpu rouge_score nltk scikit-learn tqdm --quiet

import os
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('punkt')

# --- Set device ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Step 1: Load fine-tuned T5 model and tokenizer ---

model_path = '/content/drive/MyDrive/TextToSQL_Project/t5-spider-finetuned-final'  # CHANGE to your model directory
model = T5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
tokenizer = T5Tokenizer.from_pretrained(model_path, local_files_only=True)
model.to(device)
print("Loaded fine-tuned T5 model and tokenizer")

# --- Step 2: Prepare retrieval knowledge base and build improved retriever ---

# Replace this list with your actual schema/docs or SQL examples as retrieval corpus
documents = [
    "Customer table with id, name, address, and order details.",
    "Orders table contains order_id, customer_id, product_id, quantity, price.",
    "Products table contains product_id, name, category, and stock quantity.",
    "Customers who have orders greater than 1000.",
    "Product categories include electronics, furniture, clothing, and toys."
]

print("Loading Sentence Transformer embedder...")
embedder = SentenceTransformer('all-mpnet-base-v2')  # Stronger embedding model

print("Computing document embeddings...")
doc_embeddings = embedder.encode(documents, convert_to_numpy=True)

embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)
print(f"FAISS index built with {index.ntotal} documents for retrieval.")

# --- Step 3: Retrieval function ---

def retrieve(query, k=2):
    """Return up to ``k`` documents nearest to ``query`` in the FAISS index.

    Uses the notebook-level ``embedder``, ``index`` and ``documents``.

    Args:
        query: Text to embed and search for.
        k: Maximum number of documents to return.

    Returns:
        List of matching entries from ``documents``, nearest first. It holds
        fewer than ``k`` items when the index is smaller than ``k`` and is
        empty when ``k`` is not positive or the index is empty.
    """
    # FAISS fills missing neighbours with index -1, and documents[-1] would
    # silently return the last document. Cap k and drop any padding instead.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)
    retrieved = [documents[i] for i in indices[0] if i >= 0]
    return retrieved

# --- Step 4: Further fine-tune the T5 model ---

# Load Spider dataset for training & validation
spider_dataset = load_dataset('spider')

# Preprocessing function (adjust if needed)
max_input_length = 512
max_target_length = 256

def preprocess_function(examples):
    """Tokenize a batch of Spider examples into model inputs and labels.

    Uses the notebook-level ``tokenizer``, ``max_input_length`` and
    ``max_target_length``.

    Args:
        examples: Batch dict with ``"question"`` and ``"query"`` lists, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized inputs with an added ``"labels"`` entry holding the
        target token ids, where padding positions are replaced by -100.
    """
    inputs = ["translate English to SQL: " + q.strip() for q in examples["question"]]
    targets = [sql.strip() for sql in examples["query"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    # text_target replaces the deprecated as_target_tokenizer() context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding="max_length")

    # Targets are padded to a fixed length. Label positions set to -100 are
    # ignored by the loss, so padding does not contribute to training.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

print("Tokenizing training and validation data...")
tokenized_train = spider_dataset["train"].map(preprocess_function, batched=True)
tokenized_validation = spider_dataset["validation"].map(preprocess_function, batched=True)

tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_validation.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


# Settings for the continued fine-tuning run.
training_args = TrainingArguments(
    output_dir="./t5-finetuned-continued",
    num_train_epochs=3,              # Increase epochs for further fine-tuning
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,              # Lower LR for stable tuning
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,                  # Same interval as eval_steps so the best checkpoint can be reloaded
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_steps=100,
    # Mixed precision requires a GPU; fall back to full precision on CPU.
    # NOTE(review): T5 checkpoints are reported to be numerically fragile in
    # fp16 - watch for NaN losses and consider bf16 where supported.
    fp16=torch.cuda.is_available(),
    # The string "none" disables logging integrations; None does not.
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
)

print("Starting continued fine-tuning...")
trainer.train()
trainer.save_model('./t5-finetuned-continued')
tokenizer.save_pretrained('./t5-finetuned-continued')
save_path = '/content/drive/MyDrive/t5-finetuned-continued'

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Fine-tuning complete and model saved.")

model.save_pretrained("/content/drive/MyDrive/TextToSQL_Project/t5-finetuned-continued")
tokenizer.save_pretrained("/content/drive/MyDrive/TextToSQL_Project/t5-finetuned-continued")


# --- Step 5: Retrieval-Augmented Generation (RAG) function with improved prompt and generation controls ---

def rag_generate_sql(question, k=2, max_gen_len=150):
    """Generate SQL for ``question`` using retrieved documents as extra context.

    Uses the notebook-level ``retrieve``, ``tokenizer``, ``model`` and ``device``.

    Args:
        question: The natural-language question to translate.
        k: Number of documents to retrieve for the prompt.
        max_gen_len: Maximum length of the generated sequence, in tokens.

    Returns:
        The first decoded beam-search result, with special tokens removed.
    """
    retrieved_docs = retrieve(question, k=k)
    # Report how many documents were actually returned rather than how many were requested.
    print(f"Retrieved {len(retrieved_docs)} documents for query: {question}\n- " + "\n- ".join(retrieved_docs))

    context_text = " ; ".join(retrieved_docs)
    input_text = f"translate English to SQL: {question} Context: {context_text} ###"

    # A single prompt needs no padding; padding it to 512 tokens only makes the
    # encoder process pad positions. Long prompts are still truncated at 512.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)

    output_ids = model.generate(
        input_ids=inputs.input_ids,
        # Pass the mask explicitly instead of relying on generate() to infer it.
        attention_mask=inputs.attention_mask,
        max_length=max_gen_len,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        length_penalty=1.0,
    )

    generated_sql = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_sql

# --- Step 6: Generate predictions on Spider validation set ---

val_questions = spider_dataset['validation']['question']
val_references = spider_dataset['validation']['query']

print("Generating RAG-enhanced SQL predictions for validation set (this will take some time)...")
predictions = []
for q in tqdm(val_questions):
    pred_sql = rag_generate_sql(q, k=2, max_gen_len=150)
    predictions.append(pred_sql)

# --- Step 7: Evaluate the predictions ---

# Prepare for evaluation
references = [r.strip().lower() for r in val_references]
preds = [p.strip().lower() for p in predictions]

# BLEU with smoothing
smoothie = SmoothingFunction().method4
bleu_scores = [
    sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
    for ref, pred in zip(references, preds)
]
avg_bleu = sum(bleu_scores) / len(bleu_scores)

# ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_sum = rouge2_sum = rougeL_sum = 0
for ref, pred in zip(references, preds):
    score = scorer.score(ref, pred)
    rouge1_sum += score['rouge1'].fmeasure
    rouge2_sum += score['rouge2'].fmeasure
    rougeL_sum += score['rougeL'].fmeasure
n = len(preds)
avg_rouge1 = rouge1_sum / n
avg_rouge2 = rouge2_sum / n
avg_rougeL = rougeL_sum / n

# Token-level Macro F1
ref_tokens = [ref.split() for ref in references]
pred_tokens = [pred.split() for pred in preds]
# Build the vocabulary with one set union; sum(list_of_lists, []) re-copies the
# growing list on every addition and is quadratic in the number of queries.
token_set = set().union(*ref_tokens, *pred_tokens)
# Sorted classes give the binarized matrices a reproducible column order.
mlb = MultiLabelBinarizer(classes=sorted(token_set))
ref_bin = mlb.fit_transform([set(r) for r in ref_tokens])
pred_bin = mlb.transform([set(p) for p in pred_tokens])
# Macro average: F1 is computed per token (column) and then averaged.
macro_f1 = f1_score(ref_bin, pred_bin, average='macro')

# Exact Match
exact_matches = [ref == pred for ref, pred in zip(references, preds)]
exact_match = sum(exact_matches) / len(exact_matches)

print("\n==== Evaluation Metrics ====")
print(f"Average BLEU Score       : {avg_bleu:.4f}")
print(f"Average ROUGE-1 F1       : {avg_rouge1:.4f}")
print(f"Average ROUGE-2 F1       : {avg_rouge2:.4f}")
print(f"Average ROUGE-L F1       : {avg_rougeL:.4f}")
print(f"Macro Token-level F1     : {macro_f1:.4f}")
print(f"Exact Match Accuracy     : {exact_match:.4f}")

# --- Step 8: Save results for inspection ---

results_df = pd.DataFrame({
    'question': val_questions,
    'reference_sql': val_references,
    'predicted_sql': predictions
})

results_df.to_csv('rag_validation_predictions.csv', index=False)
print("Saved RAG validation results to rag_validation_predictions.csv")


In [None]:
# Install requirements
!pip install transformers datasets sentence-transformers faiss-cpu rouge_score nltk scikit-learn tqdm sqlparse --quiet

import os
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import sqlite3
import glob
import sqlparse

nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

### 1. Load fine-tuned T5 model ###
model_path = '/content/drive/MyDrive/TextToSQL_Project/t5-spider-finetuned-final'  # change if needed
model = T5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
tokenizer = T5Tokenizer.from_pretrained(model_path, local_files_only=True)
model.to(device)
print("Loaded fine-tuned T5 model and tokenizer")

### 2. Build enriched retrieval corpus ###
spider_db_dir = '/content/spider/database'

# One "Database / Table / Columns" description is collected per table of every
# .sqlite file found under spider_db_dir.
retrieval_documents = []
for db_dir in glob.glob(f"{spider_db_dir}/*"):
    if not os.path.isdir(db_dir):
        # os.listdir() below would raise on stray files next to the database folders.
        continue
    db_name = os.path.basename(db_dir)
    sqlite_files = [f for f in os.listdir(db_dir) if f.endswith('.sqlite')]
    for sqlite_file in sqlite_files:
        db_path = os.path.join(db_dir, sqlite_file)
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = [row[0] for row in cursor.fetchall()]
            for table in tables:
                # Quote the identifier so names with spaces or reserved words do not break the PRAGMA.
                quoted_table = '"' + table.replace('"', '""') + '"'
                cursor.execute(f"PRAGMA table_info({quoted_table})")
                cols = cursor.fetchall()
                # table_info rows: index 1 is the column name, index 2 its declared type.
                col_desc = ", ".join([f"{c[1]} ({c[2]})" for c in cols])
                retrieval_documents.append(f"Database: {db_name}. Table: {table}. Columns: {col_desc}.")
        finally:
            # Close the connection even when a query on this database fails.
            conn.close()

print(f"Extracted {len(retrieval_documents)} schema descriptions from Spider.")

# Extra optional schemas
extra_documents = [
    "Table: Accounts. Columns: account_id (PK), customer_id (FK), account_type, balance, currency, open_date.",
    "Table: Students. Columns: student_id (PK), name, email, enrollment_year, major.",
    "Table: Courses. Columns: course_id (PK), course_name, department, credits.",
    "Relationship: Each account can have multiple transactions. Students can enroll in multiple courses.",
]
retrieval_documents += extra_documents

### 3. Build embeddings and FAISS index ###
print("Embedding and indexing retrieval corpus...")
embedder = SentenceTransformer('all-mpnet-base-v2')
doc_embeddings = embedder.encode(retrieval_documents, convert_to_numpy=True)
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)
print(f"FAISS index built with {index.ntotal} enriched documents.")

def retrieve(query, k=5):
    """Return up to ``k`` schema documents nearest to ``query`` in the FAISS index.

    Uses the notebook-level ``embedder``, ``index`` and ``retrieval_documents``.

    Args:
        query: Text to embed and search for.
        k: Maximum number of documents to return.

    Returns:
        List of matching entries from ``retrieval_documents``, nearest first.
        It holds fewer than ``k`` items when the index is smaller than ``k``
        and is empty when ``k`` is not positive or the index is empty.
    """
    # FAISS fills missing neighbours with index -1, which would silently pick
    # the last document. Cap k and drop any padding instead.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)
    return [retrieval_documents[i] for i in indices[0] if i >= 0]

def find_schema_context(question, k=5):
    """Return the top-``k`` retrieved schema descriptions joined by newlines."""
    schema_lines = retrieve(question, k)
    return "\n".join(schema_lines)

### 4. Prepare Spider dataset with improved prompt ###
spider_dataset = load_dataset("xlangai/spider")

def preprocess_function(examples):
    """Tokenize a batch of Spider examples with retrieved schema context.

    Each question is wrapped in a prompt that contains the schema descriptions
    returned by ``find_schema_context``. Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch dict with ``"question"`` and ``"query"`` lists, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized prompts with an added ``"labels"`` entry holding the
        target token ids, where padding positions are replaced by -100.
    """
    augmented_inputs = []
    for q in examples["question"]:
        schema_ctx = find_schema_context(q, k=5)
        augmented_inputs.append(
            f"Given the database schema below, write the correct SQL query to answer the question.\n\n"
            f"Schema:\n{schema_ctx}\n\nQuestion:\n{q}\n\nSQL:"
        )
    targets = [sql.strip() for sql in examples["query"]]

    # NOTE(review): padding every prompt to 1024 tokens is memory-hungry and may
    # contribute to the CUDA out-of-memory error captured in this notebook's
    # output - confirm before lowering it.
    model_inputs = tokenizer(augmented_inputs, max_length=1024, truncation=True, padding="max_length")
    # text_target replaces the deprecated as_target_tokenizer() context manager.
    labels = tokenizer(text_target=targets, max_length=256, truncation=True, padding="max_length")

    # Targets are padded to a fixed length. Label positions set to -100 are
    # ignored by the loss, so padding does not contribute to training.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

print("Tokenizing Spider train/validation data with improved prompts...")
tokenized_train = spider_dataset["train"].map(preprocess_function, batched=True)
tokenized_validation = spider_dataset["validation"].map(preprocess_function, batched=True)
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_validation.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



#New Code
from itertools import product

# Hyperparameter search space: each key lists the candidate values to try.
param_grid = {
    "learning_rate": [2e-5, 3e-5],
    "per_device_train_batch_size": [8, 16],
    "num_train_epochs": [3, 5],
    "warmup_ratio": [0.1, 0.2]
}

# Expand the grid into one dict per combination (cartesian product of all value
# lists, in the key order of param_grid).
keys = tuple(param_grid.keys())
values = tuple(param_grid.values())
param_combinations = []
for combo in product(*values):
    param_combinations.append(dict(zip(keys, combo)))


from transformers import Trainer, TrainingArguments
import numpy as np

### 5. Grid search over the fine-tuning hyperparameters ###
# Each combination is trained from the same starting checkpoint and scored by
# its validation loss. The best model so far is written to ./t5-finetuned-rag.
results = []
best_eval_loss = float("inf")
best_params = None

for i, params in enumerate(param_combinations):
    print(f"Running grid search combination {i+1}/{len(param_combinations)}: {params}")

    # Reload the checkpoint for every combination. Reusing one model object
    # would let each run continue from the previous run's weights, which makes
    # the results incomparable.
    model = T5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True)

    training_args = TrainingArguments(
        output_dir=f"./model_grid_{i}",
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        save_total_limit=1,
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        per_device_eval_batch_size=params["per_device_train_batch_size"],
        num_train_epochs=params["num_train_epochs"],
        warmup_ratio=params["warmup_ratio"],
        load_best_model_at_end=True,
        logging_steps=100,
        # Mixed precision requires a GPU; fall back to full precision on CPU.
        fp16=torch.cuda.is_available(),
        # The string "none" disables logging integrations; None does not.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_validation,
        tokenizer=tokenizer,
    )

    # Train
    trainer.train()

    # Evaluate
    metrics = trainer.evaluate()
    metrics['params'] = params
    results.append(metrics)

    # Keep the best model on disk so the winner is saved, not the last run.
    eval_loss = metrics.get("eval_loss", float("inf"))
    if best_params is None or eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        best_params = params
        trainer.save_model('./t5-finetuned-rag')
        tokenizer.save_pretrained('./t5-finetuned-rag')

    # Drop this run's model and optimizer state before the next combination.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Report every run, not only the last one.
for run_metrics in results:
    print(run_metrics)
print(f"Best combination: {best_params} (eval_loss={best_eval_loss:.4f})")

#Ends New Code

### 6. Reload for inference ###
model = T5ForConditionalGeneration.from_pretrained('./t5-finetuned-rag').to(device)
tokenizer = T5Tokenizer.from_pretrained('./t5-finetuned-rag')

def rag_generate_sql(question, k=5, max_gen_len=180):
    """Generate SQL for ``question`` from a prompt containing retrieved schema context.

    Uses the notebook-level ``find_schema_context``, ``tokenizer``, ``model``
    and ``device``. The prompt has the same layout as the one built in
    ``preprocess_function``.

    Args:
        question: The natural-language question to translate.
        k: Number of schema descriptions to retrieve for the prompt.
        max_gen_len: Maximum length of the generated sequence, in tokens.

    Returns:
        The first decoded beam-search result, with special tokens removed.
    """
    schema_ctx = find_schema_context(question, k)
    input_text = (
        f"Given the database schema below, write the correct SQL query to answer the question.\n\n"
        f"Schema:\n{schema_ctx}\n\nQuestion:\n{question}\n\nSQL:"
    )
    # A single prompt needs no padding; padding it to 1024 tokens only makes the
    # encoder process pad positions. Long prompts are still truncated at 1024.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to(device)
    output_ids = model.generate(
        input_ids=inputs.input_ids,
        # Pass the mask explicitly instead of relying on generate() to infer it.
        attention_mask=inputs.attention_mask,
        max_length=max_gen_len,
        num_beams=8,
        early_stopping=True,
        no_repeat_ngram_size=2,
        repetition_penalty=1.2,
        length_penalty=1.0,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

### 7. Evaluation with canonicalization ###
val_questions = spider_dataset['validation']['question']
val_references = spider_dataset['validation']['query']

print("Generating improved RAG SQL predictions for validation set...")
predictions = [rag_generate_sql(q) for q in tqdm(val_questions)]

def canonicalize_sql(sql):
    """Normalize a SQL string so formatting differences do not affect comparison.

    The query is reformatted by ``sqlparse`` with upper-cased keywords, then
    all runs of whitespace are collapsed to single spaces.

    Args:
        sql: The SQL text to normalize.

    Returns:
        The normalized single-line SQL, or ``sql.strip()`` if formatting fails.
    """
    try:
        return " ".join(sqlparse.format(sql, reindent=True, keyword_case='upper').split())
    except Exception:
        # Best-effort fallback. A bare "except:" would also swallow
        # KeyboardInterrupt, making a long evaluation loop hard to stop.
        return sql.strip()

references = [canonicalize_sql(r.lower()) for r in val_references]
preds = [canonicalize_sql(p.lower()) for p in predictions]

smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
               for ref, pred in zip(references, preds)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_sum = rouge2_sum = rougeL_sum = 0
for ref, pred in zip(references, preds):
    score = scorer.score(ref, pred)
    rouge1_sum += score['rouge1'].fmeasure
    rouge2_sum += score['rouge2'].fmeasure
    rougeL_sum += score['rougeL'].fmeasure

n = len(preds)
avg_rouge1 = rouge1_sum / n
avg_rouge2 = rouge2_sum / n
avg_rougeL = rougeL_sum / n

# Token-level macro F1 over the canonicalized queries.
ref_tokens = [ref.split() for ref in references]
pred_tokens = [pred.split() for pred in preds]
# Build the vocabulary with one set union; sum(list_of_lists, []) re-copies the
# growing list on every addition and is quadratic in the number of queries.
token_set = set().union(*ref_tokens, *pred_tokens)
# Sorted classes give the binarized matrices a reproducible column order.
mlb = MultiLabelBinarizer(classes=sorted(token_set))
ref_bin = mlb.fit_transform([set(r) for r in ref_tokens])
pred_bin = mlb.transform([set(p) for p in pred_tokens])
# Macro average: F1 is computed per token (column) and then averaged.
macro_f1 = f1_score(ref_bin, pred_bin, average='macro')

exact_matches = [ref == pred for ref, pred in zip(references, preds)]
exact_match = sum(exact_matches) / len(exact_matches)

print("\n==== Improved Evaluation Metrics ====")
print(f"Average BLEU Score     : {avg_bleu:.4f}")
print(f"Average ROUGE-1 F1     : {avg_rouge1:.4f}")
print(f"Average ROUGE-2 F1     : {avg_rouge2:.4f}")
print(f"Average ROUGE-L F1     : {avg_rougeL:.4f}")
print(f"Macro Token-level F1   : {macro_f1:.4f}")
print(f"Exact Match Accuracy   : {exact_match:.4f}")

results_df = pd.DataFrame({
    'question': val_questions,
    'reference_sql': val_references,
    'predicted_sql': predictions
})
results_df.to_csv('rag_validation_predictions.csv', index=False)
print("Saved improved RAG validation results to rag_validation_predictions.csv.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cuda


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 22.16 GiB of which 85.38 MiB is free. Process 143536 has 22.07 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 6.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
!pip install transformers datasets sentence-transformers faiss-cpu rouge_score nltk scikit-learn tqdm sqlparse --quiet

# Optional: mount Drive if needed
from google.colab import drive
drive.mount('/content/drive')

import os, glob, sqlite3, sqlparse
import torch, faiss, numpy as np, pandas as pd, nltk
from tqdm import tqdm
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# Fetch the NLTK "punkt" resource.
nltk.download('punkt')

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"[INFO] Using device: {device}")

# ===== PATHS (EDIT THESE) =====
# model_path: folder holding the previously fine-tuned T5 checkpoint.
# spider_db_dir: folder whose sub-directories contain the Spider .sqlite files.
model_path = "/content/drive/MyDrive/TextToSQL_Project/t5-spider-finetuned-final"
spider_db_dir = "/content/drive/MyDrive/TextToSQL_Project/spider/database"

# ===== LOAD MODEL =====
# local_files_only=True: read the checkpoint from disk without contacting the Hub.
model = T5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained(model_path, local_files_only=True)
print("[INFO] Loaded model and tokenizer")

# ===== BUILD RETRIEVAL CORPUS =====
# Text documents describing database schemas; filled in below and later embedded.
retrieval_documents = []


def safe_exec(cursor, query):
    """Run ``query`` on ``cursor`` and return every row, or ``[]`` if SQLite rejects it.

    Args:
        cursor: an open ``sqlite3`` cursor.
        query: SQL text to execute.

    Returns:
        The list produced by ``cursor.fetchall()``; an empty list when the
        statement fails with a ``sqlite3.Error``.
    """
    try:
        cursor.execute(query)
        return cursor.fetchall()
    except sqlite3.Error:
        # Best-effort: a broken table/PRAGMA just contributes no rows. Only
        # database errors are swallowed, so Ctrl-C (KeyboardInterrupt) and
        # programming mistakes are no longer hidden as "empty result".
        return []

def _quote_ident(name):
    """Return ``name`` as a double-quoted SQLite identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


# Walk every database folder and describe each SQLite file as text documents:
# one "Database/Table/Columns" document per table and one "Join" document per
# foreign key. Folders and files are visited in sorted order so the corpus
# (and therefore the FAISS row ids) is the same on every run.
for db_dir in sorted(glob.glob(f"{spider_db_dir}/*")):
    if not os.path.isdir(db_dir):
        # A stray file next to the database folders would make os.listdir raise.
        continue
    db_name = os.path.basename(db_dir)
    sqlite_files = sorted(f for f in os.listdir(db_dir) if f.endswith(".sqlite"))
    for sqlite_file in sqlite_files:
        db_path = os.path.join(db_dir, sqlite_file)
        conn = None
        try:
            conn = sqlite3.connect(db_path)
            cursor = conn.cursor()
            tables = [row[0] for row in safe_exec(cursor, "SELECT name FROM sqlite_master WHERE type='table'")]
            for table in tables:
                # Quote the identifier: unquoted names containing spaces or
                # reserved words make the PRAGMA fail and yield an empty schema.
                cols = safe_exec(cursor, f"PRAGMA table_info({_quote_ident(table)})")
                # table_info rows: index 1 is the column name, index 2 its declared type.
                col_desc = ", ".join([f"{c[1]} ({c[2]})" for c in cols])
                retrieval_documents.append(f"Database: {db_name}. Table: {table}. Columns: {col_desc}.")
            for table in tables:
                fks = safe_exec(cursor, f"PRAGMA foreign_key_list({_quote_ident(table)})")
                for fk in fks:
                    # foreign_key_list rows: (id, seq, table, from, to, ...).
                    _, _, ref_table, from_col, to_col, *_ = fk
                    if to_col is None:
                        # SQLite reports NULL when the FK omits the parent column;
                        # avoid writing the literal text "None" into the corpus.
                        target = f"{db_name}.{ref_table}"
                    else:
                        target = f"{db_name}.{ref_table}.{to_col}"
                    retrieval_documents.append(f"Join: {db_name}.{table}.{from_col} -> {target}")
        except Exception as e:
            print(f"[WARN] Failed to parse {db_path}: {e}")
        finally:
            # Close the connection even when parsing failed part-way through.
            if conn is not None:
                conn.close()

# Hand-written relationship hints appended after the extracted schema documents.
retrieval_documents += [
    "Relationship: Students.student_id -> Enrollments.student_id; Enrollments.course_id -> Courses.course_id.",
    "Relationship: Accounts.account_id -> Transactions.account_id."
]
print(f"[INFO] Built retrieval corpus with {len(retrieval_documents)} documents")

# ===== EMBED AND INDEX =====
# Encode every retrieval document once, then store the vectors in a flat
# L2 FAISS index whose row i corresponds to retrieval_documents[i].
embedder = SentenceTransformer("all-mpnet-base-v2")
doc_embeddings = embedder.encode(retrieval_documents, convert_to_numpy=True)

embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)
print(f"[INFO] FAISS index built with {index.ntotal} documents")

def retrieve(query, k=2):
    """Return up to ``k`` retrieval documents nearest to ``query`` in the FAISS index.

    Args:
        query: text to embed with ``embedder``.
        k: maximum number of documents to return.

    Returns:
        A list of strings from ``retrieval_documents``, nearest first. Shorter
        than ``k`` (possibly empty) when the index holds fewer documents.
    """
    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    query_emb = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_emb, k)
    # FAISS fills missing neighbours with -1; indexing a Python list with -1
    # would silently return the *last* document, so drop those slots.
    return [retrieval_documents[i] for i in indices[0] if i >= 0]

def find_schema_context(question, k=2):
    """Build the schema context string for ``question``.

    Looks up ``k`` documents with ``retrieve`` and concatenates them,
    separated by " ; ".
    """
    retrieved_docs = retrieve(question, k)
    separator = " ; "
    return separator.join(retrieved_docs)

# ===== LOAD DATASET =====
# Load the "xlangai/spider" dataset and keep handles to the two splits used below.
spider = load_dataset("xlangai/spider")
train_split, val_split = spider["train"], spider["validation"]

# ===== PREPROCESS =====
def preprocess_function(examples, k=2):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        examples: batch mapping with a "question" list and a "query" list.
        k: number of retrieved schema documents placed in each prompt.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens)
        with an extra "labels" entry: the target SQL token ids (padded/truncated
        to 256), where padding positions are set to -100.
    """
    inputs = [
        f"translate English to SQL: Question: {q.strip()} Schema: {find_schema_context(q, k)} SQL:"
        for q in examples["question"]
    ]
    targets = [sql.strip() for sql in examples["query"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=256, truncation=True, padding="max_length")
    # Mask padding with -100 so the cross-entropy loss skips it; otherwise most
    # of each 256-token label row is padding that the model is trained to emit.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

print("[INFO] Tokenizing dataset...")


def _tokenize_batch(batch):
    """Run ``preprocess_function`` on one batch with two retrieved documents per question."""
    return preprocess_function(batch, k=2)


tokenized_train = train_split.map(_tokenize_batch, batched=True)
tokenized_val = val_split.map(_tokenize_batch, batched=True)

# Expose only these three columns, as torch tensors, on both tokenized splits.
torch_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (tokenized_train, tokenized_val):
    tokenized_split.set_format(type="torch", columns=torch_columns)

# ===== TRAINING =====
# Single location for checkpoints and for the final model/tokenizer export.
rag_output_dir = "/content/drive/MyDrive/TextToSQL_Project/t5-finetuned-rag-compact"

training_args = TrainingArguments(
    output_dir=rag_output_dir,
    num_train_epochs=10,
    learning_rate=2e-5,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same 800-step cadence so that
    # load_best_model_at_end can pair each evaluation with a checkpoint.
    eval_strategy="steps",
    eval_steps=800,
    save_steps=800,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Mixed precision needs CUDA; `device` may have fallen back to CPU.
    fp16=torch.cuda.is_available(),
    logging_steps=200,
    # The string "none" is what switches logging integrations off. Passing
    # Python None did not: wandb still attached when this cell was run.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,  # New parameter name
)


print("[INFO] Starting training...")
trainer.train()
# Persist the trained model and its tokenizer side by side in the output folder.
trainer.save_model(rag_output_dir)
tokenizer.save_pretrained(rag_output_dir)

# ===== RELOAD MODEL =====
# Re-create the model and tokenizer from the fine-tuned export on disk and
# place the model on the selected device.
finetuned_dir = "/content/drive/MyDrive/TextToSQL_Project/t5-finetuned-rag-compact"
model = T5ForConditionalGeneration.from_pretrained(finetuned_dir)
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained(finetuned_dir)

# ===== SQL UTILITIES =====
def normalize_sql(sql):
    """Return ``sql`` in a canonical textual form for comparison.

    Uses ``sqlparse.format`` to upper-case keywords, re-indent and strip
    comments, then trims surrounding whitespace. If formatting fails, falls
    back to the trimmed, fully upper-cased input.

    Args:
        sql: SQL text.

    Returns:
        The normalized SQL string.
    """
    try:
        return sqlparse.format(sql, keyword_case="upper", reindent=True, strip_comments=True).strip()
    except Exception:
        # Best-effort fallback. `Exception` (not a bare except) so that
        # KeyboardInterrupt/SystemExit still stop a long evaluation loop.
        return sql.strip().upper()

def execute_sql_on_db(db_path, sql):
    """Execute ``sql`` against the SQLite database at ``db_path``.

    Args:
        db_path: path handed to ``sqlite3.connect``.
        sql: a single SQL statement.

    Returns:
        The list of result rows on success, or the string ``"[ERROR] <message>"``
        when connecting, executing or fetching raises. Callers tell the two
        apart with ``isinstance(result, str)``.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        conn.text_factory = str
        cursor = conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception as e:
        return f"[ERROR] {e}"
    finally:
        # Close on the failure path too; previously every failing query left
        # its connection open for the rest of the evaluation.
        if conn is not None:
            conn.close()

# ===== MAP DB PATHS =====
# db_paths maps a database folder name to the .sqlite file used for execution.
db_paths = {}
for db_dir in sorted(glob.glob(f"{spider_db_dir}/*")):
    if not os.path.isdir(db_dir):
        # A stray file next to the database folders would make os.listdir raise.
        continue
    db_name = os.path.basename(db_dir)
    sqlite_files = sorted(f for f in os.listdir(db_dir) if f.endswith(".sqlite"))
    if not sqlite_files:
        continue
    # Prefer "<folder name>.sqlite"; otherwise take the alphabetically first
    # file so the choice does not depend on directory listing order.
    preferred_file = f"{db_name}.sqlite"
    chosen_file = preferred_file if preferred_file in sqlite_files else sqlite_files[0]
    db_paths[db_name] = os.path.join(db_dir, chosen_file)

# ===== GENERATION FUNCTION =====
def generate_sql_for_question(question, k, num_beams, length_penalty, repetition_penalty=1.5, max_len=180):
    """Generate SQL text for ``question`` using retrieved schema context.

    Args:
        question: natural-language question.
        k: number of schema documents to retrieve for the prompt.
        num_beams: beam width passed to ``model.generate``.
        length_penalty: length penalty passed to ``model.generate``.
        repetition_penalty: repetition penalty passed to ``model.generate``.
        max_len: ``max_length`` of the generated sequence.

    Returns:
        The decoded first returned sequence, with special tokens removed.
    """
    schema_ctx = find_schema_context(question, k)
    input_text = f"translate English to SQL: Question: {question.strip()} Schema: {schema_ctx} SQL:"
    # A single prompt needs no padding; padding it to 512 tokens only adds
    # positions the encoder then has to be told to ignore.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    output_ids = model.generate(
        input_ids=inputs.input_ids,
        # Pass the mask explicitly: without it generate() cannot tell real
        # tokens from padding and the encoder attends to every position.
        attention_mask=inputs.attention_mask,
        max_length=max_len,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=repetition_penalty,
        length_penalty=length_penalty,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ===== PARAMETER SWEEP =====
# Try each retrieval/decoding configuration on the validation split and keep
# the predictions of the best one (execution accuracy first, BLEU as tie-break).
val_questions = val_split['question']
val_references = val_split['query']
val_db_ids = val_split['db_id']

test_settings = [
    {"k": 2, "num_beams": 6, "length_penalty": 1.2},
    {"k": 2, "num_beams": 5, "length_penalty": 1.0},
    {"k": 3, "num_beams": 6, "length_penalty": 1.2},
]

best_exec_acc = -1.0
best_bleu = -1.0
best_preds = None
best_refs = None
best_setting = None

# The smoothing function does not depend on the setting; build it once.
smoothie = SmoothingFunction().method4

print("[INFO] Running parameter sweep...")
for setting in test_settings:
    predictions = []
    references_norm = []
    exec_correct = 0
    total_exec = 0

    for q, gold_sql, db_id in tqdm(zip(val_questions, val_references, val_db_ids), total=len(val_questions)):
        pred_sql = generate_sql_for_question(q, setting["k"], setting["num_beams"], setting["length_penalty"])
        pred_sql_norm = normalize_sql(pred_sql)
        gold_sql_norm = normalize_sql(gold_sql)

        predictions.append(pred_sql_norm)
        references_norm.append(gold_sql_norm)

        db_path = db_paths.get(db_id)
        if db_path:
            gold_result = execute_sql_on_db(db_path, gold_sql_norm)
            # execute_sql_on_db returns a str on failure. An example is skipped
            # only when the *reference* cannot be executed.
            if not isinstance(gold_result, str):
                total_exec += 1
                pred_result = execute_sql_on_db(db_path, pred_sql_norm)
                # A prediction that fails to execute counts as wrong. Dropping
                # it from the denominator (as before) rewarded settings that
                # produce more invalid SQL.
                # NOTE(review): rows are compared as ordered lists, so equal
                # rows in a different order count as a mismatch.
                if not isinstance(pred_result, str) and gold_result == pred_result:
                    exec_correct += 1

    exec_acc = exec_correct / total_exec if total_exec > 0 else 0.0

    bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
                   for ref, pred in zip(references_norm, predictions)]
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

    print(f"[INFO] Settings {setting} -> ExecAcc={exec_acc:.4f} on {total_exec} queries, BLEU={avg_bleu:.4f}")

    if (exec_acc > best_exec_acc) or (exec_acc == best_exec_acc and avg_bleu > best_bleu):
        best_exec_acc = exec_acc
        best_bleu = avg_bleu
        best_preds = predictions
        best_refs = references_norm
        best_setting = setting

# ===== FINAL EVALUATION =====
# Report the winning sweep configuration, then compute text-overlap metrics
# on its normalized predictions against the normalized references.
print(f"\n[RESULT] Best settings: {best_setting}")
print(f"Best Execution Accuracy: {best_exec_acc:.4f}")
print(f"Best BLEU: {best_bleu:.4f}")

# ROUGE scores: score each (reference, prediction) pair, then average the F-measures.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
pair_scores = [scorer.score(ref, pred) for ref, pred in zip(best_refs, best_preds)]
rouge1_sum = sum(pair['rouge1'].fmeasure for pair in pair_scores)
rouge2_sum = sum(pair['rouge2'].fmeasure for pair in pair_scores)
rougeL_sum = sum(pair['rougeL'].fmeasure for pair in pair_scores)

n = len(best_preds)
avg_rouge1 = rouge1_sum / n
avg_rouge2 = rouge2_sum / n
avg_rougeL = rougeL_sum / n

# Macro F1: treat every whitespace token as a label, binarize the token sets
# of references and predictions, and macro-average F1 over the labels.
ref_tokens = [ref.split() for ref in best_refs]
pred_tokens = [pred.split() for pred in best_preds]
all_tokens = {token for tokens in ref_tokens + pred_tokens for token in tokens}

mlb = MultiLabelBinarizer(classes=list(all_tokens))
ref_binary = mlb.fit_transform(list(map(set, ref_tokens)))
pred_binary = mlb.transform(list(map(set, pred_tokens)))
macro_f1 = f1_score(ref_binary, pred_binary, average='macro')

# Exact Match: fraction of pairs whose normalized strings are identical.
exact_matches = [ref == pred for ref, pred in zip(best_refs, best_preds)]
exact_match = sum(exact_matches) / len(exact_matches)

print("\n==== Final Evaluation Results ====")
print(f"Execution Accuracy : {best_exec_acc:.4f}")
print(f"Average BLEU       : {best_bleu:.4f}")
print(f"ROUGE-1 F1         : {avg_rouge1:.4f}")
print(f"ROUGE-2 F1         : {avg_rouge2:.4f}")
print(f"ROUGE-L F1         : {avg_rougeL:.4f}")
print(f"Macro Token F1     : {macro_f1:.4f}")
print(f"Exact Match        : {exact_match:.4f}")

# Save results: raw questions and raw reference SQL next to the normalized predictions.
results_columns = {
    'question': val_questions,
    'reference_sql': val_references,
    'predicted_sql': best_preds
}
results_df = pd.DataFrame(results_columns)
results_df.to_csv('rag_validation_predictions_best.csv', index=False)
print("[INFO] Results saved to rag_validation_predictions_best.csv")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[INFO] Using device: cuda
[INFO] Loaded model and tokenizer
[INFO] Built retrieval corpus with 1678 documents


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[INFO] FAISS index built with 1678 documents
[INFO] Tokenizing dataset...


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1034 [00:00<?, ? examples/s]

[INFO] Starting training...


[34m[1mwandb[0m: Currently logged in as: [33mmrrishabhfree[0m ([33mmrrishabhfree-bits-pilani[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
