In [1]:
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
import torch
import pandas as pd
import json
from torch.utils.data import DataLoader, Dataset

from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType
from transformers import default_data_collator, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shahrukhkhan/wikisql")

print("Path to dataset files:", path)

Path to dataset files: /home/oumar/.cache/kagglehub/datasets/shahrukhkhan/wikisql/versions/2


In [14]:
dataset = load_dataset(path)
data_train = dataset['train'].shuffle(seed=42).select(range(20000))
data_test = dataset['test'].shuffle(seed=42).select(range(100))
data_valid = dataset['validation']

data_train = pd.DataFrame(data_train)
data_test = pd.DataFrame(data_test)
data_valid = pd.DataFrame(data_valid)

data_train.head()

Unnamed: 0,question,answer,id
0,Which sum of week that had an attendance large...,SELECT SUM Week FROM table WHERE Attendance > ...,bd28c679552f44cfad548ab32a60c5d1
1,WHAT IS THE POINTS WITH 55 TRIES?,SELECT Points FROM table WHERE Tries for = 55,f848519ba4e44968b135294e2d707922
2,"What is 2nd Member, when Assembled is ""30 Marc...",SELECT 2nd member FROM table WHERE Assembled =...,e1153462ca3642fb999f28f476fb9d88
3,What is the title of the king who left office ...,SELECT Title FROM table WHERE Left office = 98...,e89fd713b65f4b6d94d9755674fac9ce
4,What is the address for the assistant principa...,SELECT Address FROM table WHERE Assistant Prin...,9839ab63525c463fa69f4e66d7d8d2bf


In [15]:
def preprocess_data(data):
    data = data.dropna()
    data = data.drop_duplicates()
    data = data.drop(columns=["id"])
    data = data.reset_index(drop=True)
    return data

In [16]:
data_train = preprocess_data(data_train)
data_test = preprocess_data(data_test)
data_valid = preprocess_data(data_valid)

data_train.head()

Unnamed: 0,question,answer
0,Which sum of week that had an attendance large...,SELECT SUM Week FROM table WHERE Attendance > ...
1,WHAT IS THE POINTS WITH 55 TRIES?,SELECT Points FROM table WHERE Tries for = 55
2,"What is 2nd Member, when Assembled is ""30 Marc...",SELECT 2nd member FROM table WHERE Assembled =...
3,What is the title of the king who left office ...,SELECT Title FROM table WHERE Left office = 98...
4,What is the address for the assistant principa...,SELECT Address FROM table WHERE Assistant Prin...


In [17]:
questions = data_train['question'].tolist()
sql_queries = data_train['answer'].tolist()

test_questions = data_test['question'].tolist()
test_sql_queries = data_test['answer'].tolist()

valid_questions = data_valid['question'].tolist()
valid_sql_queries = data_valid['answer'].tolist()

In [7]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# OOV token
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id


# padding side
tokenizer.pad_padding_side = "right"

tokenized_input = tokenizer(questions, padding=True, truncation=True, return_tensors="pt")
tokenized_output = tokenizer(sql_queries, padding=True, truncation=True, return_tensors="pt")



device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)

In [None]:
# PEFT
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

model = get_peft_model(model, peft_config)

In [None]:
# num of trainable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Number of trainable parameters {count_parameters(model):,}')

In [18]:
class SQLDataset(Dataset):
    def __init__(self, tokenizer, questions, sql_queries, max_length):
        self.tokenizer = tokenizer
        self.input_sequence = questions
        self.target = sql_queries
        self.max_length = max_length

    def __len__(self):
        return len(self.input_sequence)
    
    def __getitem__(self, idx):
        input_sequence = self.input_sequence[idx]
        target = self.target[idx]
        tokenized_input = self.tokenizer(input_sequence, padding="max_length", truncation=True, return_tensors="pt", max_length=self.max_length)
        tokenized_output = self.tokenizer(target, padding="max_length", truncation=True, return_tensors="pt", max_length = self.max_length)

        return {
            "input_ids": tokenized_input.input_ids.squeeze(0),
            "attention_mask": tokenized_input.attention_mask.squeeze(0),
            "labels": tokenized_output.input_ids.squeeze(0),
            "decoder_attention_mask": tokenized_output.attention_mask.squeeze(0),
            "query": target
        }

In [19]:
train_dataset = SQLDataset(tokenizer, questions, sql_queries, 512)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=default_data_collator, pin_memory=True)

test_dataset = SQLDataset(tokenizer, test_questions, test_sql_queries, 512)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=default_data_collator, pin_memory=True)

valid_dataset = SQLDataset(tokenizer, valid_questions, valid_sql_queries, 512)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=True, collate_fn=default_data_collator, pin_memory=True)

In [None]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=5,             
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,    
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',           
    logging_steps=10,                
    eval_strategy="epoch",     
    report_to='none'                
)

In [None]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset
)

trainer.train()

# Save model
model.save_pretrained("sql_model")

In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_tr = AutoModelForSeq2SeqLM.from_pretrained("nl2sql_model")
tokenizer_tr = AutoTokenizer.from_pretrained("nl2sql_model")

model_tr

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): lora.Linear(
              (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=1024, bias=False)
              )
              (lora_embedding

In [11]:
question = "What is the capital of France?"
inputs = tokenizer_tr(question, return_tensors="pt")
outputs = model_tr.generate(**inputs)
sql_query = tokenizer_tr.decode(outputs[0], skip_special_tokens=True)

print(f"Question: {question}")
print(f"SQL Query: {sql_query}")



Question: What is the capital of France?
SQL Query: SELECT Capital FROM table WHERE Country = france


In [None]:
def generate_queries(model, tokenizer, dataset, batch_size = 8):
    model.eval()

    queries = []
    references = []
    dataloader = DataLoader(dataset, batch_size=batch_size, pin_memory=True)

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)

            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=512, num_beams=5)
            sql_query = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
            queries.extend(sql_query)
            references.extend(batch['query'])
    return queries, references

In [21]:
queries, references = generate_queries(model_tr, tokenizer_tr, test_dataset)

In [29]:
import evaluate

rouge = evaluate.load("rouge")
blue = evaluate.load("bleu")

rouge_score = rouge.compute(predictions=queries, references=references)
print(rouge_score)

Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<00:00, 5.15MB/s]
Downloading extra modules: 4.07kB [00:00, 1.73MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<00:00, 4.78MB/s]


{'rouge1': np.float64(0.8909919454498688), 'rouge2': np.float64(0.750587669874629), 'rougeL': np.float64(0.8629469773551725), 'rougeLsum': np.float64(0.8621255664342888)}


In [30]:
blue_score = blue.compute(predictions=queries, references=references)
print(blue_score)


{'bleu': 0.5980681639167751, 'precisions': [0.8743500866551126, 0.70521327014218, 0.5910041841004184, 0.4982497082847141], 'brevity_penalty': 0.9161990231034917, 'length_ratio': 0.9195219123505977, 'translation_length': 1154, 'reference_length': 1255}


In [31]:
exact_match = evaluate.load("exact_match")

exact_match = exact_match.compute(predictions=queries, references=references)
print(exact_match)

Downloading builder script: 100%|██████████| 5.67k/5.67k [00:00<00:00, 2.49MB/s]

{'exact_match': np.float64(0.21)}





In [None]:
{'bleu': 0.5980681639167751, 'precisions': [0.8743500866551126, 0.70521327014218, 0.5910041841004184, 0.4982497082847141], 'brevity_penalty': 0.9161990231034917, 'length_ratio': 0.9195219123505977, 'translation_length': 1154, 'reference_length': 1255}
