In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, get_scheduler
from datasets import load_dataset
import torch
import pandas as pd
import json
from torch.utils.data import DataLoader, Dataset

In [None]:
import kagglehub

# Kaggle handle of the WikiSQL dataset; the latest published version is downloaded.
WIKISQL_HANDLE = "shahrukhkhan/wikisql"
path = kagglehub.dataset_download(WIKISQL_HANDLE)

# Show where the files ended up; `path` is what the loading cell reads from.
print("Path to dataset files:", path)

In [None]:
# Load every split found under the downloaded dataset directory.
dataset = load_dataset(path)

# Materialise each split as a pandas DataFrame (train / test / validation).
data_train, data_test, data_valid = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "test", "validation")
)

# Peek at the first rows of the raw training split.
data_train.head()


In [None]:
def preprocess_data(data):
    """Return a cleaned copy of ``data``; the input frame is left untouched.

    Steps, applied in this order:
      1. drop rows containing any missing value,
      2. drop fully duplicated rows (the ``id`` column still takes part in
         the comparison at this point),
      3. remove the ``id`` column (raises ``KeyError`` if it is absent),
      4. renumber the index from 0.
    """
    return (
        data
        .dropna()
        .drop_duplicates()
        .drop(columns=["id"])
        .reset_index(drop=True)
    )

In [None]:
# Clean all three splits with the same routine.
# NOTE: this rebinds the same names, so the cell is not re-runnable on its own
# (a second run fails because the "id" column has already been removed).
data_train, data_test, data_valid = map(
    preprocess_data, (data_train, data_test, data_valid)
)

# Inspect the cleaned training split.
data_train.head()

In [None]:
def _question_answer_lists(frame):
    """Return the 'question' and 'answer' columns of ``frame`` as two Python lists."""
    return frame['question'].tolist(), frame['answer'].tolist()


# Model inputs (questions) and targets (SQL strings) for each split.
questions, sql_queries = _question_answer_lists(data_train)
test_questions, test_sql_queries = _question_answer_lists(data_test)
valid_questions, valid_sql_queries = _question_answer_lists(data_valid)

In [None]:
# BART's tokenizer already defines a dedicated <pad> token, and the pretrained
# model config uses that same id as its pad_token_id. The pad token is therefore
# left untouched: remapping it to <unk> would make padded positions
# indistinguishable from genuine unknown tokens and put the tokenizer out of
# sync with the model.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Pad on the right. `padding_side` is the attribute the tokenizer actually
# reads; assigning to any other attribute name has no effect.
tokenizer.padding_side = "right"

# Whole-corpus tokenization, padded to the longest sequence in each list.
# NOTE(review): no later cell visible here reads these two tensors (SQLDataset
# tokenizes per item) - consider removing them to save time and memory.
tokenized_input = tokenizer(questions, padding=True, truncation=True, return_tensors="pt")
tokenized_output = tokenizer(sql_queries, padding=True, truncation=True, return_tensors="pt")

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)

In [None]:
class SQLDataset(Dataset):
    """Map-style dataset pairing natural-language questions with SQL targets.

    Each item is tokenized on access and padded/truncated to ``max_length``.

    Args:
        tokenizer: callable returning an encoding with ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_length)`` when called
            with ``return_tensors="pt"``.
        questions: sequence of input strings.
        sql_queries: sequence of target strings, aligned with ``questions``.
        max_length: fixed sequence length used for both inputs and targets.

    Raises:
        ValueError: if ``questions`` and ``sql_queries`` differ in length.
    """

    # Label value skipped by the cross-entropy loss used for seq2seq training.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, questions, sql_queries, max_length):
        if len(questions) != len(sql_queries):
            raise ValueError(
                f"questions ({len(questions)}) and sql_queries ({len(sql_queries)}) "
                "must have the same length"
            )
        self.tokenizer = tokenizer
        self.input_sequence = questions
        self.target = sql_queries
        self.max_length = max_length

    def __len__(self):
        """Number of (question, SQL) pairs."""
        return len(self.input_sequence)

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` as a dict of 1-D tensors.

        Keys: ``input_ids``, ``attention_mask``, ``labels``,
        ``decoder_attention_mask``. Padding positions in ``labels`` are set to
        ``IGNORE_INDEX`` so they do not contribute to the loss.
        """
        input_sequence = self.input_sequence[idx]
        target = self.target[idx]
        tokenized_input = self.tokenizer(
            input_sequence,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )
        tokenized_output = self.tokenizer(
            target,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )

        target_mask = tokenized_output.attention_mask.squeeze(0)
        # Mask by attention mask rather than by pad id, so the result does not
        # depend on which token the tokenizer happens to use for padding.
        labels = tokenized_output.input_ids.squeeze(0).masked_fill(
            target_mask == 0, self.IGNORE_INDEX
        )

        return {
            "input_ids": tokenized_input.input_ids.squeeze(0),
            "attention_mask": tokenized_input.attention_mask.squeeze(0),
            "labels": labels,
            "decoder_attention_mask": target_mask,
        }

In [None]:
# Shared settings for every split, defined once instead of repeated literals.
MAX_LENGTH = 512  # fixed token length each question / SQL string is padded or truncated to
BATCH_SIZE = 8

# Training data is shuffled so batches differ between epochs.
train_dataset = SQLDataset(tokenizer, questions, sql_queries, MAX_LENGTH)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Held-out splits keep their original order so evaluation runs are repeatable
# and predictions can be lined up with the source rows.
test_dataset = SQLDataset(tokenizer, test_questions, test_sql_queries, MAX_LENGTH)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

valid_dataset = SQLDataset(tokenizer, valid_questions, valid_sql_queries, MAX_LENGTH)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# NOTE(review): disabled manual training loop, kept for reference only; the
# Trainer-based cells below are what actually run. It cannot be re-enabled as
# written:
#   - `num_epochs` is read (for num_training_steps) before it is assigned,
#   - `train_dl` is not defined anywhere visible (the loader is `train_loader`),
#   - `tqdm` is never imported, and `progress_bar` / `lr_scheduler` are created
#     but never advanced inside the loop.
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# num_training_steps = num_epochs * len(train_dl)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

# progress_bar = tqdm(range(num_training_steps))
# num_epochs = 10
# for epoch in range(num_epochs):
#     for batch in train_loader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)
#         decoder_attention_mask = batch["decoder_attention_mask"].to(device)

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_attention_mask=decoder_attention_mask)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()

#     print(f"Epoch {epoch} Loss: {loss.item()}")

In [None]:
import inspect

from transformers import Trainer, TrainingArguments

# The keyword that schedules evaluation was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases, where the old spelling is
# rejected with a TypeError. Use whichever name the installed version accepts
# so this cell runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',           # checkpoints and trainer state
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,                 # learning-rate warmup steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,                 # log the training loss every 10 steps
    report_to='none',                 # no external experiment tracker
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

In [None]:
# Fine-tune on the training split, evaluating on the validation split
# according to the schedule set in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

trainer.train()

# Save the model together with its tokenizer so the "sql_model" directory is
# self-contained and can be reloaded for inference with from_pretrained().
model.save_pretrained("sql_model")
tokenizer.save_pretrained("sql_model")

In [None]:
import evaluate

# evaluate.load() needs the metric identifier as its first argument; calling it
# with no arguments raises a TypeError.
# NOTE(review): "exact_match" (string equality between generated and reference
# SQL) is an assumed choice - replace it if a different metric was intended.
metric = evaluate.load("exact_match")