In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wikisql/wikisql_train.json
/kaggle/input/wikisql/wikisql_test.json
/kaggle/input/wikisql/validation.csv
/kaggle/input/wikisql/train.csv
/kaggle/input/wikisql/test.csv
/kaggle/input/wikisql/wikisql_validation.json


In [None]:
# NOTE(review): the rows sampled here come from the *test* split and are later
# used for fine-tuning -- confirm this is intended rather than train.csv.
df = pd.read_csv('/kaggle/input/wikisql/test.csv')
# Fixed seed so the same 100 rows are drawn on every Restart & Run All.
df2 = df.sample(100, random_state=42)

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import DataLoader, Dataset

In [None]:
# Pull the natural-language questions and their SQL strings out of the sample
# as plain Python lists (source texts and target texts, in matching row order).
questions, sql_queries = (df2[column].tolist() for column in ('question', 'sql'))

In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which dispatches to the same batch encoding for a list of strings.
# Both sides are padded to the longest sequence in their own batch and returned
# as PyTorch tensors.
encode_kwargs = dict(padding=True, truncation=True, return_tensors='pt')
tokenized_inputs = tokenizer(questions, **encode_kwargs)
tokenized_outputs = tokenizer(sql_queries, **encode_kwargs)

In [None]:
class SQLOnlineDataset(Dataset):
    """Pairs of tokenized questions (inputs) and tokenized SQL queries (targets).

    Args:
        tokenized_inputs: mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors for the source texts.
        tokenized_outputs: mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors for the target texts.

    Each item is a dict with the keys ``input_ids``, ``attention_mask``,
    ``labels`` and ``decoder_attention_mask``, indexed along the first dimension
    of the tensors given to the constructor.
    """

    # Label value skipped by the cross-entropy loss (its default ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, tokenized_inputs, tokenized_outputs):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.decoder_attention_mask = tokenized_outputs['attention_mask']
        # Padded target positions must not contribute to the loss. Replace them
        # with IGNORE_INDEX; masked_fill returns a new tensor, so the caller's
        # tokenized_outputs is left untouched.
        self.labels = tokenized_outputs['input_ids'].masked_fill(
            self.decoder_attention_mask == 0, self.IGNORE_INDEX
        )

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the tensors of example ``idx`` as a dict of model keyword arguments."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
            'decoder_attention_mask': self.decoder_attention_mask[idx]
        }


In [None]:
# Wrap the tokenized tensors in a Dataset and serve them in shuffled
# mini-batches of four examples.
train_dataset = SQLOnlineDataset(
    tokenized_inputs=tokenized_inputs,
    tokenized_outputs=tokenized_outputs,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pretrained T5 checkpoint and place it on the chosen device.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model = model.to(device)

In [None]:
# Use PyTorch's AdamW: the implementation shipped with transformers is
# deprecated. eps and weight_decay are set explicitly to the values the
# transformers version used by default, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

In [None]:

num_epochs = 10
# from_pretrained() hands the model back in evaluation mode (dropout disabled),
# so switch to training mode explicitly before fine-tuning.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    batches_seen = 0
    for batch in train_loader:
        # Move every tensor of the batch onto the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_attention_mask = batch['decoder_attention_mask'].to(device)
        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask,
            return_dict=True
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        batches_seen += 1
    # Report the mean loss over the epoch rather than only the last mini-batch;
    # max(..., 1) keeps an empty loader from raising.
    mean_loss = running_loss / max(batches_seen, 1)
    print(f'Epoch: {epoch+1}, Loss: {mean_loss}')

In [None]:
# Persist the fine-tuned model and its tokenizer in Hugging Face format,
# each in its own output directory.
model_dir, tokenizer_dir = 'sql_t5_model', 'sql_t5_tokenizer'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

In [None]:
# Also keep a plain PyTorch checkpoint containing only the weights.
fine_tuned_weights = model.state_dict()
torch.save(fine_tuned_weights, 'sql_model.pt')

In [None]:
# Rebuild the base architecture, then overwrite its weights with the fine-tuned ones.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
# map_location='cpu' lets the checkpoint load even when it was saved from a GPU
# and no GPU is available now; the freshly built model also lives on the CPU.
state_dict = torch.load('sql_model.pt', map_location='cpu')
model.load_state_dict(state_dict)
# Inference only from here on: make sure dropout is disabled.
model.eval()

In [None]:
# Re-create the tokenizer from the same base checkpoint the model started from.
base_checkpoint = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(base_checkpoint)

In [None]:

# Translate one unseen question into SQL with beam search and show the result.
new_question = "What team has more than 49 laps and a grid of 8?"
input_ids = tokenizer.encode(new_question, return_tensors='pt')

generation_options = dict(max_length=100, num_beams=5, early_stopping=True)
outputs = model.generate(input_ids=input_ids, **generation_options)

# Decode the first returned sequence, dropping special tokens.
best_sequence = outputs[0]
sql_query = tokenizer.decode(best_sequence, skip_special_tokens=True)

print(f"Question: {new_question}")
print(f"Generated SQL query: {sql_query}")


In [None]:
# Display the sampled rows that were used for fine-tuning.
df2

In [None]:
# NOTE(review): the original cell looked up a column with an empty name, which
# raises KeyError unless the CSV really has such a column -- presumably an
# unfinished lookup; confirm the intent. Preview the two columns this notebook
# actually uses so the cell runs cleanly.
df2[['question', 'sql']].head()