In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import DataLoader, Dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data/wikisql/wikisql_schema_train.csv')
df2 = df.sample(100)

In [3]:
questions = df2['question'].tolist()
sql_queries = df2['sql'].tolist()

In [4]:
questions

['What is the Name of the power station with a Capacity of 25 MW?',
 'What was the corrected time of sail number aus70?',
 'Which Nationality has a Player of keith bogans, and a Years in Orlando of 2006–2009?',
 'What grade did jockey Robby Albarado get when racing with Ravens Pass?',
 'In what place(s) did the player(s) with a total less than 282 finish?',
 'What is the number of games for less than 2 seasons and more than 7 draws?',
 'What is every 5-year peek with a when Anatoly Karpov, 2820 is 15-year peak?',
 'What is the rating for the episode with the night rank of 11 and timeslot rank is larger than 4?',
 'What was the percentage of total votes in 1997?',
 'Which Away team has a Tie no of 4?',
 'What is the earliest year Stuart Janney III was an owner?',
 'What is GDP (PPP) Per Capita 2012 EU27 = 100, when GDP 2012 Millions Of Euro is 309,900?',
 'What are the total lanes that have a rank larger than 22?',
 'Which Melbourne had a gold coast and sydney which were yes, but an ade

In [5]:
sql_queries

['SELECT Name FROM table WHERE Capacity (MW) = 25',
 'SELECT Corrected Time d:hh:mm:ss FROM table WHERE Sail Number = AUS70',
 'SELECT Nationality FROM table WHERE Player = keith bogans AND Years in Orlando = 2006–2009',
 'SELECT Grade FROM table WHERE Jockey = robby albarado AND Runner up/Winner = ravens pass',
 'SELECT Finish FROM table WHERE Total < 282',
 'SELECT SUM total games FROM table WHERE number of seasons < 2 AND Draw > 7',
 'SELECT 5-year peak FROM table WHERE 15-year peak = Anatoly Karpov, 2820',
 'SELECT AVG Rating FROM table WHERE Night Rank = 11 AND Timeslot Rank > 4',
 'SELECT % of total vote FROM table WHERE Year = 1997',
 'SELECT Away team FROM table WHERE Tie no = 4',
 'SELECT MIN Year FROM table WHERE Owner = stuart janney iii',
 'SELECT GDP (PPP) per capita 2012 EU27 = 100 FROM table WHERE GDP 2012 millions of euro = 309,900',
 'SELECT SUM Lane FROM table WHERE Rank > 22',
 'SELECT Melbourne FROM table WHERE Gold Coast = yes AND Adelaide = no AND Sydney = yes',
 

In [19]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
tokenized_inputs = tokenizer.batch_encode_plus(questions, padding=True, truncation=True, return_tensors='pt')
tokenized_outputs = tokenizer.batch_encode_plus(sql_queries, padding=True, truncation=True, return_tensors='pt')

In [20]:
class SQLOnlineDataset(Dataset):
    def __init__(self, tokenized_inputs, tokenized_outputs):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.labels = tokenized_outputs['input_ids']
        self.decoder_attention_mask = tokenized_outputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
            'decoder_attention_mask': self.decoder_attention_mask[idx]
        }


In [21]:
train_dataset = SQLOnlineDataset(tokenized_inputs, tokenized_outputs)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

In [23]:
optimizer = AdamW(model.parameters(), lr=3e-5)

In [24]:

num_epochs = 10
for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_attention_mask = batch['decoder_attention_mask'].to(device)
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask,
            return_dict=True
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

Epoch: 1, Loss: 1.234610676765442
Epoch: 2, Loss: 0.9591853618621826
Epoch: 3, Loss: 0.935570478439331
Epoch: 4, Loss: 0.5132601857185364
Epoch: 5, Loss: 0.8935773372650146
Epoch: 6, Loss: 0.4368203580379486
Epoch: 7, Loss: 0.4976617097854614
Epoch: 8, Loss: 0.3810380697250366
Epoch: 9, Loss: 0.45266959071159363
Epoch: 10, Loss: 0.5406204462051392


In [25]:
model.save_pretrained('model/wikisql_t5_model')
tokenizer.save_pretrained('model/sql_t5_tokenizer')

('model/sql_t5_tokenizer\\tokenizer_config.json',
 'model/sql_t5_tokenizer\\special_tokens_map.json',
 'model/sql_t5_tokenizer\\spiece.model',
 'model/sql_t5_tokenizer\\added_tokens.json')

In [26]:
torch.save(model.state_dict(), 'model/wikisql_model.pt')

In [27]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.load_state_dict(torch.load('model/wikisql_model.pt'))
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [28]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [29]:

new_question = "How many heads of the departments are older than 56 ?"
input_ids = tokenizer.encode(new_question, return_tensors='pt')
outputs = model.generate(input_ids=input_ids, max_length=100, num_beams=5, early_stopping=True)
sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Question: {new_question}")
print(f"Generated SQL query: {sql_query}")


Question: How many heads of the departments are older than 56 ?
Generated SQL query: How many heads of departments are older than 56?


In [33]:
input_ids

tensor([[  571,   186,  7701,    13,     8, 10521,    33,  2749,   145, 11526,
             3,    58,     1]])