In [None]:
!pip install transformers datasets torch
!pip install accelerate -U
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.66.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import random

# Load the train split and keep only its first 20% so fine-tuning stays fast.
# Rows are taken in stored order (no shuffling), so the subset is deterministic.
dataset = load_dataset('b-mc2/sql-create-context', split="train")
subset_size = int(len(dataset) * 0.2)
dataset = dataset.select(range(subset_size))

# Split the subset 80/20 into train and validation sets.
# `select` is used for both halves instead of `Dataset.from_dict(dataset[a:b])`,
# which would pull every row into Python dicts and then rebuild new tables
# from them; the selected rows are the same either way.
train_size = int(len(dataset) * 0.8)
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, len(dataset)))

# Initialize the tokenizer and model from the pre-trained T5 checkpoint
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Define a function to preprocess the data
def preprocess_function(examples):
    """Tokenize a batch of text-to-SQL examples for seq2seq training.

    Args:
        examples: Batch mapping with parallel lists under the keys
            'question', 'context' (the CREATE TABLE schema) and 'answer'
            (the target SQL query).

    Returns:
        The tokenizer output for the combined "question [schema] context"
        inputs, with an added "labels" entry holding the tokenized answers.
        Padding positions in the labels are set to -100.
    """
    # Combine question and schema into a single input string
    inputs = [q + " [schema] " + s for q, s in zip(examples['question'], examples['context'])]
    # Tokenize inputs and outputs; both are padded to a fixed length so the
    # default collator can stack them without a padding-aware data collator.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(examples['answer'], max_length=512, truncation=True, padding='max_length')
    # Replace pad ids in the labels with -100 so the loss ignores padding.
    # Without this, most of each 512-token target is padding and the model is
    # trained mainly to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batches with the same preprocessing function
tokenized_train, tokenized_val = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Hyperparameters for the fine-tuning run, gathered in one place
training_config = {
    "output_dir": './results',         # where checkpoints are written
    "evaluation_strategy": "epoch",    # evaluate once per epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "save_total_limit": 1,             # keep only one checkpoint on disk
}
training_args = TrainingArguments(**training_config)

# Wire the model, arguments and tokenized splits into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side
for artifact in (model, tokenizer):
    artifact.save_pretrained('text-to-sql-model')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Restore the fine-tuned tokenizer and model from the directory written
# by save_pretrained
model_path = 'text-to-sql-model'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def generate_sql(question, schema):
    """Generate a SQL query for a natural-language question.

    Uses the module-level `tokenizer` and `model`.

    Args:
        question: The natural-language question.
        schema: The table schema text (e.g. a CREATE TABLE statement),
            appended to the question after a " [schema] " separator, the
            same format used when building training inputs.

    Returns:
        The decoded output of the best beam, with special tokens removed.
    """
    # Combine question and schema into a single input string
    input_text = question + " [schema] " + schema
    # Tokenize the input and move it to whatever device the model lives on;
    # leaving the tensors on CPU fails when the model is on a GPU.
    encoded = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)
    encoded = encoded.to(model.device)
    # Generate SQL query with beam search, passing the attention mask explicitly
    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=512,
        num_beams=4,
        early_stopping=True,
    )
    # Decode the best sequence
    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return sql_query

# Demo: ask a counting question against a one-table schema
question, schema = (
    "How many houses are there in New York?",
    "CREATE TABLE houses (id INTEGER, address VARCHAR, city VARCHAR)",
)
sql_query = generate_sql(question, schema)
print(sql_query)

SELECT COUNT(*) FROM houses WHERE address = "New York" AND city = "New York"


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

# Recursively copy the saved model directory ('text-to-sql-model', written by
# save_pretrained earlier in this notebook) into the top level of "My Drive".
# The destination is quoted because the path contains a space.
!cp -r text-to-sql-model "/content/drive/My Drive/"

Mounted at /content/drive
