In [None]:
import torch
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
)

# Base checkpoint to fine-tune. Must be a GPT-2 style checkpoint (Hub id or
# local directory) because the GPT2Tokenizer / GPT2LMHeadModel classes are used.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Text file with pairs of English queries and SQL queries
dataset_path = "english_to_sql_dataset.txt"

# Directory that receives the checkpoints and the final fine-tuned model
output_dir = "./english_to_sql_finetuned"

# Tokenize the file and cut it into fixed-size blocks for causal LM training
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=128,  # the block size can be adjusted
)

# Create a data collator; mlm=False selects causal (next-token) language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

# Define training arguments.
# Periodic evaluation is intentionally not enabled: the Trainer below is given
# no eval_dataset, and requesting evaluation without one raises an error.
# To evaluate during training, pass an eval_dataset to Trainer and re-enable
# the evaluation strategy / eval_steps here.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,  # the epochs can be adjusted
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model together with its tokenizer so the output
# directory can be reloaded on its own with from_pretrained()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [1]:
# NOTE: first attempt, kept commented out for reference only. It failed with
# "TypeError: list indices must be integers or slices, not str" because the
# unpickled `data` is a plain list (of dicts), not a DataFrame, so
# data['instruction'] cannot be used to select a column.
# import pandas as pd

# # Load the dataset from the pickle file
# file_path = 'NSS_file.pkl'
# data = pd.read_pickle(file_path)

# # Combine 'instruction' and 'output' columns for tokenization
# instructions = data['instruction'].tolist()
# outputs = data['output'].tolist()
# combined_queries = instructions + outputs

# # Tokenize and find unique tokens
# unique_tokens = set()
# for query in combined_queries:
#     # Simple tokenization based on whitespace and basic SQL syntax
#     # This will not perfectly tokenize all SQL elements (like strings with spaces, etc.)
#     tokens = query.replace('(', ' ( ').replace(')', ' ) ').replace(',', ' , ').split()
#     unique_tokens.update(tokens)

# # Output the number of unique tokens
# len(unique_tokens), unique_tokens


TypeError: list indices must be integers or slices, not str

In [2]:
import pandas as pd

# Load the dataset in this cell so that `data` exists on a fresh kernel
# (Restart & Run All). The only other load of this file lives in a cell that
# is commented out, so without this line `data` would be undefined here.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
data = pd.read_pickle('NSS_file.pkl')

# Check the type of the loaded data to understand its structure
type(data)


list

In [4]:
# Display the first few elements of the list to understand its structure
#data[:5]


In [5]:
# Since the data is a list of dictionaries with keys 'instruction' and 'output',
# we can extract these and find the unique tokens across all entries.

# Accumulator for the distinct tokens found across all entries; because it is
# a set, a token that occurs many times is stored only once.
unique_tokens_set = set()

# Function to tokenize a SQL query
def tokenize_sql(sql):
    """Split a query string into whitespace-delimited tokens.

    Each parenthesis and comma is surrounded with spaces first, so it comes
    out as a token of its own. Nothing else is treated specially: quoted
    string literals containing spaces are broken apart, and other punctuation
    stays attached to the neighbouring word.

    Args:
        sql: The text to tokenize.

    Returns:
        A list of token strings (empty if the text has no non-space content).
    """
    spaced = sql
    for symbol in '(),':
        spaced = spaced.replace(symbol, ' ' + symbol + ' ')
    return spaced.split()

# Walk over the entries and fold the tokens of both text fields into the set
for entry in data:
    # Entries missing either key are skipped
    if not ('instruction' in entry and 'output' in entry):
        continue
    # Both fields are tokenized before the set is updated
    unique_tokens_set.update(
        tokenize_sql(entry['instruction']),
        tokenize_sql(entry['output']),
    )

# Display the number of unique tokens and the first 10 of them as a sample
(len(unique_tokens_set), list(unique_tokens_set)[:10])


(358931,
 ['table_44088',
  "'i''m",
  'Approaches',
  'Warriors?',
  'FirstName',
  "'0643879'",
  '"Sector',
  "nahuatl'",
  '"Judy',
  "häkkinen'"])