In [1]:
import json
from openai import OpenAI
import os
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably this is where the OpenAI API key lives -- confirm).
load_dotenv()

# Shared API client used by every cell below; it is constructed with no
# explicit arguments, so its configuration must come from the environment.
client = OpenAI()

Load the data


In [2]:
# Read the dataset from a parquet file located relative to the notebook's
# working directory. Later cells rely on its "question" and "query" columns.
sql_data = pd.read_parquet("train-00000-of-00001.parquet")

Create a function to prepare the data


In [6]:
# System prompt placed at the start of every training conversation.
system_message = (
    "You are a helpful SQL coding assistant, "
    "you are to form SQL queries based on questions asked"
)


def prepare_example_conversation(row):
    """Convert one dataset row into a chat-format fine-tuning example.

    Args:
        row: Any mapping-like object (e.g. a pandas Series) exposing a
            "question" entry (natural-language request) and a "query"
            entry (the SQL answer).

    Returns:
        A dict with a single "messages" key holding three chat turns in
        order: the shared system prompt, the user question, and the
        assistant's SQL query.
    """
    turns = (
        ("system", system_message),
        ("user", row["question"]),
        ("assistant", row["query"]),
    )
    return {"messages": [{"role": speaker, "content": text} for speaker, text in turns]}


# Sanity check: pretty-print the chat example built from the first dataset row.
pprint(prepare_example_conversation(sql_data.iloc[0]))

{'messages': [{'content': 'You are a helpful SQL coding assistant, you are to '
                          'form SQL queries based on questions asked',
               'role': 'system'},
              {'content': 'How many heads of the departments are older than 56 '
                          '?',
               'role': 'user'},
              {'content': 'SELECT count(*) FROM head WHERE age  >  56',
               'role': 'assistant'}]}


In [None]:
# Number of leading rows used for training; every remaining row is held out
# for validation. Kept in one place so both slices always agree.
TRAIN_SIZE = 5600

print(sql_data.shape)
# The split is positional: rows are taken in file order, with no shuffling.
train_df = sql_data.iloc[:TRAIN_SIZE]
print(train_df.shape)
validation_df = sql_data.iloc[TRAIN_SIZE:]
print(validation_df.shape)

# Fail early with a clear message instead of a confusing downstream error
# if the dataset is too small for the configured split.
assert len(train_df) > 0 and len(validation_df) > 0, (
    f"TRAIN_SIZE={TRAIN_SIZE} leaves an empty split for {len(sql_data)} rows"
)

# Convert each split to a list of chat-format examples. The DataFrames keep
# their own names so `train_data` / `validation_data` are only ever lists.
train_data = train_df.apply(prepare_example_conversation, axis=1).to_list()
validation_data = validation_df.apply(prepare_example_conversation, axis=1).to_list()

(7000, 6)
(5600, 6)
(1400, 6)


Write to JSONL files


In [24]:
def write_jsonl(data_list: list, filename: str) -> None:
    """Write each item of ``data_list`` to ``filename`` as one JSON document per line.

    The file is created (or overwritten) and is always encoded as UTF-8 with
    LF line endings, regardless of the platform's default text encoding or
    newline translation, so the output is byte-for-byte reproducible.

    Args:
        data_list: JSON-serialisable objects, written in iteration order.
        filename: Destination path.

    Raises:
        TypeError: If an item cannot be serialised by ``json.dumps``.
        OSError: If the file cannot be opened for writing.
    """
    # newline="\n" disables platform newline translation (e.g. CRLF on Windows).
    with open(filename, "w", encoding="utf-8", newline="\n") as out:
        for record in data_list:
            out.write(json.dumps(record) + "\n")


# Destination paths for the two splits; later cells upload these files.
training_file_name = "training.jsonl"
validation_file_name = "validation.jsonl"

# Serialise each split to its own JSONL file.
write_jsonl(train_data, training_file_name)
write_jsonl(validation_data, validation_file_name)

In [45]:
def upload_file(file_name: str, purpose: str) -> str:
    """Upload a local file via the module-level ``client`` and return its id.

    Args:
        file_name: Path of the file to upload; it is opened in binary mode.
        purpose: Value forwarded unchanged as the ``purpose`` argument of
            ``client.files.create``.

    Returns:
        The ``id`` attribute of the object returned by the upload call.
    """
    with open(file_name, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose=purpose)
        return uploaded.id


# Both files are uploaded with the same purpose tag.
upload_purpose = "fine-tune"
training_file_id = upload_file(training_file_name, upload_purpose)
validation_file_id = upload_file(validation_file_name, upload_purpose)

# Echo the returned ids; the fine-tuning job cell references them.
print(f"Training file ID: {training_file_id}")
print(f"Validation file ID: {validation_file_id}")

Training file ID: file-MuCMdHXPKfN7ayZPBYpTaY
Validation file ID: file-Qh8Y1jMvNzkoXwsNG7L2uR


In [47]:
# Base model to fine-tune.
MODEL = "gpt-3.5-turbo"

# Start a fine-tuning job on the uploaded training/validation files.
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model=MODEL,
    # Label attached to the fine-tuned model. The previous value "recipe-ner"
    # was a leftover from an unrelated task and mislabelled this SQL model.
    suffix="text-to-sql",
)

# Keep the job id so later cells can poll status and list events.
job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)

Job ID: ftjob-Fq9LSZ9miWheThL6Fr9mie8f
Status: validating_files


In [48]:
# Poll the job created above; re-run this cell to refresh the status.
response = client.fine_tuning.jobs.retrieve(job_id)

print(f"Job ID: {response.id}")
print(f"Status: {response.status}")
print(f"Trained Tokens: {response.trained_tokens}")

Job ID: ftjob-Fq9LSZ9miWheThL6Fr9mie8f
Status: validating_files
Trained Tokens: None


In [44]:
# Fetch the event log for the job.
response = client.fine_tuning.jobs.list_events(job_id)

# Reverse the returned list in place before printing; the captured output
# suggests this yields oldest-first order -- confirm against the API.
events = response.data
events.reverse()

for message in (event.message for event in events):
    print(message)

Created fine-tuning job: ftjob-HarNc5RHL68iqgIfH46rU5Ur
Validating training file: file-HvUwgMUoNQxByeWgySnKpE and validation file: file-WopdyeHHBmdX31HLpHe3tJ
