## Finetuning OpenAI

### Imports

In [21]:
import json
import re
import os
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [80]:
def prepare_data(data):

    new_data = []
    data_length = len(data)
    print("DATA LENGTH: ",data_length)
    for i in range(data_length-4): # since we have next 3 events as the wrong choices, and next entities as right choices
        correct_set = data[i]

        # remove all punctuation except periods, commas, and apostrophes
        pattern = r'[^\w\s.,\']'

        total_context = re.sub(pattern, '', correct_set['total_context']) if correct_set['total_context'] else ""


        # remove multiple spaces (causing issues with tiktoken)
        event_desc_i = re.sub(pattern, '', data[i+1]['event']['description']) if data[i+1]['event']['description'] else ""
        character_desc_i = re.sub(pattern, '',  data[i+1]['character']['description']) if data[i+1]['character']['description'] else ""
        place_desc_i = re.sub(pattern, '',  data[i+1]['place']['description']) if data[i+1]['place']['description'] else ""

        event_desc_i_next = re.sub(pattern, '', data[i+2]['event']['description']) if data[i+2]['event']['description'] else ""
        character_desc_i_next = re.sub(pattern, '', data[i+2]['character']['description']) if data[i+2]['character']['description'] else ""
        place_desc_i_next = re.sub(pattern, '', data[i+2]['place']['description']) if data[i+2]['place']['description'] else ""

        event_desc_i_next2 = re.sub(pattern, '', data[i+3]['event']['description']) if data[i+3]['event']['description'] else ""
        character_desc_i_next2 = re.sub(pattern, '', data[i+3]['character']['description']) if data[i+3]['character']['description'] else ""
        place_desc_i_next2 = re.sub(pattern, '', data[i+3]['place']['description']) if data[i+3]['place']['description'] else ""

        event_desc_i_next3 = re.sub(pattern, '', data[i+4]['event']['description']) if data[i+4]['event']['description'] else ""
        character_desc_i_next3 = re.sub(pattern, '', data[i+4]['character']['description']) if data[i+4]['character']['description'] else ""
        place_desc_i_next3 = re.sub(pattern, '', data[i+4]['place']['description']) if data[i+4]['place']['description'] else ""


        new_data.append({
            "messages": [
            {
                "role": "system",
                "content": total_context,
            }, 
            {
                "role": "user",
                "content": "What is the next event in this context? Who is the main character in that event? What is the scene location?"
            },

            # correct answer set
            {
                "role": "assistant",
                "content": 
                  f"<event_name>{data[i+1]['event']['name']}</event_name>"
                  f"<event_description>{event_desc_i}</event_description>"
                  f"<character_name>{data[i+1]['character']['name']}</character_name>"
                  f"<character_description>{character_desc_i}</character_description>"
                  f"<place_name>{data[i+1]['place']['name']}</place_name>"
                  f"<place_description>{place_desc_i}</place_description>"
                ,
                "weight": 1
            },

            # incorrect answer sets
            {
                "role": "assistant",
                "content": 
                    f"<event_name>{data[i+2]['event']['name'].strip()}</event_name>"
                    f"<event_description>{event_desc_i_next}</event_description>"
                    f"<character_name>{data[i+2]['character']['name'].strip()}</character_name>"
                    f"<character_description>{character_desc_i_next}</character_description>"
                    f"<place_name>{data[i+2]['place']['name'].strip()}</place_name>"
                    f"<place_description>{place_desc_i_next}</place_description>"
                ,
                "weight": 0
            },
            {
                "role": "assistant",
                "content": 
                   f"<event_name>{data[i+3]['event']['name']}</event_name>"
                   f"<event_description>{event_desc_i_next2}</event_description>"
                   f"<character_name>{data[i+3]['character']['name']}</character_name>"
                   f"<character_description>{character_desc_i_next2}</character_description>"
                   f"<place_name>{data[i+3]['place']['name']}</place_name>"
                   f"<place_description>{place_desc_i_next2}</place_description>"
                ,
                "weight": 0
            },
            {
                "role": "assistant",
                "content": 
                    f"<event_name>{data[i+4]['event']['name']}</event_name>"
                    f"<event_description>{event_desc_i_next3}</event_description>"
                    f"<character_name>{data[i+4]['character']['name']}</character_name>"
                    f"<character_description>{character_desc_i_next3}</character_description>"
                    f"<place_name>{data[i+4]['place']['name']}</place_name>"
                    f"<place_description>{place_desc_i_next3}</place_description>"
                ,
                "weight": 0
            }]})
    return new_data

### Make training data as required by OpenAI

In [128]:
# bert_dataset_folder = "./bert_train_dataset"
bert_train_dataset_folder = "./bert_sample"
openai_train_dataset_folder = "./openai_train_dataset"

if not os.path.exists(openai_train_dataset_folder):
    os.makedirs(openai_train_dataset_folder)
    print("Directory 'openai_train_dataset' created successfully.")

for filename in os.listdir(bert_train_dataset_folder):
    if filename.endswith(".json"):
        file_path = os.path.join(bert_train_dataset_folder, filename)
        new_file_path = os.path.join(openai_train_dataset_folder, filename.replace("_bert_data.json", "_openai_data.jsonl"))
        
        with open(file_path, 'r') as file:
            prev_data = json.load(file)
            new_data = prepare_data(prev_data)
        
        if new_data is not None:
            with open(new_file_path, "w") as jsonl_file:
                for item in new_data:
                    json.dump(item, jsonl_file)
                    jsonl_file.write('\n')

        print(f"File '{new_file_path}' created successfully.")

DATA LENGTH:  79
File './openai_train_dataset/0pzw4q_openai_data.jsonl' created successfully.
DATA LENGTH:  11
File './openai_train_dataset/1mpsvd_openai_data.jsonl' created successfully.
DATA LENGTH:  26
File './openai_train_dataset/1kynba_openai_data.jsonl' created successfully.
DATA LENGTH:  59
File './openai_train_dataset/0tvryd_openai_data.jsonl' created successfully.
DATA LENGTH:  19
File './openai_train_dataset/1cq700_openai_data.jsonl' created successfully.
DATA LENGTH:  38
File './openai_train_dataset/0rz2vg_openai_data.jsonl' created successfully.
DATA LENGTH:  21
File './openai_train_dataset/0d4d57_openai_data.jsonl' created successfully.
DATA LENGTH:  59
File './openai_train_dataset/0qy2qe_openai_data.jsonl' created successfully.
DATA LENGTH:  24
File './openai_train_dataset/0dgp9x_openai_data.jsonl' created successfully.
DATA LENGTH:  33
File './openai_train_dataset/0gzcr1_openai_data.jsonl' created successfully.
DATA LENGTH:  15
File './openai_train_dataset/1rnyh8_openai_

### Verify data format correctness

Verify if data is according to the format required by OpenAI for finetuning GPT-3.5

In [None]:
openai_train_dataset_folder = "./openai_train_dataset"
files = os.listdir(openai_train_dataset_folder)

if files:
    first_file_path = os.path.join(openai_train_dataset_folder, files[0])

    # Load the dataset from the first file
    with open(first_file_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Initial dataset stats
    print("Num examples:", len(dataset))
    print("First example:")
    for message in dataset[0]["messages"]:
        print(message)
else:
    print("No files found in the folder:", openai_train_dataset_folder)


In [130]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


### Token Counting

In [131]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            if key == "weight":
                    continue
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [132]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 6, 6
mean / median: 6.0, 6.0
p5 / p95: 6.0, 6.0

#### Distribution of num_total_tokens_per_example:
min / max: 21, 21
mean / median: 21.0, 21.0
p5 / p95: 21.0, 21.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 3005, 6532
mean / median: 4997.875, 4829.5
p5 / p95: 3031.6, 6529.2

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [133]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~168 tokens that will be charged for during training
By default, you'll train for 12 epochs on this dataset
By default, you'll be charged for ~2016 tokens


# Finetuning .... finally

### Uploading training files

In [None]:
from openai import OpenAI

API_KEY = "sk-proj-JYHoORcm3bg4YuzFKDNsT3BlbkFJTCr1mK3WwSSlzkPJ1eMV"

client = OpenAI(api_key=API_KEY)

In [152]:
openai_train_dataset_folder = "./openai_train_dataset"
files = os.listdir(openai_train_dataset_folder)
training_file_IDs = []

# Loop through each file path and upload the file
for file_name in files:
    file_path = os.path.join(openai_train_dataset_folder, file_name)
    if os.path.exists(file_path):
        with open(file_path, "rb") as file:
            try:
                training_file_IDs.append(client.files.create(
                    file=file,
                    purpose="fine-tune"
                ))
            except:
                continue
        print(f"File '{file_path}' uploaded successfully.")
    else:
        print(f"File '{file_path}' does not exist.")


File './openai_train_dataset/1keseb_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/1t23cd_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/1kynba_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/0r48tt_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/0bykeg_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/00dg9y_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/1gaagh_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/0syapk_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/1rnyh8_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/0kvkn5_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/1qaw0a_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/1qbng7_openai_data.jsonl' uploaded successfully.
File './openai_train_dataset/0dgp9x_openai_data.jsonl' uploaded 

### Creating a sample job + checking it

We can see that the job has been successful since there are no errors in the "error" field. Uploaded one file because it took $6 per file to finetune. We can't pay anymore than we have to to show proof of work lol.

In [164]:
client.fine_tuning.jobs.create(
    training_file=training_file_IDs[0].id,
    model="gpt-3.5-turbo-0125"
)

fine_tuning_jobs = client.fine_tuning.jobs.list(limit=10)
keys = fine_tuning_jobs.data[0].__annotations__.keys()
print("Here are the keys to access the FineTuning Object: ", keys)

# see that there is no error
print(fine_tuning_jobs.data[0].error)

# get the fine-tuned model
model = fine_tuning_jobs.data[0].fine_tuned_model
if model is not None:
    completion = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": ""}, # the output of gpt-3.5 (Storyline Generation) model will be the content
        {"role": "user", "content": "What is the next event in this context? Who is the main character in that event? What is the scene location? "}
    ]
    )
answer = completion.choices[0].message.content # this answer will guide the other GPT-3.5 (Story Generation) model
print(answer)


Here are the keys to access the FineTuning Object:  dict_keys(['id', 'created_at', 'error', 'fine_tuned_model', 'finished_at', 'hyperparameters', 'model', 'object', 'organization_id', 'result_files', 'seed', 'status', 'trained_tokens', 'training_file', 'validation_file', 'estimated_finish', 'integrations'])
Error(code=None, message=None, param=None)
**Next Event**:  
The main character checks the bottom of the bowl for any leftover food.


**Main Character**:  
**Name:**  	Anna
**Role:**  	The main character


**Scene Location:**  
**Name:**  	the living room
**Description:**  	A room in the apartment with mismatched but cozy furniture. There is a large window letting in plenty of natural light.


### Note about more training data

The above code would apply with minor modifications had we sent more training files to the model. But since we don't have the resources (money) this was not done. The rest of the steps for getting the model etc stay the same.

In [None]:
# fine-tuning job for each uploaded file
# for fileObj in training_file_IDs:
#     client.fine_tuning.jobs.create(
#         training_file=fileObj.id,
#         model="gpt-3.5-turbo-0125"
#     )
#     print(f"Fine-tuning job created for file ID: {fileObj.id}")