# Install and import packages


Before running the code:

First, make sure you have already exported your OpenAI API key as an environment variable in your terminal.

Second, run the following code in the terminal to install openai and the required packages (if this hasn't been done): 
```bash
python.exe -m pip install --upgrade pip   

pip install openai     

pip install -r install_packages.txt 
```

In [1]:
# Import the necessary packages
from openai import OpenAI
import csv
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict
import requests
import chardet # to detect the encoding of the csv file to be converted

# Prepare datasets

First, convert the CSV training, validation and test data into JSONL files, using the function below.

In [None]:
# function to convert a csv file into a jsonl file, as the gpt model to be fine-tuned requires jsonl file format as input
def csv_to_jsonl(input_csv_path, output_jsonl_path):
    """Convert a CSV file of chat examples into a JSONL file.

    Every CSV row is written as one JSON line of the form
    ``{"messages": [system, user, assistant]}``, with the message contents
    taken from the ``system``, ``user`` and ``assistant`` columns.

    Args:
        input_csv_path: Path of the CSV file to read. Its encoding is guessed
            by ``chardet`` from the raw bytes of the file.
        output_jsonl_path: Path of the JSONL file to write (UTF-8).

    Raises:
        KeyError: If the CSV header lacks one of the three expected columns.
    """
    # First detect the encoding
    with open(input_csv_path, mode='rb') as rawfile:
        result = chardet.detect(rawfile.read())
        encoding = result['encoding']

    # Open the CSV file and create the JSONL file.
    # newline='' is what the csv module asks for: without it, line breaks
    # embedded in quoted fields can be altered by newline translation.
    with open(input_csv_path, mode='r', encoding=encoding, newline='') as csv_file, \
        open(output_jsonl_path, mode='w', encoding='utf-8') as jsonl_file:

        # Create a CSV DictReader (the first row supplies the column names)
        csv_reader = csv.DictReader(csv_file)

        # Loop through each row in the CSV
        for row in csv_reader:
            # Create the structured data for a single conversation
            structured_data = {
                "messages": [
                    {"role": "system", "content": row['system']},
                    {"role": "user", "content": row['user']},
                    {"role": "assistant", "content": row['assistant']}
                ]
            }

            # ensure_ascii=False keeps special characters (e.g., Chinese
            # characters) in their original form instead of \u escapes
            jsonl_file.write(json.dumps(structured_data, ensure_ascii=False) + '\n')

        

In [None]:

# Convert one CSV dataset to JSONL: edit both paths below, then run the cell.
input_csv_path = 'path_to_the_csv_training_or_validation_dataset.csv'  # The file path of the csv dataset
output_jsonl_path = 'path_to_the_jsonl_training_or_validation_dataset.jsonl' # The jsonl file path where you want to save the file after conversion
csv_to_jsonl(input_csv_path, output_jsonl_path)

# Check the datasets

The following section is to preprocess and analyze the datasets used for fine-tuning a GPT model. It checks for format errors, provides basic statistics, and estimates token counts for fine-tuning costs. 

For more information, please check the OpenAI notebook page here: https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [None]:
# Data loading #

data_path = "path_to_the_dataset.jsonl" # enter your jsonl file dataset to be checked

# Load the dataset: one JSON object per line.
# Blank lines (for instance a trailing empty line at the end of the file)
# are skipped, because json.loads would raise on them.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num examples:", len(dataset))
if dataset:
    print("First example:")
    for message in dataset[0]["messages"]:
        print(message)
else:
    # Nothing to preview; avoids an IndexError on dataset[0]
    print("The dataset is empty")

In [None]:
# Data format validation / Format error checks #

# Maps an error name to the number of times it was encountered
format_errors = defaultdict(int)

for example in dataset:
    # Each example has to be a JSON object
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # ... that carries a non-empty "messages" entry
    conversation = example.get("messages")
    if not conversation:
        format_errors["missing_messages_list"] += 1
        continue

    for entry in conversation:
        # Both mandatory keys must be present
        if not ("role" in entry and "content" in entry):
            format_errors["message_missing_key"] += 1

        # Every key must belong to the accepted set
        if not all(key in ("role", "content", "name", "function_call", "weight") for key in entry):
            format_errors["message_unrecognized_key"] += 1

        # The role must be one of the four accepted values
        role = entry.get("role")
        if role not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        # The content must be a string, and either the content or the
        # function call must be non-empty
        text = entry.get("content")
        call = entry.get("function_call")
        if not isinstance(text, str) or not (text or call):
            format_errors["missing_content"] += 1

    # At least one assistant message is expected per example
    if all(entry.get("role") != "assistant" for entry in conversation):
        format_errors["example_missing_assistant_message"] += 1

# Report
if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")

In [8]:
# Define some token Counting Utilities #

# Module-level tokenizer used by num_tokens_from_messages and
# num_assistant_tokens_from_messages below.
# NOTE(review): "cl100k_base" presumably matches the base model being fine-tuned — confirm when switching models.
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the number of tokens used by a list of chat messages.

    The result is an approximation: string values are tokenized with the
    module-level ``encoding`` and fixed overheads are added on top.

    Args:
        messages: Iterable of message dicts (e.g. with "role" and "content").
        tokens_per_message: Overhead added once for every message.
        tokens_per_name: Extra overhead added when a message has a "name" key.

    Returns:
        The estimated token count as an int (3 for an empty conversation).
    """
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            # Only text can be tokenized. Non-string values, such as a
            # numeric "weight" or a "function_call" object (both accepted by
            # the format checks), would make encode() raise, so they are left
            # out of the estimate.
            if isinstance(value, str):
                num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    # Constant per-conversation overhead
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    """Return the total token count of the contents of all assistant messages.

    Messages with any other role are ignored; the module-level ``encoding``
    performs the tokenization.
    """
    return sum(
        len(encoding.encode(entry["content"]))
        for entry in messages
        if entry["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics of a sequence of numbers.

    Prints the minimum and maximum, the mean and median, and the 5th and
    95th percentiles.

    Args:
        values: Non-empty sequence of numbers (``min``/``max`` raise
            ``ValueError`` on an empty sequence).
        name: Label shown in the heading line.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The quantiles must match the "p5 / p95" label: 0.05 and 0.95
    # (0.1 and 0.9 would be p10 / p90)
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [None]:
# Data warnings and tokens counts #

# Per-example statistics, collected in dataset order
n_messages = []
convo_lens = []
assistant_message_lens = []
# Number of examples without any system / user message
n_missing_system = 0
n_missing_user = 0

for example in dataset:
    conversation = example["messages"]
    if all(entry["role"] != "system" for entry in conversation):
        n_missing_system += 1
    if all(entry["role"] != "user" for entry in conversation):
        n_missing_user += 1
    n_messages.append(len(conversation))
    convo_lens.append(num_tokens_from_messages(conversation))
    assistant_message_lens.append(num_assistant_tokens_from_messages(conversation))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
for stats, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(stats, label)

# Count the conversations whose estimated length exceeds 16385 tokens
n_too_long = sum(1 for length in convo_lens if length > 16385)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

In [None]:
# Pricing and default n_epochs estimate #

MAX_TOKENS_PER_EXAMPLE = 16385 # insert the context window of the base GPT model to be fine-tuned

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

# Start from the target epoch count and adjust it when the number of
# examples seen over those epochs falls outside the
# [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES] range.
n_train_examples = len(dataset)
examples_seen = n_train_examples * TARGET_EPOCHS
if examples_seen < MIN_TARGET_EXAMPLES:
    # Small dataset: more epochs, capped at MAX_DEFAULT_EPOCHS
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_seen > MAX_TARGET_EXAMPLES:
    # Large dataset: fewer epochs, but at least MIN_DEFAULT_EPOCHS
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example counts for at most MAX_TOKENS_PER_EXAMPLE tokens
billed_lengths = [min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens]
n_billing_tokens_in_dataset = sum(billed_lengths)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Carry out a fine-tuning job

After checking the datasets, start a fine-tuning job with the given training and validation datasets
using the following function.

In [4]:
# Function to fine-tune a gpt model with a given training data and validation data
def fine_tuning_gpt_model(training_data_path, validation_data_path, gpt_model_name, fine_tuned_suffix):
    """Upload the training and validation files and start a fine-tuning job.

    Args:
        training_data_path: Path of the JSONL training dataset.
        validation_data_path: Path of the JSONL validation dataset.
        gpt_model_name: Name of the GPT model to be fine-tuned.
        fine_tuned_suffix: Suffix for the fine-tuned model.

    Returns:
        The object returned by ``client.fine_tuning.jobs.create``. Callers
        that do not need it can ignore the return value.
    """
    from openai import OpenAI
    # A single client serves both uploads and the job creation
    client = OpenAI()

    # upload the training data and get its id; the context manager closes the
    # file handle once the upload call has returned
    with open(training_data_path, "rb") as training_file:
        training_response = client.files.create(
            file=training_file,
            purpose="fine-tune"
        )
    training_file_id = training_response.id

    # upload the validation data and get its id
    with open(validation_data_path, "rb") as validation_file:
        validation_response = client.files.create(
            file=validation_file,
            purpose="fine-tune"
        )
    validation_file_id = validation_response.id

    # fine-tune a gpt model
    return client.fine_tuning.jobs.create(
        training_file=training_file_id,
        validation_file=validation_file_id,
        model=gpt_model_name,
        suffix=fine_tuned_suffix  # suffix for the fine-tuned model
    )
  


When calling the fine-tuning function, indicate the training data path, validation data path, the name of the gpt model to be fine-tuned, and the suffix for the fine-tuned model.

In [None]:
# Module-level client. fine_tuning_gpt_model creates its own client internally;
# this global is the one test_ft_model reads when it is called.
client = OpenAI()
training_data_path = "path_to_the_training_dataset.jsonl" # the training data jsonl file path
validation_data_path = "path_to_the_validation_dataset.jsonl" # the validation data jsonl file path
gpt_model_name = "gpt-3.5-turbo-0125" # the gpt model to be fine-tuned
fine_tuned_suffix = "suffix_for_the model" # the suffix to be put in the fine-tuned model's name

# Upload both datasets and start the fine-tuning job
fine_tuning_gpt_model(training_data_path = training_data_path, validation_data_path = validation_data_path, gpt_model_name = gpt_model_name, fine_tuned_suffix = fine_tuned_suffix)

# Model performance evaluation

Test the fine-tuned model and the original GPT model with the following function to compare their performance.

In [None]:
# Function to test the model with the test dataset and write output to a csv file
def test_ft_model(model_name, data_path, csv_path):
    """Run a model over a JSONL test set and save its answers to a CSV file.

    For each conversation in the test file the model is queried once, and a
    row ``system, user, assistant`` is written, where ``assistant`` is the
    model's own answer.

    Args:
        model_name: Name of the model to query.
        data_path: Path of the JSONL test dataset (one conversation per line,
            each with a "messages" list).
        csv_path: Path of the CSV file to write.

    Note:
        Uses the module-level ``client``, which must be defined before this
        function is called.
    """
    # Open the CSV file for writing
    # "utf-8-sig" adds BOM (Byte Order Mark) at the start when writing the csv file, which helps some programs(e.g. Excel) to identify UTF-8 encoding (e.g., Chinese characters).
    with open(csv_path, mode='w', newline='', encoding='utf-8-sig') as csv_file:
        # Create a CSV writer
        csv_writer = csv.writer(csv_file)
        # Write the header row
        csv_writer.writerow(['system', 'user', 'assistant'])

        # Open and read the test data file
        with open(data_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Skip blank lines (e.g. a trailing empty line)
                if not line.strip():
                    continue

                # Each line is a JSON object representing a conversation instance
                conversation = json.loads(line)
                messages = conversation["messages"]

                # Extract the first system and user contents ("" when absent)
                system_content = next((msg["content"] for msg in messages if msg["role"] == "system"), "")
                user_content = next((msg["content"] for msg in messages if msg["role"] == "user"), "")

                # Drop the trailing assistant message(s): they hold the
                # reference answer, and sending them would show the model the
                # expected output instead of letting it answer on its own.
                prompt_messages = list(messages)
                while prompt_messages and prompt_messages[-1]["role"] == "assistant":
                    prompt_messages.pop()

                # Pass the prompt to the model
                completion = client.chat.completions.create(
                    model=model_name,
                    messages=prompt_messages
                )

                # Get the model's completion (assistant's response) for the current conversation instance
                assistant_content = completion.choices[0].message.content

                # Write the current conversation to the CSV file
                csv_writer.writerow([system_content, user_content, assistant_content])



Prepare the test dataset (skip this step if it's already prepared in the "prepare datasets" step)

In [None]:
# convert the test dataset into a jsonl file
input_csv_path = "path_to_the_csv_test_dataset.csv"  # the csv test dataset to read
output_jsonl_path = "path_to_the_jsonl_test_dataset.jsonl"  # the jsonl file to be written
csv_to_jsonl(input_csv_path, output_jsonl_path)

Test the fine-tuned GPT model

In [None]:
# Global client read by test_ft_model when it queries the model
client = OpenAI()
# The fine_tuned model to be tested
model_name = "ft:gpt-the-fine-tuned-model's-name" # insert the fine-tuned model's name, which can be found at: https://platform.openai.com/finetune

# Path to your test dataset and the output CSV file
test_data_path = "path_to_the_test_dataset.jsonl"
output_csv_path = "path_to_the_output_dataset.csv"

# Call the function to test the model
test_ft_model(model_name, test_data_path, output_csv_path)

Test the non-fine-tuned base GPT model

In [None]:
# Global client read by test_ft_model when it queries the model
client = OpenAI()
# The original non-fine-tuned gpt model
model_name_original = "gpt-3.5-turbo-0125" # insert the original gpt model's name

# Path to your test dataset and the output CSV file
test_data_path = "path_to_the_test_dataset.jsonl"
# NOTE: test_ft_model opens this file in write mode, so use a file name different
# from the one used for the fine-tuned model, otherwise those results are overwritten.
output_original_csv_path = "path_to_the_output_dataset.csv"

# Call the function to test the model
test_ft_model(model_name_original, test_data_path, output_original_csv_path)