### 1. Prepare training data

In [None]:
from datasets import load_dataset
import pandas as pd
import json

In [None]:
from IPython.display import Image
# Display the ScienceQA overview image from the project website.
Image("https://scienceqa.github.io/img/scienceqa.png")

In [None]:
# Load the ScienceQA dataset by its repository id via the `datasets` library.
dataset = load_dataset("derek-thomas/ScienceQA")
# Bare expression so the notebook displays the loaded object.
dataset

In [None]:
# Only the first 10 rows of the "train" split are used from here on.
dft = pd.DataFrame(dataset["train"][:10])
dft.head(2)

In [None]:
# Reference only: the chat-format record shape (a "messages" list of
# role/content dicts) that the training examples built below follow.
{
    "messages": [
        {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
        {"role": "user", "content": "What's the capital of France?"}, 
        {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}
    ]
}

In [None]:
# Option labels "A" through "Z", in order.
choice_prefixes = [chr(code) for code in range(ord("A"), ord("A") + 26)]

def format_options(row):
    """Render a row's choices as a single labelled string.

    Each entry of ``row["choices"]`` is paired with the next letter from
    ``choice_prefixes`` and rendered as ``"(A) first (B) second ..."``.
    Choices beyond the 26th are dropped because ``zip`` stops at the
    shorter of its inputs.
    """
    labelled = []
    for letter, option in zip(choice_prefixes, row["choices"]):
        labelled.append(f"({letter}) {option}")
    return " ".join(labelled)

In [None]:
# Add a column holding the labelled-options string for every row.
dft["choices_formatted"] = dft.apply(format_options, axis=1)
# Print the raw and the formatted choices of the row labelled 0 to compare them.
print(dft["choices"][0])
print(dft["choices_formatted"][0])

In [None]:
# Convert the zero-based answer index into the letter used in the formatted
# options. The map spans the same A-Z labels as the option formatting, so an
# index past 3 (a question with more than four choices) is still translated
# to a letter instead of being left behind as a raw integer, which would end
# up as non-string assistant content in the training file.
map_dct = {i: chr(ord("A") + i) for i in range(26)}
dft.replace({"answer": map_dct}, inplace=True)
dft.head(2)

In [None]:
def combine_columns(row):
    """Build the user prompt text from a row's lecture, question and options.

    The result has the form
    ``"Context: <lecture>, Question: <question>, Options: <choices_formatted>"``.
    All three fields must be strings; concatenation raises ``TypeError``
    otherwise.
    """
    sections = (
        "Context: " + row["lecture"],
        "Question: " + row["question"],
        "Options: " + row["choices_formatted"],
    )
    return ", ".join(sections)

In [None]:
# Build the full user prompt for every row, then display the one labelled 0.
dft["user_content"] = dft.apply(combine_columns, axis=1)
dft["user_content"][0]

In [None]:
# System prompt used both in every training example and in the inference
# request at the end of the notebook. The literal begins with a newline.
# NOTE(review): the wording assumes at most four choices (A-D), while the
# option formatter can label more — confirm this matches the data.
DEFAULT_SYSTEM_PROMPT = """
Role: Science Teacher Assistant

Task: Respond accurately to multiple-choice science questions provided by the teacher.

Instructions:
1. For each question, you will be given up to four answer choices labeled A, B, C, or D.
2. Carefully evaluate the question and select the most appropriate answer from the provided choices.

Output: Return only the letter corresponding to the correct answer (A, B, C, or D)."""

In [None]:
def get_example(user_content, answer):
    """Wrap one question/answer pair as a chat-format training record.

    The record is a dict with a single ``"messages"`` list holding the
    system prompt, the user's question text and the assistant's answer,
    in that order.
    """
    system_msg = {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": user_content}
    assistant_msg = {"role": "assistant", "content": answer}
    return {"messages": [system_msg, user_msg, assistant_msg]}

In [None]:
# Write one JSON-encoded chat example per line (JSON Lines).
# The encoding is given explicitly so the file is written with the same codec
# the validation step uses to read it back, independent of the platform default.
with open("train.jsonl", "w", encoding="utf-8") as f:
    # iterrows() is consumed lazily; the row label is not needed.
    for _, row in dft.iterrows():
        example = get_example(row["user_content"], row["answer"])
        f.write(json.dumps(example) + "\n")

In [None]:
# Shell escape: show the first line (first example) of the training file.
!head -n 1 train.jsonl

In [None]:
# Shell escape: show the last line (last example) of the training file.
!tail -n 1 train.jsonl

### 2. Validate training data

In [None]:
from collections import defaultdict

In [None]:
# Read the JSONL file back, parsing one chat example per line.
# Note that this rebinds `dataset`, which earlier held the loaded ScienceQA data.
with open("train.jsonl", "r", encoding="utf-8") as f:
    dataset = list(map(json.loads, f))

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
first_messages = dataset[0]["messages"]
for message in first_messages:
    print(message)

In [None]:
# Tally of format problems found across all examples, keyed by category.
format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a dict ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    has_assistant = False
    for message in messages:
        # Both "role" and "content" are required on every message.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # Only these four keys are recognized.
        if not all(k in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role")
        if role not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
        if role == "assistant":
            has_assistant = True

        # Content must be a string, and either it or a function call must be non-empty.
        content = message.get("content")
        function_call = message.get("function_call")
        content_ok = isinstance(content, str) and bool(content or function_call)
        if not content_ok:
            format_errors["missing_content"] += 1

    # Each example needs at least one assistant message.
    if not has_assistant:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

In [None]:
import tiktoken 
import numpy as np

# Tokenizer used for the token-count estimates below. "o200k_base" is the
# encoding of the GPT-4o model family, which includes the gpt-4o-mini base
# model fine-tuned later in this notebook; "cl100k_base" belongs to the
# older GPT-4 / GPT-3.5 models and would count against a different vocabulary.
encoding = tiktoken.get_encoding("o200k_base")

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the token count of one chat conversation.

    Each message contributes ``tokens_per_message`` plus the encoded length
    of every value it holds (the role string included); a ``"name"`` entry
    adds ``tokens_per_name`` on top. A flat 3 tokens is added once for the
    whole conversation. Every value must be accepted by ``encoding.encode``.
    """
    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    # fixed per-conversation overhead
    return total + 3

def num_assistant_tokens_from_messages(messages):
    """Return the summed encoded length of the content of all assistant messages.

    Messages with any other role are ignored; an empty list yields 0.
    """
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics for a sequence of numbers.

    Prints a header containing ``name`` followed by the min/max, the
    mean/median and the 5th/95th percentiles of ``values``.

    Args:
        values: Non-empty sequence of numbers.
        name: Label shown in the header line.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The quantiles must match the "p5 / p95" label: 0.05 and 0.95
    # (0.1 / 0.9 would be the 10th and 90th percentiles).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [None]:
# Per-example statistics: missing roles, message counts and token lengths.
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    # Count examples that have no system / no user message at all.
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    # Estimated tokens for the whole conversation and for the assistant parts only.
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
# Count conversations whose estimated length exceeds 65536 tokens.
n_too_long = sum(l > 65536 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 65536 token limit, they will be truncated during fine-tuning")

In [None]:
# Pricing and default n_epochs estimate
# Per-example cap on billed tokens. Kept equal to the 65536-token limit that
# the length check in the previous step reports, so the estimate clips each
# example at the same point at which it would be truncated (the former 4096
# cap under-counted any example longer than that).
MAX_TOKENS_PER_EXAMPLE = 65536

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

# Start from the target epoch count, then raise it for small datasets and
# lower it for large ones so that examples * epochs stays within
# [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES], bounded by the epoch limits.
n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

# Each example is billed for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

### 3. Fine-tune the model


In [None]:
from openai import OpenAI
# Create the API client used by all following cells.
# NOTE(review): no credentials are passed here; presumably they are read from
# the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [None]:
# Upload the training file for fine-tuning. The file is opened in a `with`
# block so the handle is closed once the upload call returns, instead of
# being left open for the lifetime of the kernel.
with open("train.jsonl", "rb") as training_file:
    response = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )
print(response)

In [None]:
# Keep the uploaded file's id; the fine-tuning job is created from it.
file_id = response.id
print(file_id)

In [None]:
# Start a fine-tuning job on the uploaded file.
# n_epochs is fixed at 2 here rather than taken from the estimate computed
# in the validation section.
response = client.fine_tuning.jobs.create(
    training_file=file_id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={"n_epochs": 2}
)

print("Fine-tune job is started")
# Keep the job id; the status and event cells below look the job up by it.
ft_job_id = response.id

In [None]:
# Fetch up to 10 events for the fine-tuning job and display them.
response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=ft_job_id, limit=10)
response

In [None]:
# Retrieve the job object by id; the cells below read its fields.
response = client.fine_tuning.jobs.retrieve(ft_job_id)
print(response)

In [None]:
# Result files as listed on the retrieved job object.
result_files = response.result_files
result_files

### 4. Use the fine-tuned model

In [None]:
# Model name taken from the most recently retrieved job object.
# NOTE(review): presumably only populated once the job has finished
# successfully — re-run the retrieve cell and confirm before using it below.
fine_tuned_model = response.fine_tuned_model
fine_tuned_model

In [None]:
# Sample prompt in the same "Context / Question / Options" layout as the
# training prompts. This is a regular (non-raw) string, so each "\n" below
# becomes a real newline in addition to the literal line breaks.
user_content = """
Context: Chemical changes and physical changes are two common ways matter can change.
\nIn a chemical change, the type of matter changes. The types of matter before and after 
a chemical change are always different.\nBurning a piece of paper is a chemical change. 
When paper gets hot enough, it reacts with oxygen in the air and burns. The paper and 
oxygen change into ash and smoke.\nIn a physical change, the type of matter stays the 
same. The types of matter before and after a physical change are always the same.
\nCutting a piece of paper is a physical change. The cut pieces are still made of paper.
\nA change of state is a type of physical change. For example, ice melting is a physical 
change. Ice and liquid water are made of the same type of matter: water., 

Question: Complete the sentence.\nSewing an apron is a ()., 

Options: (A) chemical change (B) physical change"""

In [None]:
# Query the fine-tuned model with the same system prompt that was used in
# the training examples, followed by the sample question.
completion = client.chat.completions.create(
    model=fine_tuned_model,
    messages=[
        {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
        {"role": "user", "content": user_content}
    ]
)
# Print the message object of the first returned choice.
print(completion.choices[0].message)

In [None]:
# Clean-up: delete the fine-tuned model.
# NOTE(review): presumably irreversible — run only once the model is no longer needed.
client.models.delete(fine_tuned_model)