<a href="https://colab.research.google.com/github/LennyBijan/EMW-Finetuning/blob/main/EMW_Dataprep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Necessary pip installs

In [None]:
!pip install -Uq wandb openai tiktoken datasets tenacity

# Dataset Validation & Pricing

In [7]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [8]:
data_path = "test_data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 16
First example:
{'role': 'system', 'content': 'Du bist eine weise Erzählmeisterin, die es liebt, Kindern Geschichten zu erzählen. Die Kinder geben dir ihre Ideen in Form von Charakteren, Orten, Aktivitäten und Emotionen. Deine Aufgabe ist es, diese Elemente in moralisch wertvolle Geschichten zu verwandeln, die nicht nur unterhalten, sondern auch lehrreiche Werte vermitteln.'}
{'role': 'user', 'content': 'Liebe Erzählmeisterin, erfinde eine Geschichte für Kinder über einen hungrigen kleinen Kater namens Leo und seine liebevolle Familie, die in der großen bunten Küche zu Hause Waffeln backen, mit Mehl spielen und heimlich Schleckereien stehlen. Die Charaktere fühlen sich aufgeregt, neugierig, ein bisschen traurig und müde. Der Modus der Geschichte soll spannend sein. Ich bin gespannt, wie du diese Elemente in die Erzählung einfließen lässt!'}
{'role': 'assistant', 'content': 'An einem Sonntag erwartete die Familie von Kater Leo Besuch. Leo genoss es jedes Mal sehr, wenn G

In [9]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [10]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")


In [None]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16385 token limit, they will be truncated during fine-tuning")

In [12]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 6
n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
price = 0.008
pricing_model = round(n_billing_tokens_in_dataset/1000*price, 3)
pricing_model_full = round(pricing_model * n_epochs, 3)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"With the current pricing, you will be charged for ~{pricing_model}$ for one epoch, and ~{pricing_model_full}$ for a full training run @ ~{n_billing_tokens_in_dataset * n_epochs} Tokens")
print(f"Assuming, we scale this dataset to 150 Stories, we will be charged ~{round(n_billing_tokens_in_dataset*10/1000*price*n_epochs, 3)}$ for a full training run @ ~{n_billing_tokens_in_dataset * 10 * n_epochs} Tokens")


Dataset has ~17780 tokens that will be charged for during training
With the current pricing, you will be charged for ~0.142$ for one epoch, and ~0.852$ for a full training run @ ~106680 Tokens
Assuming, we scale this dataset to 150 Stories, we will be charged ~8.534$ for a full training run @ ~1066800 Tokens


# Uploading Dataset

In [None]:
from openai import OpenAI
from google.colab import userdata
api_key = userdata.get('api_key')

client = OpenAI(api_key=api_key)

client.files.create(
  file=open("test_data.jsonl", "rb"),
  purpose="fine-tune"
)

In [None]:
from openai import OpenAI
from google.colab import userdata
userdata.get('api_key')

client = OpenAI(api_key=api_key)

client.files.list()


# Create Fine-tuned model

In [None]:
import wandb
wandb.login()

In [None]:
from openai import OpenAI
from google.colab import userdata
from wandb.integration.openai.fine_tuning import WandbLogger

WANDB_PROJECT = "EMW Finetuning"
api_key = userdata.get('api_key')
client = OpenAI(api_key=api_key)

openai_ft_job_info = client.fine_tuning.jobs.create(
  training_file="file-o968RVPJ3yMK2TDQFUix9gRg",
  model="gpt-3.5-turbo",
  hyperparameters={
    "n_epochs":6,
    "learning_rate_multiplier":2
  }
)

ft_job_id = openai_ft_job_info.id
WandbLogger.sync(fine_tune_job_id=ft_job_id, project=WANDB_PROJECT, openai_client=client)


In [None]:
wandb.finish()