In [4]:
from make_data import load_dict_from_json
from tools import *
import json
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ.get("FUSE_OPEN_AI_KEY"))

import time

import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

# Prepare data

In [5]:
database = load_dict_from_json('data/M0A_train_data.json')
test_sample = select_random_keys(database, 1000, 45)

In [6]:
themes = {
'Conscience': 'T1',
'Desire': 'T2',
'Freedom': 'T3',
'Goodness': 'T4',
'Identity': 'T5',
'Justice': 'T6',
'Language': 'T7',
'Meaning': 'T8',
'Science': 'T9',
'Technology': 'T10',
'Truth': 'T11',
'Time': 'T12',
'Existence': 'T13',
'Music': 'T14',
'Imagination': 'T15',
'The Unconscious': 'T16',
'Education': 'T17',
'Body & Mind': 'T18',
'Beauty': 'T19',
'Art': 'T20',
'Love': 'T21',
'Reality': 'T22',
'Politics': 'T23',
'Work': 'T24',
'Living Together': 'T25',
'Philosophy': 'T26',
'Matter': 'T27',
'Death': 'T28',
'Religion': 'T29',
'History': 'T30',
'Thought': 'T31',
'Madness': 'T32',
'Joy & Happiness': 'T33'}

In [7]:
trainning_dict = {}
for text in test_sample:
    trainning_dict[text] = str([themes[i] for i in list(database[text])]).replace("'", '')

In [12]:
system_instructions = """
You are a professor of the Humanities (Litterature, Philosophy, Poetry mostly) and as such, you are giving the task to classify texts related to the humanities by themes. You have been provided with texts from which you must exclusively derive its theme(s).

You are given the text of a page from a digital book and a fixed list of 33 themes. Your task is to determine which of these themes are relevant to the content of the page.
A page can be allocated to more than one theme. However, a page CANNOT be allocated to more than six themes. Only allocate a theme if you are very confident that it is a relevant theme.
Remember that it is better to have less relevant themes allocated than many that are irrelevant!
Return the allocation in the format: [T<theme_number>, T<theme_number>, ...]
Below are the 33 themes along with their guiding questions and descriptions:

T1. Consciousness
Is consciousness a uniquely human trait and to what extent our individual conscience is shaped by social influences?

T2. Desire
The relationship between desire, satisfaction, and human nature, examining whether desires can be fulfilled, consciously known, their impact on our wellbeing, and their influence on our behavior and pursuit of truth and goodness.

T3. Freedom
The relationship between consciousness, personal autonomy, and determinism, examining whether awareness and choice are truly liberating factors in human freedom.

T4. Goodness
Aspects of moral goodness, examining whether education, perception, intention, and human nature influence ethical behavior and moral development.

T5. Identity
Facets of personal identity, examining how self-awareness, change, relationships, work, and choices contribute to our understanding of who we are.

T6. Justice
Dimensions of justice, examining its relationship with freedom, law, conventions, experience, state power, moral choices, and democratic systems.

T7. Language
The relationship between language and thought, examining whether language serves as a barrier, tool, or mediator in human communication, understanding, and expression.

T8. Meaning
What has meaning ?

T9. Science
The fundamental questions raisend by science about knowledge, truth, and human understanding - from the possibility of scientific knowledge about life to the relationship between reason, experience, belief, and certainty.

T10. Technology
What is the relationship between technological progress, human freedom, and our connection to nature?

T11. Truth
Dimensions of truth, including its dependence on human perception, methods of verification, relationship with science and politics, and the nature of certainty and doubt.

T12. Time
Aspects of time, including its relationship with freedom, happiness, destruction, human limitations, knowledge, efficiency, novelty, and leisure.

T13. Existence
Existence encompasses both action and contemplation, raising questions about how we engage with life's moments and opportunities.

T14. Music

T15. Imagination
Does imagination enrich knowledge and what it means to lack of imagination?

T16. The Unconscious
The compatibility of the unconscious with freedom, its relationship with self-awareness, and its influence on human expression.

T17. Education
The relationship between culture, human nature, and personal development, examining whether cultural education liberates us, shapes our character, influences our happiness, and affects our moral development.

T18. Body & Mind
What difference can be made between the mind and the body?

T19. Beauty
Dimensions of beauty, from its transformative power on consciousness to its relationship with utility, happiness, and religious experience.

T20. Art
Aspects of art, including its relationship with understanding, truth, reality, education, language, beauty, meaning, joy and necessity.

T21. Love
Dimensions of love, including its rationality, universality, self-knowledge, and distinctions from other forms of human connection.

T22. Reality
The nature of reality and our ability to perceive and understand it, examining aspects like perception, the reliability of appearances, intuition, judgment, and the distinction between dreams and reality.

T23. Politics
What are the foundations and limits of political authority and human social organization?

T24. Work
Questions about work, examining its necessity, social impact, virtue, time value, and technical nature.

T25. Living Together
Questions exploring various aspects of social living, including moral obligations, duty, conflict, responsibility, and the relationship between individual and collective happiness.

T26. Philosophy
The relationship between philosophy and fundamental concepts like happiness, governance, and religion.

T27. Matter
Does the mind have access to matter and what is matter?

T28. Death
What are the fundamental questions about mortality and our ability to comprehend and accept death?

T29. Religion
Aspects of religion's necessity for humanity, its relationship with reason, and its cultural origins.

T30. History
Questions about history's nature, examining its scientific status, its relevance to the future, its role in political decision-making, and the agency behind historical events.

T31. Thought
Thought: its limitations, the nature of ideas, and our ability to comprehend origins.

T32. Madness

T33. Joy & Happiness
Aspects of happiness, including its relationship with truth, consciousness, culture, and well-being, examining whether happiness is achievable, personal, or compatible with the realities of existence.

DO NOT provide explanations, only provide the allocation code line.
DO NOT provide the title of each theme such as [T7. Language, T5. Identity], only provide the theme code such as [T7, T5].
Only allocate a theme if you are very confident that it is relevant.
    """

In [13]:
# import json
# import re

# def clean_text(text: str) -> str:
#     # 1) Remove newline characters:
#     text = text.replace("\n", " ")

#     # 2) Remove 4-digit years like 2019, 2020, etc.:
#     text = re.sub(r"\b\d{4}\b", "", text)

#     # 3) Reduce multiple spaces to a single space:
#     text = re.sub(r"\s+", " ", text)

#     # 4) Strip leading/trailing spaces:
#     text = text.strip()

#     return text

# # Example usage in your loop
# output_file = "data/training_data_1000.jsonl"

# with open(output_file, "w", encoding="utf-8") as f:
#     for text, themes in trainning_dict.items():
#         # Clean the text before using it
#         cleaned_text = clean_text(text)

#         messages = [
#             {"role": "system", "content": system_instructions},
#             {"role": "user", "content": cleaned_text},
#             {"role": "assistant", "content": themes}
#         ]

#         json_line = {"messages": messages}
#         f.write(json.dumps(json_line, ensure_ascii=False) + "\n")


## Test data

In [65]:
data_path = "data/training_data_100.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 1000
First example:
{'role': 'system', 'content': "\nYou are a professor of the Humanities (Litterature, Philosophy, Poetry mostly) and as such, you are giving the task to classify texts related to the humanities by themes. You have been provided with texts from which you must exclusively derive its theme(s).\n\nYou are given the text of a page from a digital book and a fixed list of 33 themes. Your task is to determine which of these themes are relevant to the content of the page.\nA page can be allocated to more than one theme. However, a page CANNOT be allocated to more than six themes. Only allocate a theme if you are very confident that it is a relevant theme.\nRemember that it is better to have less relevant themes allocated than many that are irrelevant!\nReturn the allocation in the format: [T<theme_number>, T<theme_number>, ...]\nBelow are the 33 themes along with their guiding questions and descriptions:\n\nT1. Consciousness\nIs consciousness a uniquely human tr

In [66]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [67]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [68]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 1175, 10055
mean / median: 1766.392, 1538.0
p5 / p95: 1252.9, 2514.1

#### Distribution of num_assistant_tokens_per_example:
min / max: 3, 48
mean / median: 7.788, 6.0
p5 / p95: 3.0, 15.0

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


In [69]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~1766392 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~5299176 tokens


In [70]:

# Pricing per 1,000,000 tokens (updated pricing)
PRICING = {
    "gpt-4.0": 25.000,  # $25.00 per 1,000,000 training tokens
    "gpt-4.0-mini": 3.000,  # $3.00 per 1,000,000 training tokens
    "gpt-3.5": 8.000,  # $8.00 per 1,000,000 training tokens
    "davinci-002": 6.000,  # $6.00 per 1,000,000 training tokens
    "babbage-002": 0.400  # $0.40 per 1,000,000 training tokens
}

def calculate_fine_tuning_cost_from_tokens_and_epochs(total_billing_tokens, n_epochs, model_name):
    # Cost calculation
    cost_per_1m_tokens = PRICING.get(model_name, 0)
    total_cost = (total_billing_tokens / 1_000_000) * cost_per_1m_tokens * n_epochs

    return {
        "model": model_name,
        "n_epochs": n_epochs,
        "total_billing_tokens": total_billing_tokens,
        "cost": total_cost
    }

# Example usage
total_billing_tokens = n_billing_tokens_in_dataset  # Replace with the actual total billing tokens
n_epochs = 3  # Replace with the actual number of epochs
models = ["gpt-4.0", "gpt-4.0-mini", "gpt-3.5", "davinci-002", "babbage-002"]

for model in models:
    result = calculate_fine_tuning_cost_from_tokens_and_epochs(total_billing_tokens, n_epochs, model)
    print(f"{result['model']}")
    print(f"  Total cost: ${result['cost']:.2f}")


gpt-4.0
  Total cost: $132.48
gpt-4.0-mini
  Total cost: $15.90
gpt-3.5
  Total cost: $42.39
davinci-002
  Total cost: $31.80
babbage-002
  Total cost: $2.12


# Fine-tune

In [27]:
# client.files.list()
# client.models.list()

In [None]:
# # 1) Upload the training file

# upload_response = client.files.create(
#     file=open("data/training_data_100.jsonl", "rb"),
#     purpose="fine-tune"
# )
# training_file_id = upload_response.id
# print("Uploaded file ID:", training_file_id)

# # 'file-Pia7c45jQx5mVEvTYrKHuL'

In [None]:
# # 2) Create the fine-tuning job

# fine_tune_response = client.fine_tuning.jobs.create(
#     training_file="file-Pia7c45jQx5mVEvTYrKHuL",
#     model="gpt-4o-mini-2024-07-18",
#     method={'type': 'supervised',
#             'supervised':{
#             'hyperparameters': {'n_epochs': 3}}}
# )

# job_id = fine_tune_response.id
# print("Fine-tune job created. Job ID:", job_id)

Fine-tune job created. Job ID: ftjob-gq7UKbBfZgtpKgzC9wqsgylG


In [16]:
# check every 60 sec if job is done

while True:
    job = client.fine_tuning.jobs.retrieve('ftjob-gq7UKbBfZgtpKgzC9wqsgylG')
    print("Job status:", job.status)
    if job.status == "succeeded":
        print("Fine-tuning job succeeded")
        break
    elif job.status == "failed":
        print("Fine-tuning job failed")
        break
    time.sleep(60)

Job status: succeeded
Fine-tuning job succeeded


In [17]:
client.fine_tuning.jobs.retrieve('ftjob-gq7UKbBfZgtpKgzC9wqsgylG')

# 'ft:gpt-4o-mini-2024-07-18:fusegpt::AuMp0bXY'

FineTuningJob(id='ftjob-gq7UKbBfZgtpKgzC9wqsgylG', created_at=1737994944, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:fusegpt::AuMp0bXY', finished_at=1737997456, hyperparameters=Hyperparameters(batch_size=2, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-8uLnwdNry9jphQFFFLaqCG2v', result_files=['file-RX2WGg3Sbhn5L3C9PusFYz'], seed=2100935324, status='succeeded', trained_tokens=5057586, training_file='file-Pia7c45jQx5mVEvTYrKHuL', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=2, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix=None)