In [2]:
import json
import tiktoken
from typing import List, Dict


def load_json_file(file_path: str) -> List[Dict[str, List[Dict[str, str]]]]:
    """
    Read a UTF-8 encoded JSON file and return the parsed document.

    Parameters:
    file_path (str): Location of the JSON file on disk.

    Returns:
    List[Dict[str, List[Dict[str, str]]]]: Whatever ``json.load`` produces for
    the file; the annotation describes the expected layout and is not checked
    at runtime.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed


def count_tokens_in_messages(
    data: List[Dict[str, List[Dict[str, str]]]],
    encoding_name: str = "cl100k_base",
) -> int:
    """
    Count the tokens in the "content" text of every message in the dataset.

    Only each message's "content" value is tokenized; other fields (such as the
    role) are not counted. A missing or null "messages" list, and a missing,
    null or empty "content", contribute zero tokens instead of raising.

    Parameters:
    data (List[Dict[str, List[Dict[str, str]]]]): The data loaded from the JSON
        file: a list of items, each optionally holding a "messages" list.
    encoding_name (str): The name of the tiktoken encoding to use.

    Returns:
    int: The total number of tokens across all message contents.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    total_tokens = 0
    for item in data:
        # `.get(key, default)` only applies the default when the key is absent;
        # `or []` also covers an explicit JSON null.
        messages = item.get("messages") or []
        for message in messages:
            # Some messages carry "content": null; treat that as empty text
            # rather than passing None to the encoder.
            content = message.get("content") or ""
            total_tokens += len(encoding.encode(content))
    return total_tokens


def main(file_path: str = "../data/interim/final_dataset.json") -> int:
    """
    Load a dataset, count the tokens in its messages and print the total.

    Parameters:
    file_path (str): Path to the JSON dataset. Defaults to the interim dataset
        location used by this notebook.

    Returns:
    int: The total token count, so callers can reuse it instead of copying the
    printed number by hand.
    """
    # Load data from the JSON file
    data = load_json_file(file_path)

    # Count the number of tokens in the messages
    total_tokens = count_tokens_in_messages(data)
    print(f"Total tokens in JSON file: {total_tokens}")
    return total_tokens

# Run the token count only when this code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Total tokens in JSON file: 1292541


In [5]:
# Inputs for the estimate
tokens_in_json_file = 1_292_541  # total number of tokens in the JSON file
cost_per_1k_tokens = 0.008  # cost per 1,000 tokens for fine-tuning GPT-3.5 Turbo
number_of_epochs = 3  # planned number of training epochs; replace with your actual value

# Tokens processed during fine-tuning = tokens in the file x number of epochs
total_tokens = number_of_epochs * tokens_in_json_file

# Convert to thousands of tokens, then apply the per-1,000-token price
total_cost = cost_per_1k_tokens * (total_tokens / 1000)

# Report the estimate
print(f"Estimated total cost of fine-tuning: ${total_cost:.2f} USD")

Estimated total cost of fine-tuning: $31.02 USD


The total cost of fine-tuning can be estimated as (tokens in the training file × number of epochs ÷ 1,000) × cost per 1,000 tokens. Here's how you can calculate it in Python:

```python
# Given values
tokens_in_json_file = 1292541  # The total number of tokens in your JSON file
cost_per_1k_tokens = 0.0080  # The cost per 1,000 tokens for fine-tuning GPT-3.5 Turbo
number_of_epochs = 3  # The number of epochs you plan to train for (replace this with your actual number of epochs)

# Calculate total cost
total_tokens = tokens_in_json_file * number_of_epochs  # Total number of tokens processed during fine-tuning
total_cost = (total_tokens / 1000) * cost_per_1k_tokens  # Total cost of fine-tuning

# Print the result
print(f"Estimated total cost of fine-tuning: ${total_cost:.2f} USD")
```

Replace `number_of_epochs` with the actual number of epochs you plan to
train for. When you run this script, it calculates the total number of
tokens processed during fine-tuning (the number of tokens in your JSON
file multiplied by the number of epochs), divides that by 1,000, and
multiplies the result by the cost per 1,000 tokens to get the total cost
of fine-tuning. The script then prints this total cost, formatted to two
decimal places.

