In [None]:
import openai
import os
import pandas as pd

In [None]:
# Read the API key from the environment so it is never hard-coded in the notebook.
# The lookup yields None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Load python_qa.csv from the working directory into a DataFrame and preview its first rows.
qa_df = pd.read_csv("python_qa.csv")
qa_df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
qa_df.shape
# The "Body" column will serve as the prompt and the "Answer" column as the completion.

In [None]:
# Pull out the two columns used for fine-tuning: question bodies and their answers.
questions, answers = qa_df["Body"], qa_df["Answer"]

In [None]:
# Display the question bodies to inspect them.
questions

In [None]:
# Build the list of {"prompt": ..., "completion": ...} records that OpenAI
# expects for fine-tuning, pairing each question body with its answer.
qa_openai_format = [
    dict(prompt=body, completion=answer)
    for body, answer in zip(questions, answers)
]
# Spot-check one record.
qa_openai_format[4]

In [None]:
# Before fine-tuning the model, it is worth counting the tokens to estimate the cost.
# The tiktoken library from OpenAI counts tokens the same way OpenAI does.
import tiktoken

In [None]:
def num_tokens_from_string(string, enconding_name):
    """Return how many tokens `string` occupies under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        enconding_name: Name of the encoding passed to ``tiktoken.get_encoding``
            (for example ``"gpt2"``).

    Returns:
        The length of the token sequence produced by encoding ``string``.
    """
    tokenizer = tiktoken.get_encoding(enconding_name)
    return len(tokenizer.encode(string))

In [None]:
# Take the first `dataset_size` entries of the dataset as an example for fine-tuning
# and write them to disk in JSON Lines layout: one {"prompt": ..., "completion": ...}
# object per line.
import json

dataset_size = 500

with open("example_training_data.json", "w") as f:
    # Slice with dataset_size (instead of repeating the literal) so the sample
    # size only has to be changed in one place.
    for entry in qa_openai_format[:dataset_size]:
        f.write(json.dumps(entry) + "\n")

In [None]:
# Count the tokens and words in the example dataset; the token total feeds the
# cost estimate for the babbage model.
token_counter = 0
word_counter = 0

for entry in qa_openai_format[:dataset_size]:
    # Look the texts up by key. Iterating over entry.items() would yield
    # (key, value) pairs, so the literal key names "prompt" and "completion"
    # would be tokenized and counted as words alongside the real text.
    for text in (entry["prompt"], entry["completion"]):
        token_counter += num_tokens_from_string(text, "gpt2")
        word_counter += len(text.split())


In [None]:
# Total number of tokens accumulated by the counting loop.
token_counter

In [None]:
# Total number of whitespace-separated words accumulated by the counting loop.
word_counter

In [None]:
# Estimated fine-tuning cost in dollars for the example dataset.
# NOTE(review): 0.0006 is presumably the babbage training price per 1,000 tokens
# and 4 the number of training epochs — confirm against current OpenAI pricing.
price_per_1k_tokens = 0.0006
n_epochs = 4
price_per_1k_tokens * n_epochs * token_counter / 1000
# Fine-tuning with this dataset will cost around $0.47, i.e. 47 cents.

Now that the JSON file for fine-tuning has been created, OpenAI recommends using its command-line tool to run the fine-tuning job.

Here are the commands for that:

1. Set your OpenAI API key: `export OPENAI_API_KEY=<openai_api_key>`
2. Start the fine-tuning job: `openai api fine_tunes.create -t "path_to_json_file" -m <name_of_the_model_eg_babbage>`
3. Fine-tuning takes some time. If you want to follow the progress stream: `openai api fine_tunes.follow -i <id_delivered>`
4. If you want to cancel fine-tuning: `openai api fine_tunes.cancel -i <id_delivered>`
5. When fine-tuning finishes, the name of the fine-tuned model is shown in the command line. You can then call this model directly from Python.


In [None]:
# The fine-tuned model can now be used from Python.
fine_tuned_model = "babbage:ft-personal-2023-03-08-15-54-05" # model name printed by the command line when fine-tuning finished

In [None]:
# Query the fine-tuned model with a sample question, passing max_tokens=128.
completion_kwargs = {
    "model": fine_tuned_model,
    "prompt": "What are good Python books?",
    "max_tokens": 128,
}
response = openai.Completion.create(**completion_kwargs)

In [None]:
# Token count of the example prompt under the "gpt2" encoding.
prompt_count = num_tokens_from_string("What are good Python books?","gpt2")
prompt_count

In [None]:
# Show the generated text of the first returned choice.
first_choice = response["choices"][0]
print(first_choice["text"])

In [None]:
# Inspect the full response object returned by the completion call.
response

In [None]:
# Estimated cost in dollars of the request above.
# NOTE(review): 0.0024 is presumably the usage price per 1,000 tokens for a fine-tuned
# babbage model, and 134 the prompt tokens plus the 128-token completion cap — confirm.
usage_price_per_1k_tokens = 0.0024
total_tokens = 134
(usage_price_per_1k_tokens / 1000) * total_tokens