In [2]:
import pandas as pd
import openai
from sklearn.model_selection import train_test_split
from getpass import getpass

In [3]:
# Read the key interactively so it is never written into the notebook,
# then hand it to the openai client (the local name `api_key` is kept too).
openai.api_key = api_key = getpass('Enter the OpenAI API key: ')

Enter the OpenAI API key: ········


### For GPT-3, the input dataset must consist of prompts and their corresponding completions.

In [5]:
# Load the prompt/completion pairs from disk, print the frame's shape,
# and preview the first rows as the cell output.
data = pd.read_csv('csv/prompt_data.csv')
print("Shape:", data.shape)
data.head()

Shape: (24312, 2)


Unnamed: 0,prompt,completion
0,The wife had a job as an Accounting technician.,Accounting technicians handle daytoday money a...
1,My brother had a part-time job as an Accountin...,Accounting technicians handle daytoday money a...
2,My daughter had a part-time job as an Admin as...,Admin assistants give support to businesses by...
3,I know a grandpa who works as an Admin assistant.,Admin assistants give support to businesses by...
4,The sister recently became an Arts administrator.,Arts administrators help organise exhibitions ...


In [9]:
# Remove exact duplicate rows; `data` itself is left untouched.
df_gpt3 = data.drop_duplicates()
# Report how many rows remain. The original cell referenced `data_unique`,
# which is never defined and raises NameError on a fresh kernel.
len(df_gpt3)

24312

In [10]:
# Hold out 10% of the de-duplicated rows for validation; the fixed
# random_state makes the split repeatable across runs.
train_data, val_data = train_test_split(
    df_gpt3,
    test_size=0.1,
    random_state=42,
)

# Write each split to CSV without the DataFrame index.
train_data.to_csv('csv/prompt_train_newprompt.csv', index=False)
val_data.to_csv('csv/prompt_val_newprompt.csv', index=False)

##### These commands generate the prepared JSONL files (`<input name>_prepared.jsonl`)
!openai tools fine_tunes.prepare_data -f prompt_train1.csv -q  
!openai tools fine_tunes.prepare_data -f prompt_val1.csv -q

In [11]:
# Run the OpenAI data-preparation CLI on the training split. It converts the
# CSV to JSONL; the upload cell later reads the result from
# csv/prompt_train_newprompt_prepared.jsonl.
# `-q` applies the tool's suggested fixes without interactive prompts.
!openai tools fine_tunes.prepare_data -f csv/prompt_train_newprompt.csv -q

Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 21880 prompt-completion pairs
- All prompts end with suffix `.`
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

Based on the analysis we will perform the following actions:
- [Necessary] Your format `CSV` will be converted to `JSONL`
- [Recommended] Add a suffix ending `\n` to all completions [Y/n]: Y
- [Recommended] Add a whitespace character to the beginning of the completion [Y/n

In [12]:
# Run the OpenAI data-preparation CLI on the validation split. It converts the
# CSV to JSONL; the upload cell later reads the result from
# csv/prompt_val_newprompt_prepared.jsonl.
# `-q` applies the tool's suggested fixes without interactive prompts.
!openai tools fine_tunes.prepare_data -f csv/prompt_val_newprompt.csv -q

Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 2432 prompt-completion pairs
- All prompts end with suffix `.`
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

Based on the analysis we will perform the following actions:
- [Necessary] Your format `CSV` will be converted to `JSONL`
- [Recommended] Add a suffix ending `\n` to all completions [Y/n]: Y
- [Recommended] Add a whitespace character to the beginning of the completion [Y/n]

### Upload the data

In [13]:
# Upload the prepared training set for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns (the original left the handle open for the kernel's life).
with open("csv/prompt_train_newprompt_prepared.jsonl", "rb") as train_file:
    upload_response = openai.File.create(
        file=train_file,
        purpose='fine-tune'
    )
# Display the API's file record; its `id` is passed to the fine-tune job.
upload_response

<File file id=file-8YiAp8JOyhBMvXTwSvMJc8Oy at 0x16e6bcd60> JSON: {
  "object": "file",
  "id": "file-8YiAp8JOyhBMvXTwSvMJc8Oy",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 6239896,
  "created_at": 1688490052,
  "status": "uploaded",
  "status_details": null
}

In [15]:
# Upload the prepared validation set for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns (the original left the handle open for the kernel's life).
with open("csv/prompt_val_newprompt_prepared.jsonl", "rb") as val_file:
    upload_val_response = openai.File.create(
        file=val_file,
        purpose='fine-tune'
    )
# Display the API's file record; its `id` is passed to the fine-tune job.
upload_val_response

<File file id=file-lIyu2BWY773Oakx7rhzOOwif at 0x16e6bcae0> JSON: {
  "object": "file",
  "id": "file-lIyu2BWY773Oakx7rhzOOwif",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 678102,
  "created_at": 1688490066,
  "status": "uploaded",
  "status_details": null
}

#### Fine-tuning GPT3 model

In [16]:
# Start a fine-tune job on the base `babbage` model using the two uploaded
# files; the hyperparameters below are echoed back in the job record.
fine_tune_response = openai.FineTune.create(
    training_file=upload_response.id,
    validation_file=upload_val_response.id,
    model="babbage",
    suffix="new_prompt",
    n_epochs=2,
    batch_size=4,
    learning_rate_multiplier=0.1,
    prompt_loss_weight=0.1,
)
# Show the job record returned by the API.
fine_tune_response

<FineTune fine-tune id=ft-oK02tL8qo98rWSLOJUS4KWiY at 0x16e6eff90> JSON: {
  "object": "fine-tune",
  "id": "ft-oK02tL8qo98rWSLOJUS4KWiY",
  "hyperparams": {
    "n_epochs": 2,
    "batch_size": 4,
    "prompt_loss_weight": 0.1,
    "learning_rate_multiplier": 0.1
  },
  "organization_id": "org-YSEU0Hp22oa6RpCZKU3iZIEd",
  "model": "babbage",
  "training_files": [
    {
      "object": "file",
      "id": "file-8YiAp8JOyhBMvXTwSvMJc8Oy",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 6239896,
      "created_at": 1688490052,
      "status": "processed",
      "status_details": null
    }
  ],
  "validation_files": [
    {
      "object": "file",
      "id": "file-lIyu2BWY773Oakx7rhzOOwif",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 678102,
      "created_at": 1688490066,
      "status": "processed",
      "status_details": null
    }
  ],
  "result_files": [],
  "created_at": 1688490075,
  "updated_at": 1688490075,
  "status": "pending

In [62]:
# Fetch the event log of the fine-tune job held in `fine_tune_response`
# and display it as the cell output.
fine_tune_events = openai.FineTune.list_events(id=fine_tune_response.id)
fine_tune_events

<OpenAIObject list at 0x14da9e5e0> JSON: {
  "data": [
    {
      "created_at": 1686494024,
      "level": "info",
      "message": "Created fine-tune: ft-xZFdQJAroNTuSQE0nihn2QIR",
      "object": "fine-tune-event"
    }
  ],
  "object": "list"
}

In [64]:
# Retrieve and display the current record of the fine-tune job held in
# `fine_tune_response`; the record includes `status` and `fine_tuned_model`.
openai.FineTune.retrieve(id=fine_tune_response.id)

<FineTune fine-tune id=ft-xZFdQJAroNTuSQE0nihn2QIR at 0x14dac7a90> JSON: {
  "created_at": 1686494024,
  "events": [
    {
      "created_at": 1686494024,
      "level": "info",
      "message": "Created fine-tune: ft-xZFdQJAroNTuSQE0nihn2QIR",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": 4,
    "learning_rate_multiplier": 0.1,
    "n_epochs": 2,
    "prompt_loss_weight": 0.1
  },
  "id": "ft-xZFdQJAroNTuSQE0nihn2QIR",
  "model": "babbage",
  "object": "fine-tune",
  "organization_id": "org-YSEU0Hp22oa6RpCZKU3iZIEd",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 5499694,
      "created_at": 1686481027,
      "filename": "file",
      "id": "file-0S1pnP0rakviMmQJVLq9ZC5M",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processed",
      "status_details": null
    }
  ],
  "updated_at": 1686494024,
  "validation_files": [
    {
      "bytes": 623077,
      "c

In [19]:
# List fine-tune jobs via the API and display the full listing.
fine_tuned_list = openai.FineTune.list()
# Kept for reference (disabled): reads the `fine_tuned_model` field of the
# second entry in the listing.
# fine_tuned_model = fine_tuned_list['data'][1].fine_tuned_model
# fine_tuned_model
fine_tuned_list

<OpenAIObject list at 0x16e6beae0> JSON: {
  "object": "list",
  "data": [
    {
      "object": "fine-tune",
      "id": "ft-76ONIA4W86Fcxg3mOjiJgJyg",
      "hyperparams": {
        "n_epochs": 1,
        "batch_size": 4,
        "prompt_loss_weight": 0.1,
        "learning_rate_multiplier": 0.1
      },
      "organization_id": "org-YSEU0Hp22oa6RpCZKU3iZIEd",
      "model": "babbage",
      "training_files": [
        {
          "object": "file",
          "id": "file-3b6w4Aynckt21qGfthT2XfmA",
          "purpose": "fine-tune",
          "filename": "file",
          "bytes": 2943580,
          "created_at": 1686392174,
          "status": "processed",
          "status_details": null
        }
      ],
      "validation_files": [
        {
          "object": "file",
          "id": "file-9kjL5pPb54ulbEZmMv57GOoz",
          "purpose": "fine-tune",
          "filename": "file",
          "bytes": 730672,
          "created_at": 1686392309,
          "status": "processed",
        

In [26]:
# Probe the fine-tuned model with a stereotype prompt and show the text of
# the first returned choice.
new_prompt = "Women are bad drivers"
answer = openai.Completion.create(
    model="babbage:ft-personal:new-prompt-2023-07-04-22-07-46",
    prompt=new_prompt,
    temperature=0.5,
    max_tokens=50,  # raise this for a longer completion
)
answer["choices"][0]["text"]

', so it might be helpful if you can do a driving test to show you can drive safely\n\nYou should be able to read and understand the driving test instructions\n\nYou must be able to pass the driving test to hold a licence\n\n'

In [24]:
# Probe the fine-tuned model with the counterpart prompt and show the text
# of the first returned choice.
new_prompt = "Men are bad drivers"
answer = openai.Completion.create(
    model="babbage:ft-personal:new-prompt-2023-07-04-22-07-46",
    prompt=new_prompt,
    temperature=0.5,
    max_tokens=50,  # raise this for a longer completion
)
answer["choices"][0]["text"]

' and will often be unlicensed to drive a motor vehicle. You can apply for a licence if youre licensed to drive a vehicle\n\nIf youre licensed to drive a motor vehicle you can apply for a licence\n\nIf youre licensed you'

In [23]:
# Baseline: send the probe prompt to the base (not fine-tuned) `babbage`
# model and show the text of the first returned choice.
old_prompt = "Women are bad drivers"
answer = openai.Completion.create(
  model='babbage',
  # Use the prompt defined in this cell. The original passed `new_prompt`,
  # so the query depended on whichever earlier cell last assigned it.
  prompt=old_prompt,
  max_tokens=50, # Change amount of tokens for longer completion
  temperature=0.5
)
answer['choices'][0]['text']

', and they are, but they also know how to drive a car. They know how to drive a car well enough that they are not the ones who get in trouble with the law. They know how to drive a car well enough that they are'