# Test the connection and API key

Make sure it's possible to use the OpenAI API. For this to work, the environment variable OPENAI_API_KEY must be set to a valid API key which has available credits.

In [1]:
import openai
import os

# read the OpenAI API key from an environment variable
openai.api_key = os.environ['OPENAI_API_KEY']

# test the API connection by making a simple request
response = openai.completions.create(model="davinci-002", prompt="Say this is a test", temperature=0, max_tokens=7)
print(response)
print(response.choices[0].text)

Completion(id='cmpl-8T6I3BMP97D6HKkwtlUadVJimIfWw', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=". I'm gonna say this is")], created=1701946383, model='davinci-002', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=7, prompt_tokens=5, total_tokens=12))
. I'm gonna say this is


# Prepare the fine-tuning set

Prepare a fine-tuning dataset and use it to fine-tune a GPT3 model.

In [8]:
import glob
import json
import tiktoken

PROMPT_SUFFIX = '\n\n###\n\n'
COMPLETION_STOP = '\n###'
TRAINFILE = 'fine-tune.jsonl'
VALIDATEFILE = 'validate.jsonl'
BASE_MODEL = 'babbage-002'
MAX_TOKENS = 12000  # Increased for babbage-002 model

dataset_train_files = glob.glob("../../llm-dataset/*-train.jsonl")
dataset_test_files = glob.glob("../../llm-dataset/*-test.jsonl")

encoding = tiktoken.encoding_for_model(BASE_MODEL)

def truncate_text(text):
    """truncate text so it contains at most MAX_TOKENS according to the OpenAI tokenizer"""
    tokens = encoding.encode(text)
    return encoding.decode(tokens[:MAX_TOKENS])

def create_sample(text, metadata):
    """create a fine-tuning sample from text and metadata about a single document"""
    return {'prompt': truncate_text(text) + PROMPT_SUFFIX,
            'completion': " " + metadata + COMPLETION_STOP}

def convert_to_samples(infiles, outfile):
    print(f"Creating {outfile}")
    nrec = 0
    with open(outfile, "w") as outf:
        for infile in infiles:
            print(f"- processing {infile}")
            with open(infile) as inf:
                for line in inf:
                    rec = json.loads(line)
                    sample = create_sample(rec["text"], rec["metadata"])
                    print(json.dumps(sample), file=outf)
                    nrec += 1
    print(f"{nrec} records converted")
    print()

convert_to_samples(dataset_train_files, TRAINFILE)
convert_to_samples(dataset_test_files, VALIDATEFILE)

Creating fine-tune.jsonl
- processing ../../llm-dataset/serial-fin-train.jsonl
- processing ../../llm-dataset/serial-swe-train.jsonl
- processing ../../llm-dataset/docthes-eng-train.jsonl
- processing ../../llm-dataset/docthes-fin-train.jsonl
- processing ../../llm-dataset/thes-fin-train.jsonl
- processing ../../llm-dataset/thes-swe-train.jsonl
- processing ../../llm-dataset/mono-swe-train.jsonl
- processing ../../llm-dataset/docthes-swe-train.jsonl
- processing ../../llm-dataset/mono-eng-train.jsonl
- processing ../../llm-dataset/mono-fin-train.jsonl
- processing ../../llm-dataset/thes-eng-train.jsonl
- processing ../../llm-dataset/serial-eng-train.jsonl
557 records converted

Creating validate.jsonl
- processing ../../llm-dataset/mono-swe-test.jsonl
- processing ../../llm-dataset/docthes-swe-test.jsonl
- processing ../../llm-dataset/thes-eng-test.jsonl
- processing ../../llm-dataset/thes-swe-test.jsonl
- processing ../../llm-dataset/thes-fin-test.jsonl
- processing ../../llm-dataset/

In [9]:
# Check that the fine-tuning data set is OK using the prepare_data tool.
# We will only use prepare_data as a validation aid and delete the "prepared"
# files that it helpfully creates.
!openai tools fine_tunes.prepare_data -f fine-tune.jsonl -q
!rm -f fine-tune_prepared.jsonl

!openai tools fine_tunes.prepare_data -f validate.jsonl -q
!rm -f validate_prepared.jsonl

Analyzing...

- Your file contains 557 prompt-completion pairs
- There are 7 examples that are very long. These are rows: [231, 257, 258, 268, 333, 468, 504]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.
- All prompts end with suffix `\n\n###\n\n`
- All completions end with suffix `\n###`

Based on the analysis we will perform the following actions:
- [Recommended] Remove 7 long examples [Y/n]: Y


Your data will be written to a new JSONL file. Proceed [Y/n]: Y

Wrote modified file to `fine-tune_prepared.jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "fine-tune_prepared.jsonl"

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `\n\n###\n\n` for the model to start generating completions, rather than continuing with the prompt. Make sure to include `stop=["\n###"]` so that the generated texts ends at the expected place.
Once your m

In [10]:
# OpenAI API and client have changed, now finetuning can or needs to be done
# with Python code, not CLI client

# Upload training data

upload_response = openai.files.create(
    file=open(TRAINFILE, "rb"),
    purpose="fine-tune"
)
trainfile_id = upload_response.id
upload_response

FileObject(id='file-tiZAgq70qPmQa7KDYHFP5Bzg', bytes=2970399, created_at=1701947205, filename='fine-tune.jsonl', object='file', purpose='fine-tune', status='uploaded', status_details=None)

In [11]:
# Perform the actual finetuning via the API. This can take a while, there can be a long queue.

openai.fine_tuning.jobs.create(
    training_file=trainfile_id,
    model="babbage-002"
)

FineTuningJob(id='ftjob-f3Plu07lCRlZlZYaWjCiWKLR', created_at=1701947281, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='babbage-002', object='fine_tuning.job', organization_id='org-5QEUW2DacClOLTNQvTEKMHdV', result_files=[], status='validating_files', trained_tokens=None, training_file='file-tiZAgq70qPmQa7KDYHFP5Bzg', validation_file=None)

In [14]:
fine_tuning_job_id = openai.fine_tuning.jobs.list(limit=10).data[0].id
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=fine_tuning_job_id, limit=20).data

[FineTuningJobEvent(id='ftevent-uZAdFSd2qHjZsgf2DOe4LwN9', created_at=1701949004, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-ChGZH2rgFtkoibq4ygIhNu5G', created_at=1701949002, level='info', message='New fine-tuned model created: ft:babbage-002:personal::8T6yHGdp', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-5w8o0kPU98fSSZ6BHZXWU3HK', created_at=1701948982, level='info', message='Step 1601/1671: training loss=0.47', object='fine_tuning.job.event', data={'step': 1601, 'train_loss': 0.47158852219581604, 'train_mean_token_accuracy': 0.8634920716285706}, type='metrics'),
 FineTuningJobEvent(id='ftevent-v36VuJqsnsZHi7MkVEZctCPA', created_at=1701948963, level='info', message='Step 1501/1671: training loss=0.22', object='fine_tuning.job.event', data={'step': 1501, 'train_loss': 0.2230607271194458, 'train_mean_token_accuracy': 0.9497487545013428

In [16]:
# store the model name from above fine tuning job

model_name = openai.fine_tuning.jobs.retrieve(fine_tuning_job_id).fine_tuned_model
model_name

'ft:babbage-002:personal::8T6yHGdp'

In [20]:
# Try out the fine-tuned model on a random test set record

import random

def get_completions(text):
    response = openai.completions.create(
                                    model=model_name,
                                    prompt=truncate_text(text) + PROMPT_SUFFIX,
                                    temperature=0,  # no fooling around!
                                    max_tokens=2048, # should be very plenty
                                    stop=[COMPLETION_STOP])  # stop at ###
    return response.choices[0].text.strip()

test_set_file = random.choice(dataset_test_files)
with open(test_set_file) as testfile:
    records = [json.loads(line) for line in testfile]
rec = random.choice(records)

print(f"Testing on {rec['id']} with PDF {rec['url']}")
print("---")
print("Curated metadata:")
print(rec["metadata"])
print("---")
print("Generated metadata:")
print(get_completions(rec["text"]))


Testing on https://www.theseus.fi/handle/10024/498793 with PDF https://www.theseus.fi/bitstream/handle/10024/498793/Eklund_Liz_Marjanen_Emma.pdf
---
Curated metadata:
Author: Eklund, Liz
Author: Marjanen, Emma
Organization: Högskolan på Åland
Issued: 2021
URN: URN:NBN:fi:amk-2021052110375
Language: swe
Publisher: Högskolan på Åland
ISSN (online): 1458-1531
Degree program: Utbildningsprogrammet för företagsekonomi
Discipline: Företagsekonomi, förvaltning och marknadsföring
Title: Hållbara inköp hos två åländska företag : verksamma inom plastindustrin
Alternative title: Sustainable Purchases at two Åland Companies - active in the plastics industry
COAR type: bachelor thesis
OKM type: G1 Ammattikorkeakoulututkinnon opinnäytetyö, kandidaatintyö
Thesis level: AMK-opinnäytetyö
---
Generated metadata:
Author: Eklund, Emma
Author: Marjanen, Emma
Organization: Högskolan på Åland
Issued: 2021
URN: URN:NBN:fi-fe2021051922434
Language: swe
Publisher: Högskolan på Åland
ISSN (online): 1458-1531
Deg

In [21]:
%%time

import os.path

for test_file in dataset_test_files:
    output_file = "gpt3-" + os.path.basename(test_file)
    print(f"generating metadata for {test_file} into {output_file}")
    nrec = 0
    with open(test_file) as infile, open(output_file, "w") as outfile:
        for line in infile:
            rec = json.loads(line)
            generated_metadata = get_completions(rec["text"])
            outrec = {"id": rec["id"], "url": rec["url"], "metadata_orig": rec["metadata"], "metadata_gen": generated_metadata}
            json.dump(outrec, outfile)
            outfile.write("\n")
            nrec += 1
    print(f"completed {nrec} records")
    print()

generating metadata for ../../llm-dataset/mono-swe-test.jsonl into gpt3-mono-swe-test.jsonl
completed 8 records

generating metadata for ../../llm-dataset/docthes-swe-test.jsonl into gpt3-docthes-swe-test.jsonl
completed 5 records

generating metadata for ../../llm-dataset/thes-eng-test.jsonl into gpt3-thes-eng-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/thes-swe-test.jsonl into gpt3-thes-swe-test.jsonl
completed 16 records

generating metadata for ../../llm-dataset/thes-fin-test.jsonl into gpt3-thes-fin-test.jsonl
completed 21 records

generating metadata for ../../llm-dataset/docthes-fin-test.jsonl into gpt3-docthes-fin-test.jsonl
completed 9 records

generating metadata for ../../llm-dataset/serial-fin-test.jsonl into gpt3-serial-fin-test.jsonl
completed 18 records

generating metadata for ../../llm-dataset/serial-swe-test.jsonl into gpt3-serial-swe-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/mono-fin-test.jsonl into gpt3

In [23]:
# Calculate rough similarity between original and generated metadata using Levenshtein normalized indel similarity

import pandas as pd
import Levenshtein

data = []
for test_file in dataset_test_files:
    gen_file = "gpt3-" + os.path.basename(test_file)
    _, subset, lang, _ = os.path.splitext(gen_file)[0].split('-')
    with open(gen_file) as gfile:
        for line in gfile:
            rec = json.loads(line)
            similarity = Levenshtein.ratio(rec["metadata_orig"], rec["metadata_gen"])
            data.append([subset, lang, rec["id"], rec["url"], similarity])

df = pd.DataFrame(data, columns=["subset", "lang", "id", "url", "similarity"])

print("Overall similarity:", df["similarity"].mean())

Overall similarity: 0.721646225749584


In [24]:
df.groupby(['subset'])["similarity"].mean()

subset
docthes    0.778195
mono       0.655294
serial     0.633551
thes       0.823571
Name: similarity, dtype: float64

In [25]:
df.groupby(['lang'])["similarity"].mean()

lang
eng    0.671907
fin    0.743314
swe    0.757140
Name: similarity, dtype: float64

In [26]:
df.groupby(['subset','lang'])["similarity"].mean()

subset   lang
docthes  eng     0.786059
         fin     0.792288
         swe     0.729233
mono     eng     0.645852
         fin     0.637751
         swe     0.707916
serial   eng     0.526967
         fin     0.703911
         swe     0.672509
thes     eng     0.749791
         fin     0.841554
         swe     0.864525
Name: similarity, dtype: float64