# Test the connection and API key

Make sure it's possible to use the OpenAI API. For this to work, the environment variable OPENAI_API_KEY must be set to a valid API key which has available credits.

In [11]:
import openai
import os

# read the OpenAI API key from an environment variable
openai.api_key = os.environ['OPENAI_API_KEY']

# test the API connection by making a simple request
response = openai.completions.create(model="davinci-002", prompt="Say this is a test", temperature=0, max_tokens=7)
print(response)
print(response.choices[0].text)

Completion(id='cmpl-8XQApyWlv7XL6cUQIo5Em4xskWmff', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=". I'm gonna say this is")], created=1702976127, model='davinci-002', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=7, prompt_tokens=5, total_tokens=12))
. I'm gonna say this is


# Prepare the fine-tuning set

Prepare a fine-tuning dataset and use it to fine-tune a GPT3 model.

In [12]:
import glob
import json
import tiktoken

PROMPT_SUFFIX = '\n\n###\n\n'
COMPLETION_STOP = '\n###'
TRAINFILE = 'fine-tune.jsonl'
VALIDATEFILE = 'validate.jsonl'
BASE_MODEL = 'babbage-002'
MAX_TOKENS = 12000  # Increased for babbage-002 model

dataset_train_files = glob.glob("../../llm-dataset/*-train.jsonl")
dataset_test_files = glob.glob("../../llm-dataset/*-test.jsonl")

encoding = tiktoken.encoding_for_model(BASE_MODEL)

def truncate_text(text):
    """truncate text so it contains at most MAX_TOKENS according to the OpenAI tokenizer"""
    tokens = encoding.encode(text)
    return encoding.decode(tokens[:MAX_TOKENS])

def create_sample(text, metadata):
    """create a fine-tuning sample from text and metadata about a single document"""
    return {'prompt': truncate_text(text) + PROMPT_SUFFIX,
            'completion': " " + metadata + COMPLETION_STOP}

def convert_to_samples(infiles, outfile):
    print(f"Creating {outfile}")
    nrec = 0
    with open(outfile, "w") as outf:
        for infile in infiles:
            print(f"- processing {infile}")
            with open(infile) as inf:
                for line in inf:
                    rec = json.loads(line)
                    sample = create_sample(rec["text"], rec["metadata"])
                    print(json.dumps(sample), file=outf)
                    nrec += 1
    print(f"{nrec} records converted")
    print()

convert_to_samples(dataset_train_files, TRAINFILE)
convert_to_samples(dataset_test_files, VALIDATEFILE)

Creating fine-tune.jsonl
- processing ../../llm-dataset/serial-fin-train.jsonl
- processing ../../llm-dataset/serial-swe-train.jsonl
- processing ../../llm-dataset/docthes-eng-train.jsonl
- processing ../../llm-dataset/docthes-fin-train.jsonl
- processing ../../llm-dataset/thes-fin-train.jsonl
- processing ../../llm-dataset/thes-swe-train.jsonl
- processing ../../llm-dataset/mono-swe-train.jsonl
- processing ../../llm-dataset/docthes-swe-train.jsonl
- processing ../../llm-dataset/mono-eng-train.jsonl
- processing ../../llm-dataset/mono-fin-train.jsonl
- processing ../../llm-dataset/thes-eng-train.jsonl
- processing ../../llm-dataset/serial-eng-train.jsonl
557 records converted

Creating validate.jsonl
- processing ../../llm-dataset/mono-swe-test.jsonl
- processing ../../llm-dataset/docthes-swe-test.jsonl
- processing ../../llm-dataset/thes-eng-test.jsonl
- processing ../../llm-dataset/thes-swe-test.jsonl
- processing ../../llm-dataset/thes-fin-test.jsonl
- processing ../../llm-dataset/

In [9]:
# Check that the fine-tuning data set is OK using the prepare_data tool.
# We will only use prepare_data as a validation aid and delete the "prepared"
# files that it helpfully creates.
!openai tools fine_tunes.prepare_data -f fine-tune.jsonl -q
!rm -f fine-tune_prepared.jsonl

!openai tools fine_tunes.prepare_data -f validate.jsonl -q
!rm -f validate_prepared.jsonl

Analyzing...

- Your file contains 557 prompt-completion pairs
- There are 7 examples that are very long. These are rows: [231, 257, 258, 268, 333, 468, 504]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.
- All prompts end with suffix `\n\n###\n\n`
- All completions end with suffix `\n###`

Based on the analysis we will perform the following actions:
- [Recommended] Remove 7 long examples [Y/n]: Y


Your data will be written to a new JSONL file. Proceed [Y/n]: Y

Wrote modified file to `fine-tune_prepared.jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "fine-tune_prepared.jsonl"

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `\n\n###\n\n` for the model to start generating completions, rather than continuing with the prompt. Make sure to include `stop=["\n###"]` so that the generated texts ends at the expected place.
Once your m

In [10]:
# OpenAI API and client have changed, now finetuning can or needs to be done
# with Python code, not CLI client

# Upload training data

upload_response = openai.files.create(
    file=open(TRAINFILE, "rb"),
    purpose="fine-tune"
)
trainfile_id = upload_response.id
upload_response

FileObject(id='file-tiZAgq70qPmQa7KDYHFP5Bzg', bytes=2970399, created_at=1701947205, filename='fine-tune.jsonl', object='file', purpose='fine-tune', status='uploaded', status_details=None)

In [11]:
# Perform the actual finetuning via the API. This can take a while, there can be a long queue.

openai.fine_tuning.jobs.create(
    training_file=trainfile_id,
    model="babbage-002"
)

FineTuningJob(id='ftjob-f3Plu07lCRlZlZYaWjCiWKLR', created_at=1701947281, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='babbage-002', object='fine_tuning.job', organization_id='org-5QEUW2DacClOLTNQvTEKMHdV', result_files=[], status='validating_files', trained_tokens=None, training_file='file-tiZAgq70qPmQa7KDYHFP5Bzg', validation_file=None)

In [14]:
fine_tuning_job_id = openai.fine_tuning.jobs.list(limit=10).data[0].id
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=fine_tuning_job_id, limit=20).data

[FineTuningJobEvent(id='ftevent-uZAdFSd2qHjZsgf2DOe4LwN9', created_at=1701949004, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-ChGZH2rgFtkoibq4ygIhNu5G', created_at=1701949002, level='info', message='New fine-tuned model created: ft:babbage-002:personal::8T6yHGdp', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-5w8o0kPU98fSSZ6BHZXWU3HK', created_at=1701948982, level='info', message='Step 1601/1671: training loss=0.47', object='fine_tuning.job.event', data={'step': 1601, 'train_loss': 0.47158852219581604, 'train_mean_token_accuracy': 0.8634920716285706}, type='metrics'),
 FineTuningJobEvent(id='ftevent-v36VuJqsnsZHi7MkVEZctCPA', created_at=1701948963, level='info', message='Step 1501/1671: training loss=0.22', object='fine_tuning.job.event', data={'step': 1501, 'train_loss': 0.2230607271194458, 'train_mean_token_accuracy': 0.9497487545013428

In [16]:
# store the model name from above fine tuning job

model_name = openai.fine_tuning.jobs.retrieve(fine_tuning_job_id).fine_tuned_model
model_name

'ft:babbage-002:personal::8T6yHGdp'

In [15]:
# Try out the fine-tuned model on a random test set record

import random

model_name = 'ft:babbage-002:personal::8T6yHGdp'

def get_completions(text):
    response = openai.completions.create(
                                    model=model_name,
                                    prompt=truncate_text(text) + PROMPT_SUFFIX,
                                    temperature=0,  # no fooling around!
                                    max_tokens=2048, # should be very plenty
                                    stop=[COMPLETION_STOP])  # stop at ###
    return response.choices[0].text.strip()

test_set_file = random.choice(dataset_test_files)
with open(test_set_file) as testfile:
    records = [json.loads(line) for line in testfile]
rec = random.choice(records)

print(f"Testing on {rec['id']} with PDF {rec['url']}")
print("---")
print("Curated metadata:")
print(rec["metadata"])
print("---")
print("Generated metadata:")
print(get_completions(rec["text"]))


Testing on https://www.julkari.fi/handle/10024/143683 with PDF https://www.julkari.fi/bitstream/handle/10024/143683/THL_infograafisarja_aluevaalit_170122_norra_osterbotten.pdf
---
Curated metadata:
Org. unit: THL
Issued: 2022
URN: URN:NBN:fi-fe202201132199
Language: swe
Publisher: Institutet för hälsa och välfärd THL = Terveyden ja hyvinvoinnin laitos
Title: Norra Österbotten och ojämlikhet : infograf åt välfärdsområdesfullmäktige
Alternative title: Pohjois-Pohjanmaa ja eriarvoisuus : infograafi aluevaltuutetulle
COAR type: research report
OKM type: D4 Julkaistu kehittämis- tai tutkimusraportti taikka -selvitys
---
Generated metadata:
Organization: THL
Issued: 2020
URN: URN:NBN:fi-fe2020110925364
Language: swe
Publisher: THL
ISSN (online): 2343-8464
Series name: Hälsogranskningar av de arbetslösa
Title: Hälsogranskningar av de arbetslösa i Norra Österbotten 2018-2019
COAR type: research report
OKM type: D4 Julkaistu kehittämis- tai tutkimusraportti taikka -selvitys


In [16]:
%%time

import os.path

for test_file in dataset_test_files:
    output_file = "gpt3-" + os.path.basename(test_file)
    print(f"generating metadata for {test_file} into {output_file}")
    nrec = 0
    with open(test_file) as infile, open(output_file, "w") as outfile:
        for line in infile:
            rec = json.loads(line)
            generated_metadata = get_completions(rec["text"])
            outrec = {"id": rec["id"], "url": rec["url"], "ground_truth": rec["metadata"], "prediction": generated_metadata}
            json.dump(outrec, outfile)
            outfile.write("\n")
            nrec += 1
    print(f"completed {nrec} records")
    print()

generating metadata for ../../llm-dataset/mono-swe-test.jsonl into gpt3-mono-swe-test.jsonl
completed 8 records

generating metadata for ../../llm-dataset/docthes-swe-test.jsonl into gpt3-docthes-swe-test.jsonl
completed 5 records

generating metadata for ../../llm-dataset/thes-eng-test.jsonl into gpt3-thes-eng-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/thes-swe-test.jsonl into gpt3-thes-swe-test.jsonl
completed 16 records

generating metadata for ../../llm-dataset/thes-fin-test.jsonl into gpt3-thes-fin-test.jsonl
completed 21 records

generating metadata for ../../llm-dataset/docthes-fin-test.jsonl into gpt3-docthes-fin-test.jsonl
completed 9 records

generating metadata for ../../llm-dataset/serial-fin-test.jsonl into gpt3-serial-fin-test.jsonl
completed 18 records

generating metadata for ../../llm-dataset/serial-swe-test.jsonl into gpt3-serial-swe-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/mono-fin-test.jsonl into gpt3

In [64]:
# Convert the results FinGreyLit dataschema

from glob import glob
import json


records =[]
prediction_records_files = glob("gpt3-*.jsonl")

KEYS_MAP = {
    "Contributor": "dc.contributor",
    "Author": "dc.contributor.author",
    "Supervisor": "dc.contributor.degreeSupervisor",
    "Department": "dc.contributor.department",
    "Faculty": "dc.contributor.faculty",
    "Organization": "dc.contributor.organization",
    "Opponent": "dc.contributor.opponent",
    "Reviewer": "dc.contributor.reviewer",
    "Editor": "dc.contributor.editor",
    "Org. unit": "dc.contributor.orgunit",
    "Issued": "dc.date.issued",
    "ISBN (printed)": "dc.identifier.isbn",
    "ISBN (online)": "dc.identifier.isbn",
    "ISSN (printed)": "dc.relation.pissn",
    "ISSN (online)": "dc.relation.eissn",
    "Volume": "dc.relation.volume",
    "Issue": "dc.relation.issue",
    "Journal name": "dc.relation.ispartofjournal",
    "Series name": "dc.relation.ispartofseries",
    "Number in series": "dc.relation.numberinseries",
    "Series year": "dc.series.year",
    "DOI": "dc.doi",
    "URN": "dc.identifier.urn",
    "Language": "dc.language.iso",
    "Publisher": "dc.publisher",
    "Publisher (online)": "dc.publisher",  # TODO Missing from schme?
    "COAR type": "dc.type.coar",
    "OKM type": "dc.type.okm",
    "Thesis level": "dc.type.ontasot",
    "Discipline": "dc.subject.discipline",
    "Degree program": "dc.subject.degreeprogram",
    "Title": "dc.title",
    "Alternative title": "dc.title.alternative",
    "Contractor": "dc.relation.contractor",
    "Page range": "dc.format.pagerange",
}

LIST_FIELDS = [
    "dc.contributor.author",
    "dc.identifier.isbn",
    "dc.publisher",
]

def convert_to_scheme(metadata_str):
    field_lines = metadata_str.split('\n')
    out = {}

    for fl in field_lines:
        try:
            key, value = fl.split(":", maxsplit=1)
            dc_key = KEYS_MAP[key.strip()]
        except (KeyError, ValueError):
            print(f"Invalid line: {fl}")
            continue
        value = value.strip()
        if dc_key in LIST_FIELDS:
            if not dc_key in out:
                out[dc_key] = []
            out[dc_key].append(value)
        else:
            out[dc_key] = value
    return out

prediction_records = []
for rec_file in prediction_records_files:
    print(rec_file)
    doctype = rec_file.split("-")[1]
    with open(rec_file, "rt") as rf:
        for line in rf:
            rec_in = json.loads(line)
            rec_out = {
                "rowid": rec_in["id"],
                "url": rec_in["url"],
                "doctype": doctype,
                }
            rec_out["ground_truth"] = convert_to_scheme(rec_in["ground_truth"])
            rec_out["prediction"] = convert_to_scheme(rec_in["prediction"])
            prediction_records.append(rec_out)

gpt3-thes-swe-test.jsonl
gpt3-mono-eng-test.jsonl
Invalid line: The following is a list of the 2017-2018 winners of the National Book Awards. The winners will be announced on April 25, 2018, at a ceremony in New York City. The National Book Foundation is a private, nonprofit organization that promotes the reading public and the creation and appreciation of books. The Foundation is based in New York City and was founded in 1937 by Aldo Leopold, author of A Sand County Almanac. The Foundation’s mission is to recognize and honor the best books of the year and to foster the creation and appreciation of books. The Foundation is supported by the National Endowment for the Arts, the National Endowment for the Humanities, and the John S. and James L. Knight Foundation. The Foundation also receives generous support from the City of New York, the New York State Council on the Arts with the support of Governor Andrew M. Cuomo and the New York State Legislature, and the National Endowment for the 

In [65]:
# Analyze the extracted metadata

import pandas as pd
import sys
from glob import glob
sys.path.append('..')
from eval import MetadataEvaluator

# prediction_records_files = glob('*.jsonl')

evaluator = MetadataEvaluator()  # TODO Pass records here
results = evaluator.evaluate_records(prediction_records)

df = pd.DataFrame(results)
df.head()

Unnamed: 0,rowid,language,field,predicted_val,true_val,match_type,score
0,https://www.doria.fi/handle/10024/186483,swe,dc.contributor.author,"[Holmström, Ellinoora]","[Holmström, Ellinoora]",exact,1
1,https://www.doria.fi/handle/10024/186483,swe,dc.contributor.faculty,Kemi- och processteknik,Fakulteten för naturvetenskaper och teknik,wrong,0
2,https://www.doria.fi/handle/10024/186483,swe,dc.contributor.organization,Åbo Akademi,Åbo Akademi,exact,1
3,https://www.doria.fi/handle/10024/186483,swe,dc.date.issued,2023-01-31,2023,superset,1
4,https://www.doria.fi/handle/10024/186483,swe,dc.identifier.urn,URN:NBN:fi-fe202302011465,URN:NBN:fi-fe202301102207,wrong,0


In [68]:
df.groupby(['language','field'])['score'].mean().mean()

0.4329723876242285

In [58]:
df.groupby(['language','field'])['score'].agg(['mean', 'size'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size
language,field,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,dc.contributor,0.333333,3
eng,dc.contributor.author,0.763636,55
eng,dc.contributor.degreeSupervisor,0.125,16
eng,dc.contributor.department,0.333333,6
eng,dc.contributor.editor,0.142857,7
eng,dc.contributor.faculty,0.3125,32
eng,dc.contributor.opponent,0.4375,16
eng,dc.contributor.organization,0.666667,45
eng,dc.contributor.reviewer,0.071429,14
eng,dc.date.issued,0.9,50


In [59]:
# Save results statistics in a md file

results_table_file = 'results-gpt3-' + model_name.split(":")[1] + '.md'
with open(results_table_file, "wt") as ofile:
    print(
        df.groupby(['language','field'])['score']
            .agg(['mean', 'size'])
            .reset_index()
            .rename(columns={'idx1': '', 'idx2': ''})
            .to_markdown(tablefmt='github', index=False),
        file=ofile
    )