# Test the connection and API key

Make sure it's possible to use the OpenAI API. For this to work, the environment variable OPENAI_API_KEY must be set to a valid API key which has available credits.

In [1]:
import openai
import os

# Configure the client with the key from the environment; a missing
# OPENAI_API_KEY raises KeyError right here.
openai.api_key = os.environ['OPENAI_API_KEY']

# Smoke test: request a very short completion (temperature=0, 7 tokens) and
# show both the raw response object and the generated text.
response = openai.completions.create(
    model="davinci-002",
    prompt="Say this is a test",
    temperature=0,
    max_tokens=7,
)
print(response)
print(response.choices[0].text)

Completion(id='cmpl-8ideGgrQvNxafrKjn4cXgAUFBRdVq', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=". I'm gonna say this is")], created=1705649532, model='davinci-002', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=7, prompt_tokens=5, total_tokens=12))
. I'm gonna say this is


# Prepare the fine-tuning set

Prepare a fine-tuning dataset and use it to fine-tune a GPT-3 model.

In [2]:
import glob
import json
import tiktoken

# --- model and tokenizer ---------------------------------------------------
BASE_MODEL = 'babbage-002'
MAX_TOKENS = 12000  # Increased for babbage-002 model
encoding = tiktoken.encoding_for_model(BASE_MODEL)

# --- prompt / completion framing -------------------------------------------
# Every prompt ends with PROMPT_SUFFIX and every completion with
# COMPLETION_STOP (see create_sample).
PROMPT_SUFFIX = '\n\n###\n\n'
COMPLETION_STOP = '\n###'

# --- files -------------------------------------------------------------------
# Output files written by convert_to_samples.
TRAINFILE = 'fine-tune.jsonl'
VALIDATEFILE = 'validate.jsonl'

# Input JSONL files of the dataset, split by file name suffix.
dataset_train_files = glob.glob("../../llm-dataset/*-train.jsonl")
dataset_test_files = glob.glob("../../llm-dataset/*-test.jsonl")

def truncate_text(text):
    """Truncate text so it contains at most MAX_TOKENS tokens.

    The text is tokenized with the module-level ``encoding``, cut to the first
    MAX_TOKENS tokens and decoded back to a string.

    Args:
        text: the document text to truncate.

    Returns:
        The (possibly shortened) text as a string.
    """
    # disallowed_special=() makes the tokenizer treat strings that look like
    # special tokens (e.g. "<|endoftext|>") as ordinary text; with the default
    # setting encode() raises ValueError when a document contains one.
    tokens = encoding.encode(text, disallowed_special=())
    return encoding.decode(tokens[:MAX_TOKENS])

def create_sample(text, metadata):
    """Build one prompt/completion fine-tuning pair for a single document.

    The prompt is the token-truncated document text followed by PROMPT_SUFFIX;
    the completion is the metadata preceded by a single space and terminated
    by COMPLETION_STOP.
    """
    prompt = truncate_text(text) + PROMPT_SUFFIX
    completion = " " + metadata + COMPLETION_STOP
    return {'prompt': prompt, 'completion': completion}

def convert_to_samples(infiles, outfile):
    """Convert dataset JSONL files into one fine-tuning JSONL file.

    Each input line is parsed as a JSON record; its "text" and "metadata"
    values are turned into a prompt/completion sample with create_sample and
    written as one JSON line to ``outfile``. Progress and the total record
    count are printed.

    Args:
        infiles: iterable of paths to input JSONL files.
        outfile: path of the JSONL file to create (overwritten if it exists).
    """
    print(f"Creating {outfile}")
    nrec = 0
    # Explicit UTF-8: the default encoding of open() depends on the locale,
    # and the records contain non-ASCII text.
    with open(outfile, "w", encoding="utf-8") as outf:
        for infile in infiles:
            print(f"- processing {infile}")
            with open(infile, encoding="utf-8") as inf:
                for line in inf:
                    if not line.strip():
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    rec = json.loads(line)
                    sample = create_sample(rec["text"], rec["metadata"])
                    print(json.dumps(sample), file=outf)
                    nrec += 1
    print(f"{nrec} records converted")
    print()

# Training split goes to TRAINFILE, test split to VALIDATEFILE.
for source_files, target_file in (
    (dataset_train_files, TRAINFILE),
    (dataset_test_files, VALIDATEFILE),
):
    convert_to_samples(source_files, target_file)

Creating fine-tune.jsonl
- processing ../../llm-dataset/serial-fin-train.jsonl
- processing ../../llm-dataset/serial-swe-train.jsonl
- processing ../../llm-dataset/docthes-eng-train.jsonl
- processing ../../llm-dataset/docthes-fin-train.jsonl
- processing ../../llm-dataset/thes-fin-train.jsonl
- processing ../../llm-dataset/thes-swe-train.jsonl
- processing ../../llm-dataset/mono-swe-train.jsonl
- processing ../../llm-dataset/docthes-swe-train.jsonl
- processing ../../llm-dataset/mono-eng-train.jsonl
- processing ../../llm-dataset/mono-fin-train.jsonl
- processing ../../llm-dataset/thes-eng-train.jsonl
- processing ../../llm-dataset/serial-eng-train.jsonl
557 records converted

Creating validate.jsonl
- processing ../../llm-dataset/mono-swe-test.jsonl
- processing ../../llm-dataset/docthes-swe-test.jsonl
- processing ../../llm-dataset/thes-eng-test.jsonl
- processing ../../llm-dataset/thes-swe-test.jsonl
- processing ../../llm-dataset/thes-fin-test.jsonl
- processing ../../llm-dataset/

In [3]:
# Check that the fine-tuning data sets are OK using the prepare_data tool.
# prepare_data is used only as a validation aid: its analysis is what we want
# to read, so the "*_prepared.jsonl" files it writes are deleted right away.
# NOTE(review): the file names below repeat the values of TRAINFILE and
# VALIDATEFILE as literals; keep them in sync if those constants change.

# training set
!openai tools fine_tunes.prepare_data -f fine-tune.jsonl -q
!rm -f fine-tune_prepared.jsonl

# validation set
!openai tools fine_tunes.prepare_data -f validate.jsonl -q
!rm -f validate_prepared.jsonl

Analyzing...

- Your file contains 557 prompt-completion pairs
- There are 3 examples that are very long. These are rows: [56, 537, 554]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.
- All prompts end with suffix `\n\n###\n\n`
- All completions end with suffix `\n###`

Based on the analysis we will perform the following actions:
- [Recommended] Remove 3 long examples [Y/n]: Y


Your data will be written to a new JSONL file. Proceed [Y/n]: Y

Wrote modified file to `fine-tune_prepared.jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "fine-tune_prepared.jsonl"

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `\n\n###\n\n` for the model to start generating completions, rather than continuing with the prompt. Make sure to include `stop=["\n###"]` so that the generated texts ends at the expected place.
Once your model starts training,

In [4]:
# The OpenAI API and client have changed: fine-tuning is now done with Python
# code instead of the CLI client.

# Upload the training data. The context manager closes the file handle once
# the upload call has returned.
with open(TRAINFILE, "rb") as train_fh:
    upload_response = openai.files.create(
        file=train_fh,
        purpose="fine-tune"
    )

# The id of the uploaded file is needed when creating the fine-tuning job.
trainfile_id = upload_response.id
upload_response

FileObject(id='file-jzH6x5v6WiDWLwV4g5NytVvC', bytes=1986999, created_at=1705649540, filename='fine-tune.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [5]:
# Perform the actual fine-tuning via the API. This can take a while, there can be a long queue.
# BASE_MODEL is used (rather than repeating the model name) so that the model
# being fine-tuned always matches the tokenizer used to truncate the prompts.

openai.fine_tuning.jobs.create(
    training_file=trainfile_id,
    model=BASE_MODEL
)

FineTuningJob(id='ftjob-75HiJ3ZiIr6Bs3pB9mBjT4yF', created_at=1705649541, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='babbage-002', object='fine_tuning.job', organization_id='org-5QEUW2DacClOLTNQvTEKMHdV', result_files=[], status='validating_files', trained_tokens=None, training_file='file-jzH6x5v6WiDWLwV4g5NytVvC', validation_file=None)

In [6]:
# Take the first job in the listing and show its latest events.
# NOTE(review): this assumes the first listed job is the one created above -
# confirm if several jobs run on the same account.
recent_jobs = openai.fine_tuning.jobs.list(limit=10)
fine_tuning_job_id = recent_jobs.data[0].id
openai.fine_tuning.jobs.list_events(
    fine_tuning_job_id=fine_tuning_job_id,
    limit=20,
).data

[FineTuningJobEvent(id='ftevent-oVbw5gtgwjysFygEQt9LmOFi', created_at=1705649541, level='info', message='Validating training file: file-jzH6x5v6WiDWLwV4g5NytVvC', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-eHe74rlvIOC0Lzk8ybrBNr3A', created_at=1705649541, level='info', message='Created fine-tuning job: ftjob-75HiJ3ZiIr6Bs3pB9mBjT4yF', object='fine_tuning.job.event', data={}, type='message')]

In [10]:
# Look up the fine-tuning job again and keep the name of the resulting model.
# NOTE(review): fine_tuned_model was None in the job output shown right after
# creation, so re-run this cell once the job has finished.

finished_job = openai.fine_tuning.jobs.retrieve(fine_tuning_job_id)
model_name = finished_job.fine_tuned_model
model_name

'ft:babbage-002:personal::8idlFmw6'

In [11]:
# Try out the fine-tuned model on a random test set record

import random

# NOTE(review): hard-coded model name. It replaces whatever value an earlier
# cell stored in model_name, so update it after fine-tuning a new model.
model_name = 'ft:babbage-002:personal::8idlFmw6'

def get_completions(text):
    """Generate metadata for a document text with the fine-tuned model.

    The text is truncated with truncate_text and terminated with PROMPT_SUFFIX
    before it is sent to the model named by ``model_name``. Returns the text
    of the first choice with surrounding whitespace stripped.
    """
    request = {
        "model": model_name,
        "prompt": truncate_text(text) + PROMPT_SUFFIX,
        "temperature": 0,            # no fooling around!
        "max_tokens": 2048,          # should be very plenty
        "stop": [COMPLETION_STOP],   # stop at ###
    }
    response = openai.completions.create(**request)
    first_choice = response.choices[0]
    return first_choice.text.strip()

# Pick a random test file and a random record from it.
test_set_file = random.choice(dataset_test_files)
# Explicit UTF-8: the default encoding of open() depends on the locale, and
# the records contain non-ASCII text. Blank lines are skipped.
with open(test_set_file, encoding="utf-8") as testfile:
    records = [json.loads(line) for line in testfile if line.strip()]
rec = random.choice(records)

# Show the curated metadata next to the model output for comparison.
print(f"Testing on {rec['id']} with PDF {rec['url']}")
print("---")
print("Curated metadata:")
print(rec["metadata"])
print("---")
print("Generated metadata:")
print(get_completions(rec["text"]))


Testing on https://trepo.tuni.fi/handle/10024/125025 with PDF https://trepo.tuni.fi/bitstream/handle/10024/125025/978-952-03-1880-2.pdf
---
Curated metadata:
Author: Mantula, Paula
Supervisor: Docent Satu Mäkelä, Tampere University
Supervisor: Professor Emeritus Jukka Mustonen, Tampere University
Faculty: Lääketieteen ja terveysteknologian tiedekunta - Faculty of Medicine and Health Technology
Opponent: Professor Risto Tertti, University of Turku
Organization: Tampere University
Reviewer: Docent Mari Kanerva , University of Helsinki
Reviewer: Docent Risto Ikäheimo, University of Oulu
Issued: 2021-04-16
ISBN (online): 978-952-03-1880-2
URN: URN:ISBN:978-952-03-1880-2
Language: eng
Publisher: Tampere University
ISSN (online): 2490-0028
ISBN (printed): 978­952­03­1879­6
Series name: Tampere University Dissertations - Tampereen yliopiston väitöskirjat
Number in series: 385
ISSN (printed): 2489-9860
Degree program: Lääketieteen, biotieteiden ja biolääketieteen tekniikan tohtoriohjelma - Doc

Author: Mantula, Paula
Supervisor: Docent Satu Mäkelä, Tampere University
Faculty: Lääketieteen ja terveysteknologian tiedekunta - Faculty of Medicine and Health Sciences
Opponent: Professor Risto Tertti, University of Turku
Organization: Tampere University
Reviewer: Docent Risto Ikäheimo, Docent Mari Kanerva, University of Oulu
Reviewer: Professor Emeritus Jukka Mustonen, Tampere University
Issued: 2021-03-04
ISBN (online): 978-952-03-1880-2
URN: URN:ISBN:978-952-03-1880-2
Language: eng
Publisher: Tampere University
ISSN (online): 2490-0028
ISBN (printed): 978-952-03-1879-6
Series name: Tampere University Dissertations - Tampereen yliopiston väitöskirjat
Number in series: 385
ISSN (printed): 2489-9860
Degree program: Farmakologian, lääketieteen ja biotieteiden tohtoriohjelma - Doctoral Programme in Medicinal and Clinical Biochemistry
Title: Role of Urinary Findings and Adipokines in Puumala Virus-induced Acute Kidney Injury
COAR type: doctoral thesis
OKM type: G4 Monografiaväitöskirja

In [12]:
%%time

import os.path

# For every test file, generate metadata for each record and store it next to
# the curated metadata in "gpt3-<test file name>" (one JSON object per line).
for test_file in dataset_test_files:
    output_file = "gpt3-" + os.path.basename(test_file)
    print(f"generating metadata for {test_file} into {output_file}")
    nrec = 0
    # Explicit UTF-8: the default encoding of open() depends on the locale,
    # and the records contain non-ASCII text.
    with open(test_file, encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline)
                continue
            rec = json.loads(line)
            generated_metadata = get_completions(rec["text"])
            outrec = {"id": rec["id"], "url": rec["url"], "ground_truth": rec["metadata"], "prediction": generated_metadata}
            json.dump(outrec, outfile)
            outfile.write("\n")
            nrec += 1
    print(f"completed {nrec} records")
    print()

generating metadata for ../../llm-dataset/mono-swe-test.jsonl into gpt3-mono-swe-test.jsonl
completed 8 records

generating metadata for ../../llm-dataset/docthes-swe-test.jsonl into gpt3-docthes-swe-test.jsonl
completed 5 records

generating metadata for ../../llm-dataset/thes-eng-test.jsonl into gpt3-thes-eng-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/thes-swe-test.jsonl into gpt3-thes-swe-test.jsonl
completed 16 records

generating metadata for ../../llm-dataset/thes-fin-test.jsonl into gpt3-thes-fin-test.jsonl
completed 21 records

generating metadata for ../../llm-dataset/docthes-fin-test.jsonl into gpt3-docthes-fin-test.jsonl
completed 9 records

generating metadata for ../../llm-dataset/serial-fin-test.jsonl into gpt3-serial-fin-test.jsonl
completed 18 records

generating metadata for ../../llm-dataset/serial-swe-test.jsonl into gpt3-serial-swe-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/mono-fin-test.jsonl into gpt3

In [3]:
# Convert the results to the FinGreyLit data schema and save them to a file

# Import the module (not "from glob import glob"): an earlier cell calls
# glob.glob(...), and rebinding the name "glob" to the function would break
# that cell when it is re-run in the same kernel.
import glob
import json

# Prediction files written by the generation cell; sorted so that the order of
# the converted records does not depend on directory listing order.
prediction_records_files = sorted(glob.glob("gpt3-*.jsonl"))

# Maps the field labels used in the metadata text (the part before the first
# ":" on each line) to the field names of the target schema.
# Several labels may map to the same target field (both ISBN labels, both
# publisher labels).
KEYS_MAP = {
    "Contributor":        "dc.contributor",
    "Author":             "dc.contributor.author",
    "Supervisor":         "dc.contributor.degreeSupervisor",
    "Department":         "dc.contributor.department",
    "Editor":             "dc.contributor.editor",
    "Faculty":            "dc.contributor.faculty",
    "Opponent":           "dc.contributor.opponent",
    "Organization":       "dc.contributor.organization",
    "Org. unit":          "dc.contributor.orgunit",
    "Reviewer":           "dc.contributor.reviewer",
    "Issued":             "dc.date.issued",
    "extent":             "dc.format.extent",
    "Page range":         "dc.format.pagerange",
    "ISBN (printed)":     "dc.identifier.isbn",
    "ISBN (online)":      "dc.identifier.isbn",
    "URN":                "dc.identifier.urn",
    "Language":           "dc.language.iso",
    "Publisher":          "dc.publisher",
    "Publisher (online)": "dc.publisher",
    "Contractor":         "dc.relation.contractor",
    "DOI":                "dc.relation.doi",
    "ISSN (online)":      "dc.relation.eissn",
    "risbn":              "dc.relation.isbn",
    "Journal name":       "dc.relation.ispartofjournal",  # was misspelled "reladc.tion.ispartofjournal"
    "Series name":        "dc.relation.ispartofseries",
    "Issue":              "dc.relation.issue",
    "Number in series":   "dc.relation.numberinseries",  # was missing the "." after "relation"
    "ISSN (printed)":     "dc.relation.pissn",
    "Volume":             "dc.relation.volume",
    "Series year":        "dc.series.year",
    "Degree program":     "dc.subject.degreeprogram",
    "Discipline":         "dc.subject.discipline",
    "Title":              "dc.title",
    "Alternative title":  "dc.title.alternative",
    "COAR type":          "dc.type.coar",
    "OKM type":           "dc.type.okm",
    "Thesis level":       "dc.type.ontasot",
}

# Target fields whose values convert_to_scheme collects into a list instead of
# storing a single string.
LIST_FIELDS = [
    "dc.contributor.author",  # one entry per "Author" line
    "dc.identifier.isbn",     # "ISBN (printed)" and "ISBN (online)" both land here
    "dc.relation.isbn",       # "risbn"
    "dc.publisher",           # "Publisher" and "Publisher (online)" both land here
]

def convert_to_scheme(metadata_str):
    """Convert "Label: value" metadata text into a dict keyed by schema field.

    The text is split into lines; each line is split at its first ":" and the
    label is translated to a target field name via KEYS_MAP. Lines without a
    ":" or with an unknown label are reported with "Invalid line: ..." and
    skipped. Blank lines are skipped silently.

    Args:
        metadata_str: metadata as newline-separated "Label: value" lines.

    Returns:
        A dict with one key per KEYS_MAP target field. Fields that never occur
        are None. Fields in LIST_FIELDS hold a list of all values in input
        order; for any other field a repeated label overwrites the earlier
        value, so only the last one is kept.
    """
    out = dict.fromkeys(KEYS_MAP.values())  # ensure keys exist

    for fl in metadata_str.split('\n'):
        if not fl.strip():
            # an empty line is not a malformed field; do not report it
            continue
        try:
            key, value = fl.split(":", maxsplit=1)
            dc_key = KEYS_MAP[key.strip()]
        except (KeyError, ValueError):
            # ValueError: no ":" in the line; KeyError: unknown label
            print(f"Invalid line: {fl}")
            continue
        value = value.strip()
        if dc_key in LIST_FIELDS:
            if out[dc_key] is None:
                out[dc_key] = []
            out[dc_key].append(value)
        else:
            out[dc_key] = value
    return out

# Convert every record of every prediction file into the target schema.
prediction_records = []
for rec_file in prediction_records_files:
    print(rec_file)
    # the document type is the second "-"-separated part of the file name,
    # e.g. "thes" in "gpt3-thes-swe-test.jsonl"
    doctype = rec_file.split("-")[1]
    with open(rec_file, "rt") as rf:
        for line in rf:
            rec_in = json.loads(line)
            prediction_records.append({
                "rowid": rec_in["id"],
                "url": rec_in["url"],
                "doctype": doctype,
                "ground_truth": convert_to_scheme(rec_in["ground_truth"]),
                "prediction": convert_to_scheme(rec_in["prediction"]),
            })


# Store the converted records as JSONL: one JSON object per line.
with open('test-records.jsonl', 'w') as outfile:
    for rec in prediction_records:
        outfile.write(json.dumps(rec) + "\n")


gpt3-thes-swe-test.jsonl
Invalid line: From the interviews and risk management studies it could be concluded that the key
Invalid line: Key words: COSO, risk, risk management, Enterprise Risk Management
Invalid line: URN (online): URN:NBN:fi-fe2021051630480
Invalid line: Publisher version: https://www.abo.fi/education/en/research/publications/2021/05/forskningsraportti-risk-management-in-a-company-is-more-proactive-and-has-a-good-overall-view-of-both-negative-events-and-potential-opportunities-there-are-several-international-risk-management-frameworks-and-international-risk-management-standards-that-have-helped-organizations-to-develop-and-implement-an-effective-risk-management-one-of-these-is-the-framework-on-the-corporate-level-risk-management-issued-by-the-committee-of-sponsoring-organizations-of-the-treadway-commission-coso-the-aim-of-this-thesis-was-to-investigate-how-the-connection-between-coso-s-normative-risk-management-frameworks-can-be-expressed-in-practice-and-if-the-framewo

In [1]:
# Analyze the statistics of the extracted metadata and save to file

import sys
# The evaluation module is imported from the parent directory; the guard keeps
# sys.path free of duplicates when the cell is re-run.
if '..' not in sys.path:
    sys.path.append('..')
from eval import MetadataEvaluator

# model_name is part of the results file name. Reuse the value from the
# current kernel when there is one; otherwise fall back to the fine-tuned
# model used for the stored predictions, so that this cell also runs in a
# fresh kernel instead of failing with NameError.
model_name = globals().get('model_name', 'ft:babbage-002:personal::8idlFmw6')

evaluator = MetadataEvaluator('test-records.jsonl')
results = evaluator.evaluate_records()
# Use only the fields that Meteor extracts
fields = [
        "dc.contributor.author",
        "dc.date.issued",
        "dc.identifier.isbn",
        "dc.language.iso",
        "dc.publisher",
        "dc.relation.eissn",
        "dc.title",
    ]
statistics_filename = '../results-openai-gpt3-api-' + model_name + '.md'
evaluator.save_md(results, statistics_filename, fields)
