# Fine-Tuning GPT-3.5-turbo

- Finetuning GPT-3.5-turbo with **features as text** and **all-in-one strategy**.

- Here, the argument component (AC) and its sentence are given as features.

- Data files: `data_train_v1.jsonl`, `data_val_v1.jsonl`, `data_test_v1.jsonl`

## Libraries

In [3]:
# !pip install --upgrade pip
# !pip install openai

In [132]:
import os
import json
import pickle
import pandas as pd
from pathlib import Path

from sklearn.metrics import classification_report

import openai
from openai import OpenAI

## API key

In [29]:
api_key = "your OpenAI API key"
openai.api_key = api_key

## Upload data to OpenAI

In [30]:
data_dir = os.path.join(os.getcwd(), "data")

In [31]:
train_file_name = os.path.join(data_dir, "data_train_v1.jsonl")
val_file_name = os.path.join(data_dir, "data_val_v1.jsonl")
test_file_name = os.path.join(data_dir, "data_test_v1.jsonl")

In [32]:
client = OpenAI(api_key=api_key)

### Train set

In [36]:
train_upload_response = client.files.create(
    file = Path(train_file_name),
    purpose = "fine-tune"
)

In [37]:
train_upload_response

FileObject(id='file-yZ4zxtjZfiw8nxEjxQGGS61z', bytes=2398726, created_at=1703783652, filename='data_train_v1.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [38]:
train_file_id = train_upload_response.id
train_file_id

'file-yZ4zxtjZfiw8nxEjxQGGS61z'

### Validation set

In [39]:
val_upload_response = client.files.create(
    file = Path(val_file_name),
    purpose = "fine-tune"
)

In [40]:
val_upload_response

FileObject(id='file-wIpI9iikCT9UMesXMBcPgAvn', bytes=256512, created_at=1703783698, filename='data_val_v1.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [41]:
val_file_id = val_upload_response.id
val_file_id

'file-wIpI9iikCT9UMesXMBcPgAvn'

### Test set

In [42]:
test_upload_response = client.files.create(
    file = Path(test_file_name),
    purpose = "fine-tune"
)

In [43]:
test_upload_response

FileObject(id='file-s6zneSO0OW1jyfpY8nvHTKWr', bytes=681700, created_at=1703783739, filename='data_test_v1.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [44]:
test_file_id = test_upload_response.id
test_file_id

'file-s6zneSO0OW1jyfpY8nvHTKWr'

## Fine-tune model

In [45]:
# Lauch fine-tuning

finetune_response = client.fine_tuning.jobs.create(
    training_file = train_file_id,
    validation_file = val_file_id,
    model = "gpt-3.5-turbo"
)

In [86]:
print(finetune_response.id)

ftjob-L9sCLmzuc8IYkNHWLEEr3NAk


In [81]:
# Checking progress

finetune_events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=finetune_response.id, limit=10)
finetune_events.model_dump()

{'data': [{'id': 'ftevent-Ng6REPWaMa9o5FiCF8YOQs78',
   'created_at': 1703787925,
   'level': 'info',
   'message': 'Step 1601/1634: training loss=0.23, validation loss=0.23',
   'object': 'fine_tuning.job.event',
   'data': {'step': 1601,
    'train_loss': 0.22546571493148804,
    'valid_loss': 0.2277417018495757,
    'train_mean_token_accuracy': 0.8709677457809448,
    'valid_mean_token_accuracy': 0.1724137931034483},
   'type': 'metrics'},
  {'id': 'ftevent-veFkFMn5wlMNh8PJPzqSqiE1',
   'created_at': 1703787677,
   'level': 'info',
   'message': 'Step 1501/1634: training loss=0.12, validation loss=0.12',
   'object': 'fine_tuning.job.event',
   'data': {'step': 1501,
    'train_loss': 0.11730150878429413,
    'valid_loss': 0.12349657858571698,
    'train_mean_token_accuracy': 0.9677419066429138,
    'valid_mean_token_accuracy': 0.22580645161290322},
   'type': 'metrics'},
  {'id': 'ftevent-MNEdG8XGZGvg3uJxuvB6KzVV',
   'created_at': 1703787433,
   'level': 'info',
   'message': 'Ste

In [83]:
retrieve_response = client.fine_tuning.jobs.retrieve(finetune_response.id)
retrieve_response.model_dump()

{'id': 'ftjob-L9sCLmzuc8IYkNHWLEEr3NAk',
 'created_at': 1703783842,
 'error': None,
 'fine_tuned_model': 'ft:gpt-3.5-turbo-0613:personal::8apNjlMW',
 'finished_at': 1703788011,
 'hyperparameters': {'n_epochs': 3,
  'batch_size': 8,
  'learning_rate_multiplier': 2},
 'model': 'gpt-3.5-turbo-0613',
 'object': 'fine_tuning.job',
 'organization_id': 'org-ao865HLfwm7KSarTu10iG90O',
 'result_files': ['file-GnmMzFb5hptz1pmkyPLrwoa4'],
 'status': 'succeeded',
 'trained_tokens': 1316811,
 'training_file': 'file-yZ4zxtjZfiw8nxEjxQGGS61z',
 'validation_file': 'file-wIpI9iikCT9UMesXMBcPgAvn'}

## Save fine-tuned model

In [95]:
# Option 3

if retrieve_response.fine_tuned_model == None:
    
    finetuned_model = client.fine_tuning.jobs.retrieve("<JOB ID from API>").fine_tuned_model

else:
    
    finetuned_model = retrieve_response.model_dump()["fine_tuned_model"]

In [97]:
finetuned_model

'ft:gpt-3.5-turbo-0613:personal::8apNjlMW'

## Evaluate on test set

In [125]:
# Evaluate on 1 sample

response = client.chat.completions.create(
    model=finetuned_model,
    messages=[{"role": "system", "content": "### You are an expert in linguistics and you will classify an arguement component into three possible classes: major claim, claim, or premise.\n"}, {"role": "user", "content": "### Here is an argument component given in quotation marks: \"it is necessary for universities to respect the individual choice of subject due to the diversity of boys and girls, and we ca not forcedly put the same numbers of males and females into every subject\"\nIs this argument compoment a major claim, a claim, or a premise? No other answer besides these three is accepted.\n    "}, {"role": "assistant", "content": ""}]
)

print(response.choices[0].message.content)

premise


In [127]:
# Evaluate the whole test set

predictions_l = []

with open(os.path.join(data_dir, "data_test_v1.jsonl"), 'r') as fh:
    
    lines_l = [json.loads(line) for line in fh]
    
    for line in lines_l:
        
        messages = line["messages"]
        
        response = client.chat.completions.create(
        model=finetuned_model,
        messages=messages
        )

        predictions_l.append(response.choices[0].message.content)

In [130]:
len(predictions_l)

1266

## Results

In [133]:
df = pd.read_csv(os.path.join(data_dir, "persuasive_essays_dataset.csv"), index_col=0)

In [139]:
grounds_l = list(df[df.split == 'TEST'].label.values)

In [142]:
# grounds_l

In [146]:
grounds_l = list(map(lambda x: x.replace("MajorClaim", "major claim"), grounds_l))
grounds_l = list(map(lambda x: x.replace("Claim", "claim"), grounds_l))
grounds_l = list(map(lambda x: x.replace("Premise", "premise"), grounds_l))

In [149]:
print(classification_report(grounds_l, predictions_l, digits=3))

              precision    recall  f1-score   support

       claim      0.383     0.454     0.416       304
 major claim      0.564     0.490     0.524       153
     premise      0.814     0.778     0.795       809

    accuracy                          0.665      1266
   macro avg      0.587     0.574     0.578      1266
weighted avg      0.680     0.665     0.671      1266

