In [7]:
import pandas as pd

# Load the test prompts from a JSON Lines file (one JSON record per line)
# into a DataFrame.
prompts = pd.read_json('fn1.7-test-prompts.jsonl', lines=True)

In [8]:
def make_batch(row, model, temperature):
    """Build one batch request entry for the chat completions endpoint.

    Args:
        row: Row-like object exposing ``name`` (used to build the request id)
            and ``messages`` (a sliceable sequence of chat messages).
        model: Model identifier copied into the request body.
        temperature: Sampling temperature copied into the request body.

    Returns:
        A dict with the keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    request_id = f"request-{row.name}"

    # Every message except the last one is sent to the model.
    # NOTE(review): the dropped message is presumably the reference answer
    # stored with the prompt -- confirm against the prompts file.
    prompt_messages = row.messages[:-1]

    return {
        "custom_id": request_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": prompt_messages,
            "temperature": temperature,
        },
    }

# Settings for this run; they are also embedded in the names of the files
# written for it.
MODEL_NAME = "ft:gpt-4o-mini-2024-07-18:idir-lab:fn1-7-json-existing-smalltrain-mostfes:AVqBDgAW"
PROMPT_NAME = "json-existing"
TEMPERATURE = 0.0

# Common stem shared by the request, id and prediction files of this run.
file_name = f"batch-{MODEL_NAME}-{TEMPERATURE}temperature-{PROMPT_NAME}"

# Build one request per prompt row; the extra positional arguments are
# forwarded to make_batch after the row itself.
batch_prompts = prompts.apply(make_batch, axis=1, args=(MODEL_NAME, TEMPERATURE))

# Write the requests as JSON Lines, one request object per line.
pd.DataFrame(batch_prompts)[0].to_json(
    f"{file_name}.jsonl",
    orient="records",
    lines=True,
)

In [9]:
# Upload batch and run the model
import openai

# No key is passed explicitly: the client falls back to the OPENAI_API_KEY
# environment variable, so no credential is stored in the notebook.
client = openai.Client()

# Upload the request file inside a context manager so the file handle is
# closed as soon as the upload call returns.
with open(f"{file_name}.jsonl", 'rb') as batch_file:
    batch_input_file = client.files.create(file=batch_file, purpose='batch')

# Create the batch job for the uploaded file. The returned object is kept in
# `request`; later cells read its `id` and `created_at`.
request = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": f"Batch for FSP on FrameNet1.7 test set. Model: {MODEL_NAME}, Temperature: {TEMPERATURE}, Prompt: {PROMPT_NAME}"
    }
)

In [10]:
import datetime

# Creation time of the batch, formatted as YYYYMMDD-HHMMSS.
created_at = datetime.datetime.fromtimestamp(request.created_at)
dts = created_at.strftime("%Y%m%d-%H%M%S")

# Persist the batch id next to the request file so that it can be read back
# later to look the batch up again.
with open(f"{file_name}.id", "w") as id_file:
    id_file.write(request.id)

## Once the batch has finished...

In [11]:
import openai
import os

# Read back the batch id saved when the batch was created. Surrounding
# whitespace is stripped in case the file was edited by hand.
with open(f"{file_name}.id", "r") as f:
    request_id = f.read().strip()

# No key is passed explicitly: the client falls back to the OPENAI_API_KEY
# environment variable, so no credential is stored in the notebook.
client = openai.Client()

request = client.batches.retrieve(request_id)

# The predictions file doubles as a cache: it is downloaded only when it is
# not already on disk.
predictions_path = f"{file_name}-predictions.jsonl"
if not os.path.exists(predictions_path):
    # A batch without an output file has nothing to download; fail with a
    # clear message rather than an opaque error from the files API.
    if request.output_file_id is None:
        raise RuntimeError(
            f"Batch {request_id} has no output file yet (status: {request.status})."
        )

    output_file = client.files.retrieve(request.output_file_id)
    output_file_content = client.files.content(output_file.id)

    with open(predictions_path, "wb") as f:
        f.write(output_file_content.content)

KeyboardInterrupt: 

In [None]:
# Extract JSON from each line
import re
import json

def clean_output(output):
    """Extract and parse the fenced ```json block of a model response.

    The text between the first ```json fence and the next ``` fence is parsed
    as strict JSON first and, failing that, as a Python literal (responses
    may use single-quoted, dict-literal style). The text is never executed:
    it comes from a model and must be treated as untrusted input.

    Args:
        output: Raw message content returned by the model.

    Returns:
        The parsed value, or an empty dict when ``output`` is not a string,
        contains no fenced json block, or the block cannot be parsed.
    """
    # Local import so the function does not rely on an import made in
    # another cell.
    import ast

    if not isinstance(output, str):
        return {}

    fenced_block = re.search(r'```json(.*?)```', output, re.DOTALL)
    if fenced_block is None:
        return {}
    payload = fenced_block.group(1).strip()

    # Strict JSON first (handles true/false/null).
    try:
        return json.loads(payload)
    except (ValueError, RecursionError):
        pass

    # Fall back to Python literal syntax. literal_eval only accepts literals,
    # so arbitrary expressions in the response are rejected, not run.
    try:
        return ast.literal_eval(payload)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}

def _extract_prediction(response):
    """Return the cleaned prediction for one batch response, or {} if absent.

    Rows whose response is missing, has no choices, or has non-text content
    yield an empty dict instead of raising.
    """
    try:
        content = response['body']['choices'][0]['message']['content']
    except (TypeError, KeyError, IndexError):
        return {}
    if not isinstance(content, str):
        return {}
    return clean_output(content)


lines = pd.read_json(f"{file_name}-predictions.jsonl", lines=True)

# Batch output lines are not guaranteed to come back in input order, so
# restore it from the numeric suffix of custom_id ("request-<row index>").
lines = lines.sort_values(
    "custom_id",
    key=lambda ids: ids.map(lambda cid: int(str(cid).rsplit("-", 1)[-1])),
).reset_index(drop=True)

cleaned_lines = lines.response.apply(_extract_prediction)

cleaned_lines.to_json(f"{file_name}-predictions-cleaned.jsonl", orient="records", lines=True)

In [None]:
# Show the raw message content of the row labelled 0, to check the format
# that clean_output has to parse.
lines.response[0]['body']['choices'][0]['message']['content']

"### Output: \n```json\n{'Unit': 'December', 'Count': '1998'}\n```\n"