In [1]:
import pandas as pd

# Source table of the test prompts; each row later becomes one batch request.
PROMPTS_CSV = 'fn1.7-test-prompts.csv'
prompts = pd.read_csv(PROMPTS_CSV)

In [2]:
def make_batch(row, model, temperature):
    """Build one chat-completions batch request for a single prompt row.

    The prompts table is noted to carry the columns input_sentence,
    frame_name, frame_elements, prompt and output; only ``prompt`` and the
    row's index label (``row.name``) are read here.

    Args:
        row: One row of the prompts table.
        model: Model identifier placed in the request body.
        temperature: Sampling temperature placed in the request body.

    Returns:
        A dict with the keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    # The full prompt text is sent as a single system message.
    chat_request = dict(
        model=model,
        messages=[dict(role="system", content=row.prompt)],
        temperature=temperature,
    )
    # The custom id embeds the row's index label so results can be traced back.
    return dict(
        custom_id="request-{}".format(row.name),
        method="POST",
        url="/v1/chat/completions",
        body=chat_request,
    )

# Experiment configuration; these values also determine every output file name.
MODEL_NAME = "gpt-4o"
PROMPT_NAME = "json-existing"
TEMPERATURE = 0.0

# Shared file stem: the batch input, the saved batch id and the predictions
# are all written next to each other under this name.
file_name = f"batch-{MODEL_NAME}-{TEMPERATURE}temperature-{PROMPT_NAME}"

# Row-wise apply with a dict-returning function yields a Series holding one
# request dict per prompt row.
batch_prompts = prompts.apply(lambda x: make_batch(x, MODEL_NAME, TEMPERATURE), axis=1)

# Write the Series directly as JSONL (one request object per line). The former
# round trip through pd.DataFrame(...)[0] produced the same Series but relied
# on the implicit column label 0.
batch_prompts.to_json(f"{file_name}.jsonl", orient="records", lines=True)

In [3]:
# Upload batch and run the model
import openai

# With no api_key argument the client reads the OPENAI_API_KEY environment
# variable, so no secret has to be stored in the notebook.
client = openai.Client()

# Upload the JSONL request file; the context manager closes the file handle
# once the upload call has returned.
with open(f"{file_name}.jsonl", 'rb') as batch_file:
    batch_input_file = client.files.create(file=batch_file, purpose='batch')

# Create the batch job over the uploaded file. The description records the
# experiment settings so the batch can be identified later.
request = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": f"Batch for FSP on FrameNet1.7 test set. Model: {MODEL_NAME}, Temperature: {TEMPERATURE}, Prompt: {PROMPT_NAME}"
    }
)

In [4]:
import datetime
# Creation time of the batch as a compact local-time stamp (YYYYMMDD-HHMMSS).
created_at = datetime.datetime.fromtimestamp(request.created_at)
dts = created_at.strftime("%Y%m%d-%H%M%S")

# Persist the batch id so a later session can look the batch up again.
id_path = f"{file_name}.id"
with open(id_path, "w") as id_file:
    id_file.write(request.id)

Bad pipe message: %s [b'(\xce\xa6\xcc\x94\xd3\x8ep3\xdeZ\xb0fv`!T\x05\x00\x01|\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\t\x00\n\x00\x0b\x00\x0c\x00\r\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00\x14\x00\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00/\x000\x001\x002\x003\x004\x005\x006\x007\x008\x009\x00:\x00;\x00<\x00']
Bad pipe message: %s [b'>\x00?\x00@\x00A\x00B\x00C\x00D\x00E\x00F\x00g\x00h\x00i\x00j\x00k\x00l\x00m\x00\x84\x00\x85\x00\x86\x00\x87\x00\x88\x00\x89\x00\x96\x00\x97\x00\x98\x00\x99\x00\x9a\x00\x9b\x00\x9c\x00\x9d\x00']
Bad pipe message: %s [b"\x9f\x00\xa0\x00\xa1\x00\xa2\x00\xa3\x00\xa4\x00\xa5\x00\xa6\x00\xa7\x00\xba\x00\xbb\x00\xbc\x00\xbd\x00\xbe\x00\xbf\x00\xc0\x00\xc1\x00\xc2\x00\xc3\x00\xc4\x00\xc5\x13\x01\x13\x02\x13\x03\x13\x04\x13\x05\xc0\x01\xc0\x02\xc0\x03\xc0\x04\xc0\x05\xc0\x06\xc0\x07\xc0\x08\xc0\t\xc0\n\xc0\x0b\xc0\x0c\xc0\r\xc0\x0e\xc0\x0f\xc0\x10\xc0\x11\xc0\x12\xc0\x13\xc0\x14\xc0\x15\xc0\x16\xc0\x17\x

## Once the batch has finished...

In [5]:
import openai
import os

# Recover the batch id saved at submission time; strip() tolerates a trailing
# newline in case the file was edited by hand.
with open(f"{file_name}.id", "r") as f:
    request_id = f.read().strip()

# With no api_key argument the client reads the OPENAI_API_KEY environment
# variable, so no secret has to be stored in the notebook.
client = openai.Client()

request = client.batches.retrieve(request_id)

# Download the predictions only once; an existing local copy is reused.
predictions_path = f"{file_name}-predictions.jsonl"
if not os.path.exists(predictions_path):
    # A batch without an output file (e.g. still running) cannot be
    # downloaded; fail with a clear message instead of an opaque client error.
    if request.output_file_id is None:
        raise RuntimeError(
            f"Batch {request_id} has no output file yet (status: {request.status})."
        )

    output_file = client.files.retrieve(request.output_file_id)
    output_file_content = client.files.content(output_file.id)

    with open(predictions_path, "wb") as f:
        f.write(output_file_content.content)

In [None]:
# Extract JSON from each line
import re
import json

def clean_output(output):
    """Extract and parse the JSON payload from one model reply.

    The text inside the first Markdown code fence opened with ```json is
    parsed as JSON.

    Args:
        output: Raw message content returned by the model.

    Returns:
        The parsed JSON value, or an empty dict when ``output`` is not a
        string, contains no ```json fence, or the fenced text is not valid
        JSON.
    """
    # A missing reply (e.g. null content) is treated like any other
    # unparseable output instead of raising inside re.search.
    if not isinstance(output, str):
        return {}

    # Non-greedy match with DOTALL: capture the first fenced block, which may
    # span several lines.
    fenced = re.search(r'```json(.*?)```', output, re.DOTALL)
    if fenced is None:
        # NOTE(review): a reply holding bare, unfenced JSON is discarded here,
        # exactly as before; confirm that this is intended.
        return {}

    try:
        return json.loads(fenced.group(1).strip())
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass. Only parse failures
        # are swallowed, so KeyboardInterrupt and SystemExit still propagate.
        return {}

def _request_number(custom_ids):
    """Map "request-<n>" ids to their numeric part so they sort numerically."""
    return pd.to_numeric(custom_ids.str.replace("request-", "", regex=False), errors="coerce")


def _message_content(response):
    """Return the assistant message text of one batch response, or "" if it is absent."""
    try:
        return response["body"]["choices"][0]["message"]["content"] or ""
    except (KeyError, IndexError, TypeError):
        return ""


# The Batch API does not guarantee that output lines keep the input order, and
# custom_id is not carried into the cleaned file. Restore the request order
# first so that line i of the cleaned file belongs to request i.
lines = pd.read_json(f"{file_name}-predictions.jsonl", lines=True).sort_values(
    "custom_id", key=_request_number, kind="stable", ignore_index=True
)

# A response without message text is passed on as "" and therefore cleaned to
# an empty dict, like any other unparseable reply.
cleaned_lines = lines.response.apply(lambda x: clean_output(_message_content(x)))

cleaned_lines.to_json(f"{file_name}-predictions-cleaned.jsonl", orient="records", lines=True)

Bad pipe message: %s [b'\x966l\xc5uC\x01\xbd/0\x82GSE\xffG$\x8f\x00\x01|\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\t\x00\n\x00\x0b\x00\x0c\x00\r\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00\x14\x00\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00/\x000\x001\x002\x003\x004\x005\x006\x007\x008\x009\x00:\x00;\x00<\x00=\x00>\x00?\x00@\x00A\x00B\x00C\x00D\x00E\x00F\x00g\x00h\x00i\x00j\x00k\x00l\x00m\x00\x84\x00\x85\x00\x86\x00\x87\x00\x88\x00\x89\x00\x96\x00\x97\x00\x98\x00\x99\x00\x9a\x00\x9b\x00\x9c\x00\x9d\x00\x9e\x00\x9f\x00\xa0\x00\xa1\x00\xa2\x00\xa3\x00\xa4\x00\xa5\x00\xa6\x00\xa7\x00\xba\x00\xbb\x00\xbc\x00\xbd\x00\xbe\x00\xbf\x00\xc0\x00', b"\xc2\x00\xc3\x00\xc4\x00\xc5\x13\x01\x13\x02\x13\x03\x13\x04\x13\x05\xc0\x01\xc0\x02\xc0\x03\xc0\x04\xc0\x05\xc0\x06\xc0\x07\xc0\x08\xc0\t\xc0\n\xc0\x0b\xc0\x0c\xc0\r\xc0\x0e\xc0\x0f\xc0\x10\xc0\x11\xc0\x12\xc0\x13\xc0\x14\xc0\x15\xc0\x16\xc0\x17\xc0\x18\xc0\x19\xc0#\xc0$\xc0%\xc0&\xc0'\xc0(\xc0)\