In [2]:
# Allowed label values for every protected class. The classifier prompt embeds
# this mapping verbatim (via json.dumps), so key order and spelling matter.
protected_classes_map = {
    "race/color": [
        "White",
        "Black_or_African_American",
        "Asian",
        "Native_American",
        "Pacific_Islander",
        "Mixed_Race",
    ],
    "religion": [
        "Christianity",
        "Islam",
        "Judaism",
        "Hinduism",
        "Buddhism",
        "Atheism",
    ],
    "sex/gender_identity": [
        "Male",
        "Female",
        "Non-binary",
        "Transgender_Man",
        "Transgender_Woman",
    ],
    "sexual_orientation": [
        "Heterosexual",
        "Homosexual",
        "Bisexual",
        "Asexual",
    ],
    "national_origin": [
        "United_States",
        "Mexico",
        "Canada",
        "United_Kingdom",
        "China",
        "India",
        "Nigeria",
        "Iran",
        "Other",
    ],
    "age": [
        "Child",
        "Teen",
        "Young Adult",
        "Adult",
        "Senior",
    ],
    "disability": [
        "Physical Disability",
        "Cognitive Disability",
        "Mental Health Condition",
        "Visual Impairment",
        "Hearing Impairment",
        "None",
    ],
}

In [6]:
import json

# Build one Batch API request per story and write them out as a JSONL file
# (one JSON object per line).

# Read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open("json_data/stories_to_classes.json", encoding="utf-8") as f:
    prompts = json.load(f)

# The system prompt is identical for every request, so build it (and serialize
# the option map) once instead of on every loop iteration.
system_prompt = f"You are to take stories and return just a json map labeling the protected classes in the story. Do not add any text besides the map. The map of options is here: {json.dumps(protected_classes_map)}. Your output will be a map of strings to strings, and an option must be picked for every class from those given."

# A comprehension keeps the loop variables out of the notebook's global
# namespace; the previous loop variable `id` shadowed the builtin `id()` for
# every later cell.
batch_requests = [
    {
        "custom_id": story_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini-2024-07-18",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": body["story"]},
            ],
            "max_tokens": 300,
            # NOTE(review): 0.7 makes the labels non-deterministic between runs;
            # confirm this is intended for a classification task.
            "temperature": 0.7,
        },
    }
    for story_id, body in prompts.items()
]

with open("json_data/batch_requests_stories_to_class.jsonl", "w", encoding="utf-8") as f:
    for obj in batch_requests:
        f.write(json.dumps(obj) + '\n')

In [7]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Upload the JSONL request file and start a batch job over it.

# Make OPENAI_API_KEY from a local .env file visible via os.getenv.
load_dotenv()

# Pass the key to the client instance explicitly. Assigning it to the
# `OpenAI` class attribute does not configure client instances; when the
# value is None the client still falls back to the OPENAI_API_KEY env var.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Use a context manager so the upload's file handle is closed deterministically
# rather than left open for the lifetime of the kernel.
with open("json_data/batch_requests_stories_to_class.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch_input_file_id = batch_input_file.id

job_info = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "labeling from stories batch"
    }
)

# The printed Batch object contains the id needed to poll and fetch results.
print(job_info)

Batch(id='batch_67631214f4d08190a5178c2188f6dafe', completion_window='24h', created_at=1734545941, endpoint='/v1/chat/completions', input_file_id='file-XJQWG7b9f5xmgGJ6uJAWTh', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1734632341, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'labeling from stories batch'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [25]:
# Batch job id, copied by hand from the Batch(...) output printed when the job
# was created above.
BATCH_ID = "batch_67631214f4d08190a5178c2188f6dafe"

from openai import OpenAI
client = OpenAI()

# Fetch the job's current state and report only its status field.
current_status = client.batches.retrieve(BATCH_ID).status
print("Job Status:", current_status)

Job Status: completed


In [27]:
import json
import re

def fix_string_err(input_str):
    """Parse JSON from a model reply that may be wrapped in a Markdown code fence.

    Several progressively more lenient candidates are tried in order:

    1. the stripped input as-is,
    2. the input with all code-fence markers removed (``` with an optional,
       case-insensitive ``json`` tag, regardless of the newline style),
    3. the text between the first ``{`` and the last ``}``, which drops any
       prose the model added around the JSON object.

    Args:
        input_str: Raw text content returned by the model.

    Returns:
        The decoded JSON value from the first candidate that parses.

    Raises:
        json.JSONDecodeError: If none of the candidates is valid JSON.
    """
    stripped = input_str.strip()
    # Remove fence markers wherever they appear; this covers "```json", "```JSON",
    # bare "```", and CRLF line endings that an exact "\n"-anchored pattern misses.
    unfenced = re.sub(r"```(?:json)?", "", stripped, flags=re.IGNORECASE).strip()

    candidates = [stripped, unfenced]
    first_brace = unfenced.find("{")
    last_brace = unfenced.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(unfenced[first_brace:last_brace + 1])

    last_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as err:
            last_error = err
    # Every candidate failed: surface the most lenient attempt's error.
    raise last_error


In [29]:
from openai import OpenAI
import json

# Merge the batch results back into the stories and save the labeled dataset.
client = OpenAI()

# Reload the original stories so each one can be annotated with its labels.
with open("json_data/stories_to_classes.json", "r", encoding="utf-8") as f:
    prompts = json.load(f)

# A batch has no output file until it has produced results; fail with a clear
# message instead of handing None to the files API.
output_file_id = client.batches.retrieve(BATCH_ID).output_file_id
if output_file_id is None:
    raise RuntimeError(
        f"Batch {BATCH_ID} has no output file; check its status and error file."
    )

file_response = client.files.content(output_file_id)

errors = 0
# The output is JSONL: one JSON object per request, keyed by custom_id.
for line in file_response.text.split('\n'):
    if not line:
        continue

    response = json.loads(line)
    req_id = response["custom_id"]

    # Extract the model's reply once. A request without a usable response body
    # is recorded as an error instead of crashing the whole merge.
    try:
        content = response["response"]["body"]["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        content = None

    if content is None:
        prompts[req_id]["classification"] = None
        errors += 1
        continue

    # Catch only decode failures (JSONDecodeError is a ValueError) so that
    # KeyboardInterrupt and genuine bugs are not silently swallowed.
    try:
        classification = json.loads(content)
    except ValueError:
        try:
            # Second chance: strip Markdown code fences around the JSON.
            classification = fix_string_err(content)
        except ValueError:
            # Keep the raw text so the failure can be inspected later.
            classification = content
            errors += 1

    prompts[req_id]["classification"] = classification

print("JSON Errors:", errors)

with open("json_data/final_stories_to_classes.json", "w", encoding="utf-8") as f:
    json.dump(prompts, f, indent=4)

JSON Errors: 0
