In [1]:
import os
import re
import json
from glob import glob
from tqdm import tqdm
import pandas as pd
import pickle
import getpass
import tiktoken
import random
import numpy as np

# Tokenizer used for token counting. `encoding_for_model('gpt-4')` already
# resolves to the "cl100k_base" encoding, so the separate `get_encoding` call
# that used to precede it was a dead store and has been dropped.
enc = tiktoken.encoding_for_model('gpt-4')

from openai import OpenAI

In [2]:
# Use the OPENAI_API_KEY environment variable when it is set so the notebook can
# be re-run without interaction; otherwise prompt for it (the key is never
# written into the notebook).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API Key: ")
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
def save_input_batch_file(prompts=None, batch_name=None, model='41',
                          output_dir="./finance-legal-mrc", max_requests_per_file=40000):
    """Write chat-completion requests as Batch API input files (JSONL).

    Each prompt becomes one request line whose ``custom_id`` is
    ``f"{batch_name}_{i}"`` (``i`` is the prompt's position in ``prompts``).
    Requests are split into files named ``{batch_name}_{k}.jsonl`` holding at
    most ``max_requests_per_file`` lines each.

    Args:
        prompts: Iterable of ``messages`` payloads, one per request.
        batch_name: Prefix used for both the file names and the custom ids.
        model: Short alias of the model: '4omini', '4o', '41mini' or '41'.
        output_dir: Directory the JSONL files are written to (created if missing).
        max_requests_per_file: Maximum number of requests per JSONL file.

    Returns:
        List of the file paths written, in order. Empty when ``prompts`` is empty.

    Raises:
        ValueError: If ``model`` is not a known alias, ``prompts`` is None, or
            ``max_requests_per_file`` is smaller than 1.
    """
    model_names = {
        '4omini': 'gpt-4o-mini-2024-07-18',
        '4o': 'gpt-4o-2024-11-20',
        '41mini': 'gpt-4.1-mini-2025-04-14',
        '41': 'gpt-4.1-2025-04-14',
    }
    # Fail early with a clear message instead of hitting an unbound local below.
    if model not in model_names:
        raise ValueError(f"Unknown model alias {model!r}; expected one of {sorted(model_names)}")
    if prompts is None:
        raise ValueError("prompts must be an iterable of chat message payloads")
    if max_requests_per_file < 1:
        raise ValueError("max_requests_per_file must be at least 1")
    gpt = model_names[model]

    print('Call ', gpt)
    os.makedirs(output_dir, exist_ok=True)
    written_paths = []

    def _write_chunk(requests):
        # The chunk index is the number of files written so far.
        path = os.path.join(output_dir, f"{batch_name}_{len(written_paths)}.jsonl")
        with open(path, 'w', encoding='utf-8') as jsonl_file:
            for item in requests:
                jsonl_file.write(json.dumps(item) + '\n')
        written_paths.append(path)

    batch_list = []
    for i, prompt in tqdm(enumerate(prompts)):
        batch_list.append({"custom_id": f"{batch_name}_{i}",
                           "method": "POST",
                           "url": "/v1/chat/completions",
                           "body": {"model": gpt,
                                    "messages": prompt,
                                    "max_tokens": 1024,
                                    "temperature": 1.0,
                                    "top_p": 1,
                                    "frequency_penalty": 0, "presence_penalty": 0,
                                    }})

        if len(batch_list) >= max_requests_per_file:
            _write_chunk(batch_list)
            batch_list = []

    # Only write the remainder when there is one; an empty JSONL file is not a
    # usable batch input.
    if batch_list:
        _write_chunk(batch_list)

    return written_paths

In [4]:
def run_batch_api(client, batch_files, batch_info_path):
    """Upload each batch input file, create a batch job for it and record the ids.

    For every path in ``batch_files`` the file is uploaded with purpose
    ``"batch"`` and a chat-completions batch with a 24h completion window is
    created. The resulting ids are stored in
    ``<batch_info_path>/batch_info.json`` under the file's base name (text
    before the first "."). Existing entries in that file are kept and
    overwritten only for names submitted again.

    Args:
        client: API client exposing ``files.create`` and ``batches.create``.
        batch_files: Sized sequence of JSONL input file paths.
        batch_info_path: Directory containing (or receiving) ``batch_info.json``.

    Returns:
        The full mapping ``{name: {'input_file_id': ..., 'batch_api_obj_id': ...}}``.
    """
    batch_info_file = os.path.join(batch_info_path, "batch_info.json")

    # Load existing batch info if it exists
    batch_dict = {}
    if os.path.exists(batch_info_file):
        with open(batch_info_file, 'r', encoding='utf-8') as f:
            batch_dict = json.load(f)

    def _save_batch_info():
        with open(batch_info_file, 'w', encoding='utf-8') as f:
            json.dump(batch_dict, f)

    for batch_name in tqdm(batch_files, total=len(batch_files)):
        tmp = os.path.basename(batch_name).split(".")[0]

        # Close the handle as soon as the upload returns.
        with open(batch_name, "rb") as input_file:
            batch_input_file = client.files.create(file=input_file, purpose="batch")

        batch_input_file_id = batch_input_file.id
        batch_obj = client.batches.create(
            input_file_id=batch_input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "cid": tmp
            }
        )

        # Update or add new batch info
        batch_dict[tmp] = {
            'input_file_id': batch_input_file_id,
            'batch_api_obj_id': batch_obj.id
        }
        # Persist right away: if a later submission fails, the ids of the
        # batches already created are not lost.
        _save_batch_info()

    # Also write when nothing was submitted, so the file always exists afterwards.
    _save_batch_info()

    return batch_dict

In [5]:
def batch_api_update(batch_info_path, client):
    """Refresh the status of every recorded batch and store output file ids.

    Reads ``<batch_info_path>/batch_info.json``, retrieves each batch via
    ``client.batches.retrieve`` and prints its status. For completed batches
    the ``output_file_id`` is recorded unless one is already stored; for all
    other batches (including ones whose status could not be retrieved) the
    entry gets ``output_file_id = None`` if it has no such key yet. The
    updated mapping is written back to the same file. "RUN COMPLTED" is
    printed only when every batch was retrieved and reported ``completed``.

    Args:
        batch_info_path: Directory containing ``batch_info.json``.
        client: API client exposing ``batches.retrieve``.

    Raises:
        FileNotFoundError: If ``batch_info.json`` does not exist.
    """
    batch_info_file = os.path.join(batch_info_path, "batch_info.json")
    if not os.path.exists(batch_info_file):
        raise FileNotFoundError(f"No batch info file found at {batch_info_file}")

    with open(batch_info_file, "r", encoding="utf-8") as file:
        batch_dict = json.load(file)

    c = 0  # number of batches that are not (known to be) completed
    for k, info in batch_dict.items():
        try:
            # One retrieval gives both the status and the output file id.
            batch = client.batches.retrieve(info['batch_api_obj_id'])
        except Exception as exc:
            # Report the failure and count the batch as unfinished instead of
            # silently ignoring it.
            print(k, f" status check failed: {exc!r}")
            c += 1
            info.setdefault('output_file_id', None)
            continue

        status = batch.status
        if status == 'completed':
            print(k, " is completed")
            # Only update output_file_id if it's not already set
            if info.get('output_file_id') is None:
                info['output_file_id'] = batch.output_file_id
        else:
            print(k, f" is {status}")
            c += 1
            # Only set output_file_id to None if it's not already set
            info.setdefault('output_file_id', None)

    with open(batch_info_file, 'w', encoding="utf-8") as f:
        json.dump(batch_dict, f)

    if c == 0: print("RUN COMPLTED")

### Run Batch API

In [6]:
prompt_path = './finfairnessQAgen_prompt.jsonl'
prompts = []
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open(prompt_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if line:
            prompts.append(json.loads(line))

In [7]:
# Write the loaded prompts as batch input file(s) using the default model alias.
save_input_batch_file(
    prompts=prompts,
    batch_name='finfairnessqa_sampled',
)

Call  gpt-4.1-2025-04-14


3582it [00:00, 842293.94it/s]


In [8]:
# glob returns matches in arbitrary order; sort so the submission order is deterministic.
batch_files = sorted(glob("./finance-legal-mrc/*finfairnessqa_sampled*.jsonl"))
print(batch_files)

['./finance-legal-mrc/finfairnessqa_sampled_0.jsonl']


In [9]:
# Directory holding the batch input files and batch_info.json.
batch_info_path = "./finance-legal-mrc"
# Uploads every file in batch_files and creates a new batch job for each one.
# NOTE: re-running this cell submits the same requests again.
run_batch_api(client, batch_files, batch_info_path)

100%|██████████| 1/1 [00:03<00:00,  3.27s/it]


{'finfairnessqa_sampled_0': {'input_file_id': 'file-F8SD9VPNKsGAc6RsZNGpoV',
  'batch_api_obj_id': 'batch_683e848261688190aecd345435afd51f'}}

In [10]:
# Print the current status of every batch recorded in batch_info.json and
# store the output file id of the completed ones.
batch_api_update(batch_info_path, client)

finfairnessqa_sampled_0  is validating


## Call Response

In [6]:
# Directory holding batch_info.json.
batch_info_path = "./finance-legal-mrc"
# Re-check batch statuses; re-run this cell until it prints "RUN COMPLTED".
batch_api_update(batch_info_path, client)

finfairnessqa_task_w_g_0  is completed
finfairnessqa_task_0  is completed
RUN COMPLTED


In [7]:
def load_output_files(output_file_id):
    """Download a batch output file and return the generated message contents.

    Each non-blank line of the output file is parsed as JSON and the text at
    ``response.body.choices[0].message.content`` is collected. The Batch API
    does not guarantee that output lines come back in input order, so results
    are reordered by the integer suffix of ``custom_id`` (the text after the
    last "_"). If any line's ``custom_id`` has no integer suffix, the file
    order is kept unchanged.

    Uses the module-level ``client``.

    Args:
        output_file_id: Id of the batch output file to download.

    Returns:
        List of message contents, one per output line. Requests that are
        absent from the output file have no entry, so the list can be shorter
        than the list of submitted prompts.
    """
    output_response = client.files.content(output_file_id)

    indexed = []  # (request index or None, content) in file order
    for _, r in tqdm(enumerate(output_response.iter_lines())):
        if not r.strip():
            continue
        res = json.loads(r)
        content = res['response']['body']['choices'][0]['message']['content']
        try:
            request_index = int(str(res.get('custom_id', '')).rsplit('_', 1)[-1])
        except ValueError:
            request_index = None
        indexed.append((request_index, content))

    # Restore input order only when every line carries a usable index.
    if all(index is not None for index, _ in indexed):
        indexed.sort(key=lambda pair: pair[0])  # stable for equal indices

    return [content for _, content in indexed]

In [8]:
with open(os.path.join(batch_info_path, "batch_info.json"), 'r', encoding='utf-8') as f:
    batch_list = json.load(f)
# Show only the batches that already have an output file. `.get` is used because
# entries written at submission time have no 'output_file_id' key until a status
# update has run.
{k: v for k, v in batch_list.items() if v.get('output_file_id') is not None}

{'finfairnessqa_task_w_g_0': {'input_file_id': 'file-FUszKbCnrR6ayEv3NU2svB',
  'batch_api_obj_id': 'batch_68430ca646988190bef5f0284ee445cc',
  'output_file_id': 'file-H2yJLg6G46VYsnaxKu6DCj'},
 'finfairnessqa_task_0': {'input_file_id': 'file-XTDLb9zwZokCQuQEQzAk1h',
  'batch_api_obj_id': 'batch_68430ca7739c8190839a23fa85e53440',
  'output_file_id': 'file-964uctD1EAX3jgAe2JJo5N'}}

In [9]:
prompt_title = 'finfairnessqa_sampled'
prompt_path = './finfairnessQAgen_prompt.jsonl'
input_prompts = []
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open(prompt_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if line:
            input_prompts.append(json.loads(line))

# Fail with a clear message when the batch has no output file yet, instead of
# passing None on to the file download.
output_file_id = batch_list[f'{prompt_title}_0'].get('output_file_id')
if output_file_id is None:
    raise RuntimeError(f"Batch '{prompt_title}_0' has no output file yet; update the batch status first")

preds = load_output_files(output_file_id)

3582it [00:00, 61353.74it/s]


In [23]:
count = 1
preds_new = []
with open('generted_qa.txt', 'w', encoding='utf-8') as f:
    for pred in preds:
        fragments = pred.split("질문:")
        if len(fragments) >= 3:
            # Two or more "질문:" markers: the response holds several questions,
            # so store each fragment as its own entry.
            for p in fragments:
                # Skip fragments with no text. This also covers whitespace-only
                # text before the first marker, which would otherwise be
                # written out as an empty question.
                if not p.strip(): continue
                f.write(f"{count}. 질문: {p}\n=====\n")
                count += 1
                preds_new.append(f'질문: {p}')
        else:
            # At most one marker: keep the response as a single entry.
            f.write(f"{count}. {pred}\n=====\n")
            count += 1
            preds_new.append(pred)

In [56]:
import pickle

# Index every generated entry by its position in preds_new.
gened_qa = dict(enumerate(preds_new))

# Save the indexed entries as JSON, keeping non-ASCII text unescaped.
with open('gened_qa.json', 'w', encoding='utf-8') as f:
    json.dump(gened_qa, f, ensure_ascii=False)

In [35]:
# For each entry keep only the text before the first "\n답변", then replace
# double spaces with a single space and drop the space after a period.
selected_qa = [
    res.split("\n답변")[0].replace("  ", " ").replace(". ", ".")
    for res in preds_new
]

In [44]:
# Index the selected questions by position. The bare `selected_qa` expression
# that used to open this cell displayed nothing (only a cell's last expression
# is shown) and has been dropped.
selected_qa_dict = dict(enumerate(selected_qa))

In [47]:
# Write the indexed questions to JSON; ensure_ascii=False keeps non-ASCII text
# readable in the file instead of \u escapes.
with open('selected_qa_dict.json', 'w', encoding='utf-8') as f:
    json.dump(selected_qa_dict, f, ensure_ascii=False)

## selected_qa_dict.json은 gpt_api.py에서 LLM에게 입력되어 특정 집단에 대한 편향성을 확인하기에 적절한 질문들만 선별하는 과정을 거침
## 결과는 biasQA_filtered_qa.json에 저장됨 => load_hf_dataset_n_query_prompt_gen.ipynb에서 불러와짐