In [None]:
import json
import os

import pandas as pd

from model import *
from prompt import *
from batch import *
from generate_json import *

### Read file and generate prompts

In [None]:
# Input CSV; every derived directory and output name below is based on it.
filepath = 'check_papers.csv'

# os.path.splitext strips only the final extension, so paths that contain other
# dots (e.g. './check_papers.csv' or 'papers.v2.csv') keep their full stem.
filename = os.path.splitext(filepath)[0]
df = pd.read_csv(filepath)
# Build one prompt per row; create_check_prompt comes from the star imports above.
df['prompt'] = df.apply(create_check_prompt, axis=1)
print(len(df))
df.head(5)

### Generate chunked JSONL files for batch input

In [None]:
def process_df_in_chunks(df, model_name, custom_id_column, prompt_column, sav_dir, save_filename, chunk_size):
    """Write ``df`` out as a series of JSONL files of at most ``chunk_size`` rows each.

    Consecutive row slices of ``df`` are handed to ``generate_jsonl`` together with
    the target path ``<sav_dir>/<save_filename>_<k>.jsonl``, where ``k`` counts the
    chunks starting at 1. An empty ``df`` produces no files.

    Args:
        df: DataFrame to split.
        model_name: Forwarded unchanged to ``generate_jsonl``.
        custom_id_column: Forwarded unchanged to ``generate_jsonl``.
        prompt_column: Forwarded unchanged to ``generate_jsonl``.
        sav_dir: Directory the chunk files are written into.
        save_filename: Base name shared by all chunk files.
        chunk_size: Maximum number of rows per chunk; must be at least 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A zero size would divide by zero and a negative one would silently write
    # nothing, so reject both up front with a clear message.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Stepping the range by chunk_size yields each chunk's start row; iloc clamps
    # the final slice to the end of the frame.
    for chunk_number, start in enumerate(range(0, len(df), chunk_size), start=1):
        chunk = df.iloc[start:start + chunk_size]
        save_file = os.path.join(sav_dir, f'{save_filename}_{chunk_number}.jsonl')
        generate_jsonl(chunk, model_name, custom_id_column, prompt_column, save_file)
        print(f"Generated {save_file}")

# Chunk files go into a directory named after the input file's stem and share
# that stem as their base name.
sav_dir = filename
save_filename = filename
# exist_ok=True already makes this a no-op for an existing directory, so a
# separate os.path.exists check is redundant (and racy).
os.makedirs(sav_dir, exist_ok=True)

model_name = "gpt-4o"
custom_id_column = 'id'
prompt_column = 'prompt'
# 200 rows per JSONL file.
process_df_in_chunks(df, model_name, custom_id_column, prompt_column, sav_dir, save_filename, chunk_size=200)

### Submit batch input to GPT and save responses

In [None]:
# get_openai_key, SECRET_FILE and OpenAIBatchProcessor come from the star imports.
openai_key = get_openai_key(SECRET_FILE)
processor = OpenAIBatchProcessor(openai_key)

endpoint = "/v1/chat/completions"
completion_window = "24h"

# Chunk files are read from <stem>/ and results are written to <stem>_response/.
read_dir = filename
save_dir = filename + '_response'

# exist_ok=True already tolerates an existing directory; no prior existence
# check is needed.
os.makedirs(save_dir, exist_ok=True)

# Match the real '.jsonl' extension (not just any name ending in 'jsonl') and
# order by the numeric suffix so that chunk 10 follows chunk 9, not chunk 1.
read_files = [file for file in os.listdir(read_dir) if file.endswith('.jsonl')]
read_files = sorted(read_files, key=lambda x: int(x.split('_')[-1].split('.')[0]))

for file in read_files:
    read_file = os.path.join(read_dir, file)
    # Each result file keeps the name of the chunk file it came from.
    save_file = os.path.join(save_dir, file)
    results = processor.process_batch(read_file, endpoint, completion_window, save_file)
    print(f"Processed {read_file} and saved to {save_file}")

### Extract content

In [None]:
# Response files, ordered by the numeric chunk suffix in their names.
files = [file for file in os.listdir(save_dir) if file.endswith('.jsonl')]
files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))

data = []
skipped = 0  # records that carry no usable completion

for file in files:
    read_file = os.path.join(save_dir, file)
    with open(read_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not JSON records.
                continue
            json_obj = json.loads(line)
            try:
                custom_id = json_obj['custom_id']
                content = json_obj['response']['body']['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError):
                # KeyError: a field is missing; IndexError: 'choices' is empty;
                # TypeError: one level is null (e.g. "response": null).
                skipped += 1
                continue
            data.append({'custom_id': custom_id, 'content': content})

if skipped:
    # Report dropped records instead of discarding them silently.
    print(f"Skipped {skipped} record(s) without a usable completion")

# Explicit columns keep the frame well-formed even when no record was usable;
# otherwise drop_duplicates would fail on the missing 'custom_id' column.
# Note: this rebinds `df`, replacing the prompt DataFrame loaded earlier.
df = pd.DataFrame(data, columns=['custom_id', 'content'])
df = df.drop_duplicates(subset=['custom_id'])
df.head(5)

In [None]:
# Re-read the source rows (`df` now holds the extracted responses) and keep one
# row per id so the join cannot multiply response rows.
df1 = pd.read_csv(filepath)
df1 = df1.drop_duplicates(subset=['id'])

# Join on string-normalised keys: pandas refuses to merge an object column with
# a numeric one, which happens if custom_id comes back as text while read_csv
# parses `id` as a number.
# NOTE(review): confirm how generate_jsonl serialises the id column.
# The temporary key leaves the original `id` column and its dtype untouched.
df2 = (
    df.assign(custom_id=df['custom_id'].astype(str))
    .merge(
        df1.assign(_merge_key=df1['id'].astype(str)),
        left_on='custom_id',
        right_on='_merge_key',
        how='left',
    )
    .drop(columns=['custom_id', '_merge_key'])
)

In [None]:
# The result CSV is named after the response directory.
output_filepath = f'{save_dir}.csv'

# Store the model's answer under the name 'possible' and move it to the last column.
df2 = df2.rename(columns={'content': 'possible'})
cols = list(df2.columns)
cols.remove('possible')
cols.append('possible')
df2 = df2[cols]
df2.to_csv(output_filepath, index=False)