In [None]:
# rename openai output files
import os
import shutil

# tmp.txt is read as alternating lines: even-indexed lines become
# `input_filenames`, odd-indexed lines become `output_filenames`.
# Presumably each pair is (name we want, name OpenAI produced) -- verify
# against whatever wrote tmp.txt.
with open('tmp.txt') as fin:
    lines = [line.strip() for line in fin]
print(len(lines))
input_filenames = lines[::2]
output_filenames = lines[1::2]
# An odd number of lines means one filename is missing its partner.
assert len(input_filenames) == len(output_filenames)
# current on-disk name (output filename) -> name it should be renamed to
mapping = {o: i for i, o in zip(input_filenames, output_filenames)}

openai_output_dir = './output/openai_output_w_settings'
filenames = os.listdir(openai_output_dir)

# Validate before touching the disk so a bad entry cannot leave the directory
# half-renamed. Files that already carry a target name are tolerated, which
# makes this cell safe to re-run after a complete or partial rename.
already_renamed = set(mapping.values())
unknown = [
    name for name in filenames
    if name not in mapping and name not in already_renamed
]
if unknown:
    raise KeyError(f'no rename mapping for: {unknown}')

for old_name in filenames:
    if old_name not in mapping:
        # Already renamed by a previous run of this cell.
        continue
    shutil.move(
        os.path.join(openai_output_dir, old_name),
        os.path.join(openai_output_dir, mapping[old_name]),
    )

In [None]:
# merge chunked openai output files
from collections import defaultdict
merged_output_dir = './output/openai_output_w_settings_merged'
os.makedirs(merged_output_dir, exist_ok=True)


def _chunk_sort_key(filename):
    """Sort key that orders chunk files by their trailing `_<suffix>` segment.

    The suffix is compared numerically when it is an integer (so `_10` sorts
    after `_2`); otherwise files fall back to plain filename order.
    """
    suffix = os.path.splitext(filename.split('_')[-1])[0]
    try:
        return (0, int(suffix), filename)
    except ValueError:
        return (1, 0, filename)


# Group chunk files by everything before the last underscore-separated segment.
storing = defaultdict(list)
for filename in os.listdir(openai_output_dir):
    prefix = '_'.join(filename.split('_')[:-1])
    storing[prefix].append(filename)

for prefix, file_list in storing.items():
    print(prefix)
    output_filename = os.path.join(merged_output_dir, f'{prefix}.jsonl')
    with open(output_filename, 'w') as fout:
        # os.listdir returns names in arbitrary order; sort so the merged file
        # is reproducible across runs and machines.
        for file in sorted(file_list, key=_chunk_sort_key):
            with open(os.path.join(openai_output_dir, file), 'r') as fin:
                for line in fin:
                    # A chunk whose last line has no trailing newline would
                    # otherwise fuse with the first record of the next chunk.
                    fout.write(line if line.endswith('\n') else line + '\n')

In [3]:
# reformat captioning results
import os
import json
import glob

# Configuration for the caption *generation* task.
base_input_dir = './output/openai_output_w_settings_merged'
k = 3  # only used to name the output sub-directory (`k_{k}`)
task = 'generation'
output_dir = f'./output/image_caption_{task}'

base_data_dir = '/home/ubuntu/MMSci/mmsci-data/benchmark/test/'
# Use a context manager so the file handle is closed deterministically.
with open(os.path.join(base_data_dir, f'image_caption_{task}_data.json')) as fin:
    input_data = json.load(fin)
# Positional index in the benchmark file -> benchmark item.
input_data_mapping = dict(enumerate(input_data))

def reformat_caption_generation(input_filepath, output_filepath, data_mapping=None):
    """Attach model predictions from a batch-output JSONL file to benchmark items.

    Each non-blank line of `input_filepath` is parsed as JSON. Its `custom_id`
    is converted to an int and used as the key into `data_mapping`; the
    `message.content` of every entry in `response.body.choices` is collected
    into a list stored under the `'prediction'` key of a copy of that item.
    The resulting list is written to `output_filepath` as indented JSON.

    Args:
        input_filepath: Path to the JSONL file to read.
        output_filepath: Path of the JSON file to write. Missing parent
            directories are created.
        data_mapping: Mapping from int index to benchmark item (a dict).
            Defaults to the notebook-level `input_data_mapping`.

    Raises:
        KeyError: If a `custom_id` has no entry in `data_mapping`, or a line
            lacks one of the expected fields.
    """
    if data_mapping is None:
        data_mapping = input_data_mapping

    output_list = []
    with open(input_filepath, 'r') as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            key = int(record['custom_id'])
            answers = [
                choice['message']['content']
                for choice in record['response']['body']['choices']
            ]
            # Build a new dict instead of mutating the shared benchmark item,
            # so results from one file cannot leak into the next call.
            output_list.append({**data_mapping[key], 'prediction': answers})

    parent_dir = os.path.dirname(output_filepath)
    if parent_dir:
        # open(..., 'w') does not create intermediate directories.
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_filepath, 'w') as fout:
        json.dump(output_list, fout, indent=4)

In [8]:
# Reformat every merged file whose name contains the current task.
file_list = glob.glob(os.path.join(base_input_dir, f'*{task}*'))

for filepath in file_list:
    # basename copes with the platform's path separator, unlike split('/').
    filename = os.path.basename(filepath)
    # The model name is the part of the filename before the first underscore.
    model_name = filename.split('_')[0]
    # The prompt setting is encoded in the filename as substrings.
    w_abs = 'w_abstract' in filename
    w_content = 'w_content' in filename
    tag = f'abs{w_abs}_ctx{w_content}'
    reformat_caption_generation(
        input_filepath=filepath,
        output_filepath=os.path.join(output_dir, tag, f'k_{k}', f'{model_name}.json')
    )

In [23]:
# reformat matching results
import os
import json
import glob
from collections import defaultdict

# Configuration for the caption *matching* task.
# NOTE: this rebinds `k`, `task`, `output_dir` and `input_data_mapping`, which
# the generation cells also assign; run the generation cells before this one.
base_input_dir = './output/openai_output_w_settings_merged'
k = 5  # only used to name the output sub-directory (`k_{k}`)
task = 'matching'
output_dir = f'./output/image_caption_{task}'

base_data_dir = '/home/ubuntu/MMSci/mmsci-data/benchmark/test/'
# Use a context manager so the file handle is closed deterministically.
with open(os.path.join(base_data_dir, f'image_caption_{task}_data.json')) as fin:
    all_input_data = json.load(fin)
# 1-based setting number -> {positional index within that setting -> item}.
input_data_mapping = defaultdict(dict)
for setting, input_data in enumerate(all_input_data):
    input_data_mapping[setting+1] = dict(enumerate(input_data))
print(input_data_mapping.keys())

def reformat_caption_matching(input_filepath, model_name, tag,
                              data_mapping=None, base_output_dir=None, top_k=None):
    """Split a batch-output JSONL file into one prediction file per setting.

    Each non-blank line of `input_filepath` is parsed as JSON. Its `custom_id`
    has the form `'<setting>_<index>'`; both parts are converted to int and
    used to look up the benchmark item in `data_mapping[setting][index]`. The
    `message.content` of every entry in `response.body.choices` is wrapped as
    `{'answer': content}` and the list is stored under `'prediction'` on a
    copy of the item. For every setting seen, the items are written to
    `<base_output_dir>/<tag>/setting-<setting>/k_<top_k>/<model_name>.json`.

    Args:
        input_filepath: Path to the JSONL file to read.
        model_name: Used as the output file's base name.
        tag: Sub-directory name placed directly under the output root.
        data_mapping: Nested mapping setting -> index -> item (a dict).
            Defaults to the notebook-level `input_data_mapping`.
        base_output_dir: Output root. Defaults to the notebook-level `output_dir`.
        top_k: Value used in the `k_<top_k>` directory name. Defaults to the
            notebook-level `k`.

    Raises:
        KeyError: If a (setting, index) pair is missing from `data_mapping`,
            or a line lacks one of the expected fields.
        ValueError: If a `custom_id` is not two underscore-separated integers.
    """
    if data_mapping is None:
        data_mapping = input_data_mapping
    if base_output_dir is None:
        base_output_dir = output_dir
    if top_k is None:
        top_k = k

    all_output_list = defaultdict(list)
    with open(input_filepath, 'r') as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            setting, key = record['custom_id'].split('_')  # f'{setting+1}_{str(idx)}'
            setting, key = int(setting), int(key)
            answers = [
                {'answer': choice['message']['content']}
                for choice in record['response']['body']['choices']
            ]
            # Build a new dict instead of mutating the shared benchmark item,
            # so results from one file cannot leak into the next call.
            all_output_list[setting].append(
                {**data_mapping[setting][key], 'prediction': answers}
            )

    for setting, output_list in all_output_list.items():
        output_filepath = os.path.join(
            base_output_dir, tag, f'setting-{setting}', f'k_{top_k}', f'{model_name}.json'
        )
        # open(..., 'w') does not create intermediate directories.
        os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
        with open(output_filepath, 'w') as fout:
            json.dump(output_list, fout, indent=4)

dict_keys([1, 2, 3])


In [24]:
# Reformat every merged file whose name contains the current task.
file_list = glob.glob(os.path.join(base_input_dir, f'*{task}*'))

for filepath in file_list:
    # basename copes with the platform's path separator, unlike split('/').
    filename = os.path.basename(filepath)
    print(filename)
    # The model name is the part of the filename before the first underscore.
    model_name = filename.split('_')[0]
    # 'wo-cot' does not contain the substring 'w-cot', so this test is unambiguous.
    tag = 'w_cot' if 'w-cot' in filename else 'wo_cot'
    print(tag, model_name)
    reformat_caption_matching(
        input_filepath=filepath,
        model_name=model_name,
        tag=tag,
    )

gpt-4-turbo_w-cot_image_caption_matching_data.jsonl
w_cot gpt-4-turbo
gpt-4-turbo_wo-cot_image_caption_matching_data.jsonl
wo_cot gpt-4-turbo
gpt-4o_wo-cot_image_caption_matching_data.jsonl
wo_cot gpt-4o
gpt-4o_w-cot_image_caption_matching_data.jsonl
w_cot gpt-4o
