In [3]:
import pandas as pd
import os
import json
import sys
# sys.path.insert(1, '../src/utils/')
# from helpers import load_json, write_json

def load_json(filename):
    """
    Load a JSON file given a filename
    If the file doesn't exist, then return an empty dictionary instead
    """
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

def write_json(data, filepath):
    # assert isinstance(data, dict), '[ERROR] Expect dictionary data!'
    json_data = data
    if isinstance(data, dict):
        json_data = {str(k): data[k] for k in data} # convert deys into str
    json_string = json.dumps(json_data, indent = 4)
    with open(filepath, 'w') as outfile:
        outfile.write(json_string)

In [4]:
curr_dir = '../'

# 1. Few-shot 

## 1.1 load data

In [17]:
inference_output = load_json(curr_dir + 'data/output/test_creative_writing_v1_1/inference_output.json')
# inference_output
ttcw_annotation = load_json(curr_dir + 'data/processed/ttcw/ttcw_annotations.json')
claude_output = load_json(curr_dir + 'data/processed/ttcw/claude.json')
gpt4_output = load_json(curr_dir + 'data/processed/ttcw/gpt4.json')
gpt3_5_output = load_json(curr_dir + 'data/processed/ttcw/gpt3_5.json')

len(ttcw_annotation) / 48

14.0

In [11]:
# claude_output[:1]

In [12]:
ttcw_annotation[:3]

[{'story_id': '0_Claude', 'ttcw_idx': 1, 'binary_verdict': 'No'},
 {'story_id': '0_Claude', 'ttcw_idx': 2, 'binary_verdict': 'No'},
 {'story_id': '0_Claude', 'ttcw_idx': 3, 'binary_verdict': 'Yes'}]

In [13]:
# inference_output['0_NewYorker']

In [14]:
# claude_output[0]

In [57]:
def transform(original_data):
    verdict2int = {
        'Yes': 1,
        'No': 0
    }
    our_data = {}
    for original_dp in original_data:
        our_dp = {
            'prompt_id': original_dp['meta_data']['id'],
            'data': original_dp,
            'raw_output': original_dp['output']['content'],
            'human_output': {k: -1 for k in range(1, 15)}
        }
        for anno in ttcw_annotation:
            if anno['story_id'] == original_dp['meta_data']['id']:
                our_dp['human_output'][anno['ttcw_idx']] = verdict2int[anno['binary_verdict']]
        our_data[original_dp['meta_data']['id']] = our_dp
    return our_data

In [58]:
claude_output_transformed = transform(claude_output)
write_json(claude_output_transformed, curr_dir + 'data/output/ttcw_evaluator_check/claude/inference_output.json')

gpt3_5_output_transformed = transform(gpt3_5_output)
write_json(gpt3_5_output_transformed, curr_dir + 'data/output/ttcw_evaluator_check/gpt3_5/inference_output.json')

gpt4_output_transformed = transform(gpt4_output)
write_json(gpt4_output_transformed, curr_dir + 'data/output/ttcw_evaluator_check/gpt4/inference_output.json')

In [19]:
all_stories = {}
all_stories.update(gpt4_output_transformed)
all_stories.update(gpt3_5_output_transformed)
all_stories.update(claude_output_transformed)
len(list(all_stories.keys()))

36

In [20]:
all_stories = {k: all_stories[k]['raw_output'] for k in all_stories}

In [21]:
# write_json(all_stories, 'creative_bench/data/processed/ttcw/all_stories.json')

## 1.2 Sample some few-shot demostrations

In [22]:
ttcw_annotation = load_json(curr_dir + 'data/processed/ttcw/ttcw_annotations.json')
ttcw_annotation[0]

{'story_id': '0_Claude', 'ttcw_idx': 1, 'binary_verdict': 'No'}

In [23]:
pos_data = {k: [] for k in range(1, 15)}
neg_data = {k: [] for k in range(1, 15)}
for dp in ttcw_annotation:
    if 'Claude' in dp['story_id'] or 'NewYorker' in dp['story_id']: continue 
    if dp['binary_verdict'] == 'Yes':
        pos_data[dp['ttcw_idx']].append(dp)
    else:
        neg_data[dp['ttcw_idx']].append(dp)

In [24]:
for k, l in pos_data.items():
    if len(l) == 0:
        print('empty set found in pos_data for k =', k)

for k, l in neg_data.items():
    if len(l) == 0:
        print('empty set found in neg_data for k =', k)

empty set found in pos_data for k = 7
empty set found in pos_data for k = 8
empty set found in pos_data for k = 10
empty set found in pos_data for k = 14


In [25]:
hard_list = [7, 8, 10, 14]

In [26]:
pos_demostrations = {k: pos_data[k][0] for k in pos_data if k not in hard_list}
neg_demostrations = {k: neg_data[k][0] for k in neg_data}

In [27]:
pos_demostrations

{1: {'story_id': '2_GPT4', 'ttcw_idx': 1, 'binary_verdict': 'Yes'},
 2: {'story_id': '0_GPT4', 'ttcw_idx': 2, 'binary_verdict': 'Yes'},
 3: {'story_id': '0_GPT4', 'ttcw_idx': 3, 'binary_verdict': 'Yes'},
 4: {'story_id': '0_GPT4', 'ttcw_idx': 4, 'binary_verdict': 'Yes'},
 5: {'story_id': '1_GPT4', 'ttcw_idx': 5, 'binary_verdict': 'Yes'},
 6: {'story_id': '4_GPT4', 'ttcw_idx': 6, 'binary_verdict': 'Yes'},
 9: {'story_id': '0_GPT4', 'ttcw_idx': 9, 'binary_verdict': 'Yes'},
 11: {'story_id': '11_GPT4', 'ttcw_idx': 11, 'binary_verdict': 'Yes'},
 12: {'story_id': '10_GPT4', 'ttcw_idx': 12, 'binary_verdict': 'Yes'},
 13: {'story_id': '0_GPT4', 'ttcw_idx': 13, 'binary_verdict': 'Yes'}}

In [117]:
neg_demostrations

{1: {'story_id': '0_GPT3.5', 'ttcw_idx': 1, 'binary_verdict': 'No'},
 2: {'story_id': '0_GPT3.5', 'ttcw_idx': 2, 'binary_verdict': 'No'},
 3: {'story_id': '0_GPT3.5', 'ttcw_idx': 3, 'binary_verdict': 'No'},
 4: {'story_id': '0_GPT3.5', 'ttcw_idx': 4, 'binary_verdict': 'No'},
 5: {'story_id': '0_GPT3.5', 'ttcw_idx': 5, 'binary_verdict': 'No'},
 6: {'story_id': '0_GPT3.5', 'ttcw_idx': 6, 'binary_verdict': 'No'},
 7: {'story_id': '0_GPT3.5', 'ttcw_idx': 7, 'binary_verdict': 'No'},
 8: {'story_id': '0_GPT3.5', 'ttcw_idx': 8, 'binary_verdict': 'No'},
 9: {'story_id': '0_GPT3.5', 'ttcw_idx': 9, 'binary_verdict': 'No'},
 10: {'story_id': '0_GPT3.5', 'ttcw_idx': 10, 'binary_verdict': 'No'},
 11: {'story_id': '0_GPT3.5', 'ttcw_idx': 11, 'binary_verdict': 'No'},
 12: {'story_id': '0_GPT3.5', 'ttcw_idx': 12, 'binary_verdict': 'No'},
 13: {'story_id': '0_GPT3.5', 'ttcw_idx': 13, 'binary_verdict': 'No'},
 14: {'story_id': '0_GPT3.5', 'ttcw_idx': 14, 'binary_verdict': 'No'}}

# 2. Evaluator Check

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import pearsonr

In [4]:
def human_model_comp(run_id, likert = False ):
    ground_truth = load_json(curr_dir + 'data/output/{}/inference_output.json'.format(run_id))
    eval_predict = load_json(curr_dir + 'data/output/{}/eval_output_cleaned.json'.format(run_id))

    all_comp_data = []
    for story_key in ground_truth:
        model_output = {k: -1 for k in range(1, 15)}
        found_match = False
        for pred_dp in eval_predict:
            if pred_dp['prompt_id'][0] == story_key:
                model_output[pred_dp['prompt_id'][1]] = pred_dp['cleaned_output']
                found_match = True
        if found_match:
            all_comp_data.append({
                'story_id': story_key,
                'human_output': ground_truth[story_key]['human_output'],
                'model_output': model_output
            })
    

    acc_by_q = {k: 0 for k in range(1, 15)}
    pos_true = 0
    pos_pred = 0
    invalid_pred = 0
    all_true = []
    all_pred = []
    
    for k in range(1, 15):
        true = [comp['human_output'][str(k)] for comp in all_comp_data]
        pred = [comp['model_output'][k] for comp in all_comp_data]
        invalid_pred += sum([p == -1 for p in pred])
        all_true.extend(true)
        all_pred.extend(pred)
        if likert:
            acc_by_q[k] = round(pearsonr(true, pred).statistic, 2)
        else:
            acc_by_q[k] = round(accuracy_score(true, pred), 2)
        pos_true += sum(true)
        pos_pred += sum(pred)
    acc_by_q['avg_acc'] = round(np.mean(list(acc_by_q.values())), 2)
    if likert:
        acc_by_q['all_acc'] = round(pearsonr(all_true, all_pred).statistic, 2)
        pos_pred = round(pos_pred / (5 * len(all_comp_data) * 14), 2)
    else:
        acc_by_q['all_acc'] = round(accuracy_score(all_true, all_pred), 2)
    return acc_by_q, pos_true, pos_pred, invalid_pred

In [33]:
# run_id = 'ttcw_evaluator_check/claude/5_scale_eval_ds_llama70b'
# human_model_comp(run_id, True)

In [34]:
run_id = 'ttcw_evaluator_check/claude/few_shot_eval_gpt4o'
human_model_comp(run_id)

({1: 0.17,
  2: 0.5,
  3: 0.58,
  4: 0.67,
  5: 0.17,
  6: 0.17,
  7: 0.42,
  8: 0.5,
  9: 0.67,
  10: 1.0,
  11: 0.42,
  12: 0.0,
  13: 0.67,
  14: 0.17,
  'avg_acc': 0.44,
  'all_acc': 0.43},
 39,
 132,
 0)

In [8]:
# run_id = 'ttcw_evaluator_check/claude/5_scale_eval_gpt4o_mini'
# human_model_comp(run_id, True)

In [9]:
run_id = 'ttcw_evaluator_check/claude/few_shot_eval_llama70b'
human_model_comp(run_id)

({1: 0.33,
  2: 0.58,
  3: 0.58,
  4: 0.5,
  5: 0.17,
  6: 0.25,
  7: 0.42,
  8: 0.25,
  9: 0.67,
  10: 0.42,
  11: 0.42,
  12: 0.08,
  13: 0.67,
  14: 0.33,
  'avg_acc': np.float64(0.4),
  'all_acc': 0.4},
 39,
 131,
 0)

In [10]:
run_id = 'ttcw_evaluator_check/claude/few_shot_eval_qwen32b'
human_model_comp(run_id)

({1: 0.17,
  2: 0.5,
  3: 0.58,
  4: 0.58,
  5: 1.0,
  6: 0.17,
  7: 0.33,
  8: 0.67,
  9: 0.83,
  10: 1.0,
  11: 0.17,
  12: 0.0,
  13: 0.75,
  14: 0.08,
  'avg_acc': np.float64(0.49),
  'all_acc': 0.49},
 39,
 113,
 0)

In [11]:
run_id = 'ttcw_evaluator_check/claude/few_shot_eval_qwen72b'
human_model_comp(run_id)

({1: 0.5,
  2: 0.67,
  3: 0.42,
  4: 0.33,
  5: 0.67,
  6: 0.75,
  7: 0.75,
  8: 0.75,
  9: 1.0,
  10: 1.0,
  11: 0.92,
  12: 0.33,
  13: 0.5,
  14: 1.0,
  'avg_acc': np.float64(0.68),
  'all_acc': 0.68},
 39,
 28,
 0)

In [5]:
import pandas as pd
from collections import Counter

def agg_pred(model_output, question, method = 'majority'):
    all_pred = [model_output[model][question] for model in model_output]
    if method == 'majority':
        return Counter(all_pred).most_common(1)[0][0]
    elif method == 'all':
        return 1 if sum(all_pred) == len(all_pred) else 0
    
def get_metrics(true, pred):
    # all_true = []
    # all_pred = []
    invalid_pred = sum([p == -1 for p in pred])
    # all_true.extend(true)
    # all_pred.extend(pred)
    pearson_r = round(pearsonr(true, pred).statistic, 2)
    pearson_p = round(pearsonr(true, pred).pvalue, 2)
    acc = round(accuracy_score(true, pred), 2)
    fi = round(f1_score(true, pred), 2)
    pos_true = sum(true)
    pos_pred = sum(pred)
    return {
        'pearson_r': pearson_r,
        'pearson_p': pearson_p,
        'acc': acc,
        'fi': fi,
        'pos_true': pos_true,
        'pos_pred': pos_pred,
        'invalid': invalid_pred,
        'all_data': len(true)
    }

In [6]:
Counter([0, 1]).most_common(1)[0][0]

0

In [37]:
# Counter([1, 0]).most_common(0)#[0][0]

In [16]:
pwd

'/ihome/xli/joh227/developer/creative_bench/creative_bench/notebooks'

In [29]:
def human_model_comp_merged(run_id_lst, method = 'all', likert = False):
    all_comp_data = {}
    for run_id in run_id_lst:
        ground_truth = load_json(curr_dir + 'data/output/{}/inference_output.json'.format(run_id))
        eval_predict = load_json(curr_dir + 'data/output/{}/eval_output_cleaned.json'.format(run_id))
        # print(len(ground_truth))
        # print(len(eval_predict))
        
        for story_key in ground_truth:
            model_output = {k: -1 for k in range(1, 15)}
            found_match = False
            for pred_dp in eval_predict:
                if pred_dp['prompt_id'][0] == story_key:
                    model_output[pred_dp['prompt_id'][1]] = pred_dp['cleaned_output']
                    found_match = True
            if found_match:
                if story_key not in all_comp_data:
                    all_comp_data[story_key] = ({
                        'human_output': ground_truth[story_key]['human_output'],
                        'model_output': {
                            run_id.split('/')[-1].split('_')[-1]: model_output
                        }
                    })
                else:
                    all_comp_data[story_key]['model_output'][run_id.split('/')[-1].split('_')[-1]] = model_output
    

    result_data = []
    for k in [1, 2, 6, 13]:
        true = [all_comp_data[comp]['human_output'][str(k)] for comp in all_comp_data]
        pred = [
            agg_pred(all_comp_data[comp]['model_output'], k, method)
            for comp in all_comp_data
        ]
        metrics = get_metrics(true, pred)
        metrics['ttcw_idx'] = k
        result_data.append(metrics)

    return pd.DataFrame(result_data)

In [30]:
run_id_lst = [
    'ttcw_evaluator_check/all',
]
human_model_comp_merged(run_id_lst, method = 'majority').sort_values(by = 'pearson_r')

Unnamed: 0,pearson_r,pearson_p,acc,fi,pos_true,pos_pred,invalid,all_data,ttcw_idx
2,0.21,0.21,0.86,0.29,4,3,0,36,6
0,0.29,0.09,0.69,0.35,4,13,0,36,1
3,0.4,0.02,0.72,0.38,13,3,0,36,13
1,0.45,0.01,0.78,0.6,11,9,0,36,2


In [54]:
run_id_lst = [
    'ttcw_evaluator_check/claude/few_shot_eval_qwen72b',
    # 'ttcw_evaluator_check/claude/few_shot_eval_llama70b',
    # 'ttcw_evaluator_check/claude/few_shot_eval_gpt4o_mini'
]
human_model_comp_merged(run_id_lst, method = 'majority')

Unnamed: 0,pearson_r,pearson_p,acc,fi,pos_true,pos_pred,invalid,all_data
0,0.0,1.0,0.5,0.25,2,6,0,12
1,0.35,0.26,0.67,0.6,6,4,0,12
2,0.26,0.42,0.75,0.4,2,3,0,12
3,0.32,0.32,0.5,0.4,8,2,0,12


In [61]:
run_id_lst = [
    'ttcw_evaluator_check/gpt4/few_shot_eval_qwen72b',
    # 'ttcw_evaluator_check/gpt4/few_shot_eval_llama70b',
    # 'ttcw_evaluator_check/gpt4/few_shot_eval_gpt4o_mini'
]
human_model_comp_merged(run_id_lst, method = 'all')

  pearson_r = round(pearsonr(true, pred).statistic, 2)
  pearson_p = round(pearsonr(true, pred).pvalue, 2)


Unnamed: 0,pearson_r,pearson_p,acc,fi,pos_true,pos_pred,invalid,all_data
0,0.45,0.14,0.67,0.5,2,6,0,12
1,0.48,0.12,0.75,0.67,4,5,0,12
2,,,0.92,0.0,1,0,0,12
3,0.43,0.17,0.75,0.4,4,1,0,12


In [62]:
run_id_lst = [
    'ttcw_evaluator_check/gpt3_5/few_shot_eval_qwen72b',
    # 'ttcw_evaluator_check/gpt3_5/few_shot_eval_llama70b',
    # 'ttcw_evaluator_check/claude/few_shot_eval_gpt4o_mini'
]
human_model_comp_merged(run_id_lst, method = 'all')

  pearson_r = round(pearsonr(true, pred).statistic, 2)
  pearson_p = round(pearsonr(true, pred).pvalue, 2)
  pearson_r = round(pearsonr(true, pred).statistic, 2)
  pearson_p = round(pearsonr(true, pred).pvalue, 2)
  pearson_r = round(pearsonr(true, pred).statistic, 2)
  pearson_p = round(pearsonr(true, pred).pvalue, 2)
  pearson_r = round(pearsonr(true, pred).statistic, 2)
  pearson_p = round(pearsonr(true, pred).pvalue, 2)


Unnamed: 0,pearson_r,pearson_p,acc,fi,pos_true,pos_pred,invalid,all_data
0,,,0.92,0.0,0,1,0,12
1,,,0.92,0.0,1,0,0,12
2,,,0.92,0.0,1,0,0,12
3,,,0.92,0.0,1,0,0,12


In [None]:
# create a combined json for qwen72b inference output 

In [13]:
run_id_lst = [
    'ttcw_evaluator_check/all',
    # 'ttcw_evaluator_check/claude/few_shot_eval_llama70b',
    # 'ttcw_evaluator_check/claude/few_shot_eval_gpt4o_mini'
]
human_model_comp_merged(run_id_lst, method = 'majority')

0
0


ValueError: `x` and `y` must have length at least 2.

# 3. Model Performances

In [13]:
proj_dir = "/ix1/xli/bkb45/joey_files/creative_bench"

batch_id = 'ttcw_tmp_var'
batch_id = 'ttcw_v1_0'

exclude_lst = [
    'gemini_2_pro',
    '__MACOSX'
]

output_dir = '{}/{}/{}/'.format(proj_dir, "data/output", batch_id)
# output_dir = '../results/'
model_lst = [m for m in os.listdir(output_dir) if os.path.isdir(output_dir + m) and m not in exclude_lst]

model_lst

['olmo_7b',
 'mistral_small_24b',
 'deepseek_qwen_32b',
 'mixtral_8x7b',
 'deepseek_qwen_7b',
 'llama3_8b_instr',
 'gemini_2_flash',
 'olmo_13b_sft',
 'qwen_32b_instruct',
 'gpt_4.1_mini',
 'deepseek_r1',
 'olmo_13b_dpo',
 'deepseek_llama_70b',
 'mistral_7b_instr',
 'olmo_13b',
 'deepseek_v3',
 'gpt_4.1',
 'qwen_32b_instruct_1',
 'llama3_70b_instruct',
 'claude_37_sonnet',
 'qwen_7b_instruct',
 'claude_3_haiku',
 'qwen_72b_instruct']

In [14]:
# output_dir
# # model_lst

In [15]:

# skip_lst = [] #['qwen_32b_instruct']
# for model in model_lst:
#     if model in skip_lst: continue
#     inference_results = load_json(
#         '/ix1/xli/bkb45/joey_files/creative_bench/data/output/{}/{}/inference_output.json'.format(
#             batch_id, model
#         )
#     )
#     print(len(inference_results))

In [16]:
result_dfs = []
skip_lst = []
skip_lst = ['qwen_32b_instruct'] # we are using qwen_32b_instruct_1 instead
for model in model_lst:
    if model in skip_lst: continue
    tmp_df = pd.read_csv(output_dir + model + '/eval_report.csv')
    tmp_df['model'] = model
    result_dfs.append(tmp_df)

In [17]:
combined_df = pd.concat(result_dfs)
combined_df['passed'] = combined_df.apply(
    lambda row: 1 if sum([
        row[col] for col in combined_df.columns if '-' in col
    ]) == 4 else 0,
    axis = 1
)

In [18]:
grouped_df = combined_df.groupby('model').agg('sum')
grouped_df = grouped_df.drop(columns = ['dp_id'])
grouped_df = (grouped_df / result_dfs[0].shape[0]).round(2).sort_values(by = 'passed', ascending = False)
grouped_df

Unnamed: 0_level_0,Fluency - Narrative Ending,Fluency - Understandability and Coherence,Flexibility - Emotional Flexibility,Elaboration - World Building and Setting,passed
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
claude_37_sonnet,0.75,0.58,0.58,0.42,0.33
gpt_4.1,1.0,0.67,0.83,0.5,0.33
deepseek_v3,0.83,0.5,0.67,0.5,0.17
gpt_4.1_mini,1.0,0.83,0.42,0.5,0.17
deepseek_r1,0.83,0.5,0.58,0.58,0.17
deepseek_llama_70b,0.5,0.5,0.17,0.33,0.08
olmo_13b,0.67,0.33,0.25,0.08,0.08
deepseek_qwen_7b,0.0,0.0,0.0,0.0,0.0
claude_3_haiku,0.33,0.25,0.0,0.08,0.0
deepseek_qwen_32b,0.42,0.17,0.25,0.17,0.0


In [19]:
grouped_df.to_csv('../results/summary_{}.csv'.format(batch_id))