# 1 Setup

In [18]:
import pickle
import pandas as pd
import numpy as np
from scipy import stats

In [19]:
mturk_data_dir = '../data/mturk_data/'
output_dir = '../data/outputs/'
original_data_dir = '../data/original_data/'

In [20]:
task_list = [
    "atypicality", 
    "creativity", 
    "originality"
]

# 2 Intrinsic Eval

In [21]:
def eval_intrinsic(intrinsic_pred_task):
    # single_label 
    true_labels = [pred['true_average'] for pred in intrinsic_pred_task]
    pred_labels = [pred['labels'][0] for pred in intrinsic_pred_task]
    # if self.debug:
    #     print('true_labels[:3]:', true_labels[:3])
    #     print('pred_labels[:3]:', pred_labels[:3])
    # res = stats.pearsonr(true_labels, pred_labels)
    single_label_corr = stats.spearmanr(true_labels, pred_labels) # get pearson r
    
    # distribution
    distribution_diff = None
    pred_distributions = [[p[0] for p in pred['label_distribution']] for pred in intrinsic_pred_task]
    true_distributions = [pred['true_distribution'] for pred in intrinsic_pred_task]
    if len(true_distributions) > 0 and len(true_distributions[0]) > 0:
        # max_len = max([len(pred_distributions), len(true_distributions)])
        # print("pred_distributions[:max_len]:", pred_distributions[:max_len])
        # print("true_distributions[:max_len]:", true_distributions[:max_len])
        distribution_diff = []
        for i in range(len(pred_distributions)):
            # max_len = max([len(pred_distributions), len(true_distributions)])
            distribution_diff.append(stats.kstest(
                pred_distributions[i], true_distributions[i]
            ))
        # distribution_diff = stats.kstest(pred_distributions[:max_len], true_distributions[:max_len])
        # distribution_diff = stats.kstest(pred_distributions, true_distributions)
    
    # disagreement
    pred_disagreements = [pred['disagreements'][0] for pred in intrinsic_pred_task]
    true_disagreements = [pred['true_disagreement'] for pred in intrinsic_pred_task]
    # print("pred_disagreements:", pred_disagreements)
    # print("true_disagreements:", true_disagreements)
    disagreement_corr = stats.spearmanr(pred_disagreements, true_disagreements) #.statistic # get pearson r

    return {
        "single_label_corr": round(single_label_corr.statistic, 4),
        "single_label_corr_p": round(single_label_corr.pvalue, 4),
        "distribution_ks": np.mean([round(d.statistic, 4) for d in distribution_diff]) if distribution_diff is not None else None,
        "distribution_p": np.mean([round(d.pvalue, 4) for d in distribution_diff]) if distribution_diff is not None else None,
        "disagreement_corr": round(disagreement_corr.statistic, 4),
        "disagreement_corr_p": round(disagreement_corr.pvalue, 4),
    }
        
def eval_batch_intrinsic(batch_pred):
    intrinsic_pred = batch_pred['intrinsic']
    # pairwise_pred = batch_pred['pairwise']
    eval_results = []
    for task in task_list:
        tmp_result = eval_intrinsic(intrinsic_pred[task])
        tmp_result['task'] = task
        # tmp_result['intrinsic_single_label'] = intrinsic_results['single_label_corr']
        # tmp_result['intrinsic_distribution'] = intrinsic_results['distribution_p']
        # tmp_result['intrinsic_disagreement'] = intrinsic_results['disagreement_corr']
        # tmp_result['pairwise'] = eval_pairwise(pairwise_pred[task])
        eval_results.append(tmp_result)
    eval_result_df = pd.DataFrame(eval_results)
    # display(eval_result_df)
    return eval_result_df


In [22]:
batch_pred_gpt4 = pickle.load(open(output_dir + 'pickles/batch_pred_gpt4.pkl', 'rb'))
eval_batch_intrinsic(batch_pred_gpt4)

Unnamed: 0,single_label_corr,single_label_corr_p,distribution_ks,distribution_p,disagreement_corr,disagreement_corr_p,task
0,0.7408,0.0002,,,0.0983,0.6802,atypicality
1,0.7845,0.0,,,0.1371,0.5643,creativity
2,0.7643,0.0001,,,0.105,0.6595,originality


In [23]:
batch_pred_llava_7b = pickle.load(open(output_dir + 'pickles/batch_pred_llava-hf-llava-v1.6-mistral-7b-hf.pkl', 'rb'))
eval_batch_intrinsic(batch_pred_llava_7b)

Unnamed: 0,single_label_corr,single_label_corr_p,distribution_ks,distribution_p,disagreement_corr,disagreement_corr_p,task
0,0.03,0.9,0.406005,0.15517,0.1289,0.5881,atypicality
1,0.1549,0.5144,0.512,0.148515,0.3104,0.1829,creativity
2,0.2312,0.3267,0.33334,0.424415,0.2385,0.3113,originality


In [24]:
batch_pred_llava_7b = pickle.load(open(output_dir + 'pickles/batch_pred_llava-hf-llava-v1.6-vicuna-13b-hf.pkl', 'rb'))
eval_batch_intrinsic(batch_pred_llava_7b)

  disagreement_corr = stats.spearmanr(pred_disagreements, true_disagreements) #.statistic # get pearson r


Unnamed: 0,single_label_corr,single_label_corr_p,distribution_ks,distribution_p,disagreement_corr,disagreement_corr_p,task
0,0.3124,0.1799,0.48533,0.120605,0.1458,0.5398,atypicality
1,-0.1277,0.5915,0.63867,0.013125,,,creativity
2,0.5264,0.0171,0.472665,0.20051,-0.4729,0.0352,originality


In [25]:
# batch_pred_gpt4_pairwise = batch_pred_gpt4['pairwise']
# batch_pred_gpt4_pairwise['atypicality'][:5]

In [26]:
# batch_pred_llava_7b_pairwise = batch_pred_llava_7b['pairwise']
# batch_pred_llava_7b_pairwise['atypicality'][:5]

# 3 Pairwise Eval

In [27]:
from sklearn.metrics import accuracy_score, f1_score

In [32]:
def eval_pairwise_single(pairwise_pred_task):
    correct = 0
    all_target = []
    all_pred = [pred['labels'][0] for pred in pairwise_pred_task]
    for pred in pairwise_pred_task:
        if pred['true'] > 0:
            all_target.append(1)
        else:
            all_target.append(2)

    # filter out str predictions 
    all_pred_cleaned = []
    all_target_cleaned = []
    num_dropped = 0
    for i in range(len(all_pred)):
        if isinstance(all_pred[i], str): 
            num_dropped += 1
            continue 
        all_pred_cleaned.append(all_pred[i])
        all_target_cleaned.append(all_target[i])
    # print('# of dp dropped:', num_dropped)
        
    all_pred = all_pred_cleaned
    all_target = all_target_cleaned
    
    # for pred in pairwise_pred_task:
    #     if pred['true'] > 0 and pred['labels'][0] == 1:
    #         correct += 1
    #     elif pred['true'] < 0 and pred['labels'][0] == 2: ## TODO Check correctness
    #         correct += 1
    # print(correct)
    if len(pairwise_pred_task) == 0: return 0, 0, 0
    # return round(correct / len(pairwise_pred_task), 4)

    
    acc = round(accuracy_score(all_target, all_pred), 4)
    f1 = round(f1_score(all_target, all_pred), 4)
    return acc, f1, num_dropped

def eval_batch_pairwise(batch_pred):
    # intrinsic_pred = batch_pred['intrinsic']
    pairwise_pred = batch_pred['pairwise']
    eval_results = []
    for task in task_list:
        # tmp_result = eval_intrinsic(intrinsic_pred[task])
        tmp_result = {'task': task}
        # tmp_result['intrinsic_single_label'] = intrinsic_results['single_label_corr']
        # tmp_result['intrinsic_distribution'] = intrinsic_results['distribution_p']
        # tmp_result['intrinsic_disagreement'] = intrinsic_results['disagreement_corr']
        acc, f1, num_dropped = eval_pairwise_single(pairwise_pred[task])
        print('task:', task, '# of dp dropped:', num_dropped)
        tmp_result['pairwise_accuracy'] = acc
        tmp_result['pairwise_f1'] = f1
        tmp_result['pred_1'] = sum([pred['labels'][0] == 1 for pred in batch_pred['pairwise'][task]]) / len(batch_pred['pairwise'][task])
        tmp_result['pos_labels'] = sum([pred['true'] > 0 for pred in batch_pred['pairwise'][task]]) / len(batch_pred['pairwise'][task])
        tmp_result['dp_count'] = len(batch_pred['pairwise'][task])
        eval_results.append(tmp_result)
    eval_result_df = pd.DataFrame(eval_results)
    # display(eval_result_df)
    return eval_result_df

In [33]:
batch_pred_llava_7b_pairwise = pickle.load(open('../data/outputs/pickles/batch_pred_llava-hf-llava-v1.6-mistral-7b-hf_0611_175821.pkl', 'rb'))
eval_batch_pairwise(batch_pred_llava_7b_pairwise)

task: atypicality # of dp dropped: 0
task: creativity # of dp dropped: 0
task: originality # of dp dropped: 0


Unnamed: 0,task,pairwise_accuracy,pairwise_f1,pred_1,pos_labels,dp_count
0,atypicality,0.5189,0.6277,0.5,0.792453,106
1,creativity,0.6721,0.7872,0.672131,0.868852,61
2,originality,0.7755,0.8736,0.959184,0.816327,98


In [34]:
batch_pred_llava_13b_pairwise = pickle.load(open('../data/outputs/pickles/batch_pred_llava-hf-llava-v1.6-vicuna-13b-hf_0611_224428.pkl', 'rb'))
eval_batch_pairwise(batch_pred_llava_13b_pairwise)

task: atypicality # of dp dropped: 0
task: creativity # of dp dropped: 0
task: originality # of dp dropped: 0


Unnamed: 0,task,pairwise_accuracy,pairwise_f1,pred_1,pos_labels,dp_count
0,atypicality,0.7264,0.8221,0.745283,0.792453,106
1,creativity,0.7049,0.8125,0.704918,0.868852,61
2,originality,0.8673,0.9222,0.887755,0.816327,98


In [35]:
batch_pred_gpt4_pairwise = pickle.load(open('../data/outputs/pickles/batch_pred_gpt4_0612_205705.pkl', 'rb'))
eval_batch_pairwise(batch_pred_gpt4_pairwise)

task: atypicality # of dp dropped: 0
task: creativity # of dp dropped: 0
task: originality # of dp dropped: 0


Unnamed: 0,task,pairwise_accuracy,pairwise_f1,pred_1,pos_labels,dp_count
0,atypicality,0.7647,0.7846,0.554622,0.537815,119
1,creativity,0.8966,0.88,0.37931,0.482759,58
2,originality,0.9245,0.92,0.471698,0.471698,106


In [13]:
pd.read_csv(mturk_data_dir + "subset_0.5/modeling_atypicality_average_diff.csv").atypicality_average_diff.apply(lambda x: abs(x) > 0.5).sum()

109

In [15]:
pd.read_csv(mturk_data_dir + "subset_0.5/modeling_creativity_average_diff.csv").creativity_average_diff.apply(lambda x: abs(x) > 0.5).sum()

44

In [16]:
pd.read_csv(mturk_data_dir + "subset_0.5/modeling_originality_average_diff.csv").originality_average_diff.apply(lambda x: abs(x) > 0.5).sum()

100