In [None]:
import os, re, sys, json, string, csv, math
import pandas as pd
import numpy as np
from collections import Counter, defaultdict

In [None]:
# Make the project checkout importable. '~' is not expanded by the import machinery,
# so it has to be resolved to the real home directory before it goes on sys.path.
sys.path.append(os.path.expanduser('~/gpt-writing-prompts/'))

In [None]:
import story_analysis.utils as kp_utils
import story_analysis.attr_score_funcs as kp_funcs
import story_analysis.run_for_data as kp_run_for_data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import scipy.stats as stats

In [None]:
import random

## Analysis: create dataframes

In [None]:
# read data + metadata (pov info)

In [None]:
# Per-story metadata tables (gzip-compressed CSV), one for the human stories and one
# for the GPT stories. Later cells read the columns p_ind, s_ind, s_id, p_pov and s_pov.
h_info = pd.read_csv('data/meta_info/human_info.csv.gzip', compression='gzip')
m_info = pd.read_csv('data/meta_info/gpt_info.csv.gzip', compression='gzip')

In [None]:
# number of stories by pov
# PoV labels in the order used for rows/columns of the tables and Figure 1 below
order = ['TP-M', 'TP-F', 'FP', 'SP', 'Other']

In [None]:
# Share of each story PoV within every prompt PoV (rows sum to 1), tagged by writer.
tmp1 = (
    h_info.groupby(['p_pov'])['s_pov']
    .value_counts(normalize=True)
    .unstack()
    .assign(group='human')
)
tmp2 = (
    m_info.groupby(['p_pov'])['s_pov']
    .value_counts(normalize=True)
    .unstack()
    .assign(group='gpt-3.5')
)

# stack both writers and show only the PoV columns, in the canonical order
tmp = pd.concat([tmp1, tmp2])
display(tmp[order])

In [None]:
# Figure 1
# Two stacked-bar panels sharing both axes: top = human stories, bottom = GPT stories.
# Each bar is one prompt PoV; its segments are the proportions of story PoVs.
fig, axs = plt.subplots(figsize=(4, 6), nrows=2, ncols=1, sharex=True, sharey=True)
# h_info.groupby(['p_pov', 's_pov']).size().unstack().loc[order].plot(kind='bar', stacked=True, figsize=(7,5))
# .loc[order][order] arranges both the bars (rows) and the stacked segments (columns) by `order`
h_info.groupby(['p_pov'])['s_pov'].value_counts(normalize=True).unstack().loc[order][order].\
plot(kind='bar', stacked=True, ax=axs[0])
axs[0].set_xlabel("prompt PoV", fontsize=12)
axs[0].set_ylabel("proportion of stories", fontsize=12)
# plt.legend(title='Story PoV')
# keep the legend entries before removing the per-panel legend; they feed the figure-level legend
handles, labels = axs[0].get_legend_handles_labels()
axs[0].get_legend().remove()
axs[0].set_title("Human-written Stories")

# plt.figure()
# h_info.groupby(['p_pov', 's_pov']).size().unstack().loc[order].plot(kind='bar', stacked=True, figsize=(7,5))
# same table for the GPT stories, drawn into the second panel
m_info.groupby(['p_pov'])['s_pov'].value_counts(normalize=True).unstack().loc[order][order].\
plot(kind='bar', stacked=True, ax=axs[1])
axs[1].set_xlabel("prompt PoV", fontsize=12)
axs[1].set_ylabel("proportion of stories", fontsize=12)
# axs[1].set_ylabel("")
axs[1].get_legend().remove()
axs[1].set_title("GPT-generated Stories")

# one shared legend for both panels, built from the handles taken from the first panel
fig.legend(handles, labels, bbox_to_anchor=(0.1, 0.99), loc='lower left', ncol=3, prop={'size': 12})
plt.tight_layout()

In [None]:
# distinct prompt ids (p_ind) per prompt PoV in the human data, summed over all PoVs
h_info.groupby('p_pov')['p_ind'].nunique().sum()

In [None]:
# distinct prompt ids (p_ind) per prompt PoV in the GPT data (not summed)
m_info.groupby('p_pov')['p_ind'].nunique()

In [None]:
# h_info.ren

In [None]:
# Story-PoV proportions per prompt PoV for the human stories, pivoted to one row per
# prompt PoV and one column per story PoV.
# The value_counts result is renamed on the Series itself: its default name differs
# between pandas versions ('s_pov' before 2.0, 'proportion' from 2.0 on), so renaming
# a DataFrame column called 's_pov' only works on the older releases.
tmp1 = h_info.groupby('p_pov')['s_pov'].value_counts(normalize=True).rename('prop').reset_index().\
pivot(columns='s_pov', values='prop', index='p_pov')

In [None]:
# move p_pov from the index into a regular column and tag the rows with their writer
# NOTE(review): mutates tmp1 in place, so re-running this cell alone is not idempotent
tmp1.reset_index(inplace=True)
tmp1['group'] = 'human'

In [None]:
# Story-PoV proportions per prompt PoV for the GPT stories, pivoted to one row per
# prompt PoV and one column per story PoV.
# The value_counts result is renamed on the Series itself: its default name differs
# between pandas versions ('s_pov' before 2.0, 'proportion' from 2.0 on), so renaming
# a DataFrame column called 's_pov' only works on the older releases.
tmp2 = m_info.groupby('p_pov')['s_pov'].value_counts(normalize=True).rename('prop').reset_index().\
pivot(columns='s_pov', values='prop', index='p_pov')
# p_pov back to a regular column, then tag the rows with their writer
tmp2 = tmp2.reset_index()
tmp2['group'] = 'gpt-3.5'

In [None]:
# human rows followed by gpt-3.5 rows; the 'group' column tells them apart
tmp = pd.concat([tmp1, tmp2])

In [None]:
# display the combined proportion table
tmp

In [None]:
# numbers for figure 1
# LaTeX table: rows sorted by (p_pov, group) descending, PoV columns in `order`, 2 decimals
print(tmp.sort_values(by=['p_pov', 'group'], ascending=False)[['p_pov', 'group']+order].\
      to_latex(float_format='%.2f', index=False))

## Analysis: load story scores and compute z-scores

In [None]:
def read_scores(readf):
    """Load the per-story scores stored in ``<readf>/out_scores.csv``.

    Every CSV row must hold exactly three fields: prompt id, story id and score.

    Returns:
        (sids, scores): two parallel lists. ``sids`` holds "<prompt id>-<story id>"
        strings, ``scores`` the matching values converted to float.
    """
    story_ids = []
    values = []
    csv_path = os.path.join(readf, 'out_scores.csv')

    with open(csv_path, 'r') as handle:
        # unpacking directly in the loop header enforces the three-field layout
        for prompt_id, story_id, raw_value in csv.reader(handle):
            story_ids.append(str(prompt_id) + '-' + str(story_id))
            values.append(float(raw_value))

    return story_ids, values

In [None]:
# The three axes of the score grid; the loaders below read one score file per
# (dimension, attribute method, score method) combination.
DIMENSIONS = ['valence', 'arousal', 'dominance', 'power', 'appearance', 'intellect']
ATTR_METHODS = ['all', 'sub', 'comet']
SCORE_METHODS = ['avg', 'axis', 'sim']

In [None]:
# same five PoV labels as at the top of the notebook
order = ['TP-M', 'TP-F', 'FP', 'SP', 'Other']

In [None]:
# human_dfs[dim][attr_method][score_method] -> human metadata left-joined with the scores
human_dfs = {dim: {am: {} for am in ATTR_METHODS} for dim in DIMENSIONS}

for dim, by_attr in human_dfs.items():
    for am, by_score in by_attr.items():
        for sm in SCORE_METHODS:
            score_dir = os.path.join('data/story_scores/human', dim, am, sm)
            sids, scores = read_scores(score_dir)

            df = pd.DataFrame({
                'dim': dim,
                'am': am,
                'sm': sm,
                's_id': sids,
                'score': scores,
            }).assign(writer='human')

            # 'axis' scores of these two dimensions are blanked out
            if sm == 'axis' and dim in ('appearance', 'intellect'):
                df['score'] = pd.NA

            df = df.convert_dtypes()
            # left join: every story in the metadata is kept, scored or not
            by_score[sm] = pd.merge(h_info[['p_ind', 's_ind', 's_id', 's_pov']], df, on='s_id', how='left')

In [None]:
# gpt_dfs[dim][attr_method][score_method] -> GPT metadata left-joined with the scores
gpt_dfs = {}
for dim in DIMENSIONS:

    gpt_dfs[dim] = {}
    for am in ATTR_METHODS:
        gpt_dfs[dim][am] = {}
        for sm in SCORE_METHODS:

            readf = os.path.join('outputs/gen', dim, am, sm)
            sids, scores = read_scores(readf)

            df = pd.DataFrame({
                        'dim': dim,
                        'am': am,
                        'sm': sm,
                        's_id': sids,
                        'score': scores
                    })
            df['writer'] = 'gpt-3.5'

            # Blank the 'axis' scores of these two dimensions, exactly as the human
            # loader does. The check has to be on the score method (sm): 'axis' is
            # never a value of the attribute method (am), so testing am never fired.
            if (dim == 'appearance' or dim == 'intellect') and (sm == 'axis'):
                df['score'] = pd.NA

            df = df.convert_dtypes()
            # left join: every story in the metadata is kept, scored or not
            ddf = pd.merge(m_info[['p_ind', 's_ind', 's_id', 's_pov']], df, on='s_id', how='left')
            gpt_dfs[dim][am][sm] = ddf

In [None]:
# add z_scores
# Human and GPT scores are standardised jointly per (dimension, attribute method,
# score method), so both writers share one scale.
for dim in DIMENSIONS:

    for am in ATTR_METHODS:
        for sm in SCORE_METHODS:
            hdf = human_dfs[dim][am][sm]
            mdf = gpt_dfs[dim][am][sm]

            # Pool only the observed scores. Missing entries are dropped up front:
            # np.nanmean on a list that holds pd.NA raises TypeError, which used to send
            # every combination with a missing score down the raw-score fallback.
            pooled = pd.concat([hdf['score'], mdf['score']], ignore_index=True).dropna()
            observed = pooled.to_numpy(dtype=float)

            if observed.size:
                z_mean = observed.mean()
                z_std = observed.std()  # population std (ddof=0), same as np.nanstd

                hdf['z_score'] = (hdf['score'] - z_mean) / z_std
                mdf['z_score'] = (mdf['score'] - z_mean) / z_std
            else:
                # nothing to standardise (e.g. blanked 'axis' scores): carry the column over
                hdf['z_score'] = hdf['score']
                mdf['z_score'] = mdf['score']

            human_dfs[dim][am][sm] = hdf
            gpt_dfs[dim][am][sm] = mdf
            

In [None]:
# set axis to be best method for all bidimensional scales, sim otherwise
# dimension -> score method used for the protagonist analyses below
best_sm_map = {x: 'axis' for x in DIMENSIONS}

# these two dimensions have their 'axis' scores blanked when loading, so they use 'sim'
best_sm_map['appearance'] = 'sim'
best_sm_map['intellect'] = 'sim'

In [None]:
# use comet attributes
# key into the attribute-method level of human_dfs / gpt_dfs
best_attr = 'comet'

In [None]:
# PoV categories shown on the x-axis of the point plots ('SP' is left out)
keep_order = ['TP-M', 'TP-F', 'FP', 'Other']

In [None]:
# overall story z-scores (not in paper)
# 2x3 grid, one panel per dimension, overlaying the human and GPT histograms

fig, axs = plt.subplots(nrows = 2, ncols = 3, figsize=(15, 8))
# axs.flat walks the grid row by row, matching the order of DIMENSIONS
for panel, dim in zip(axs.flat, DIMENSIONS):
    h_scores = human_dfs[dim]['all']['avg']['z_score'].tolist()
    m_scores = gpt_dfs[dim]['all']['avg']['z_score'].tolist()

    for values, writer in ((h_scores, 'human'), (m_scores, 'gpt-3.5')):
        panel.hist(values, label=writer, alpha=0.5)
    panel.set_title(dim)
    panel.legend()
plt.tight_layout()

In [None]:
# raw-score histograms for the first three dimensions, human vs GPT, on a shared y-axis
fig, axs = plt.subplots(nrows = 1, ncols = 3, figsize=(12, 4), sharey=True)
for panel, dim in zip(axs, DIMENSIONS[:3]):
    h_scores = human_dfs[dim]['all']['avg']['score'].tolist()
    m_scores = gpt_dfs[dim]['all']['avg']['score'].tolist()

    for values, writer in ((h_scores, 'human'), (m_scores, 'gpt-3.5')):
        panel.hist(values, label=writer, alpha=0.5)
    panel.set_title(dim.title())
    panel.legend()

plt.tight_layout()

In [None]:
# overall absolute story scores (not in paper)
# 3x2 grid, one panel per dimension: mean raw score per story PoV, split by writer.

fig, axs = plt.subplots(nrows = 3, ncols = 2, figsize=(6, 8), sharex=True)
i = 0
j = 0
for dim in DIMENSIONS:
    hdf = human_dfs[dim]['all']['avg']
    mdf = gpt_dfs[dim]['all']['avg']

    # dropna() returns a new frame, so the result must be kept for the filter to apply
    bdf = pd.concat([hdf, mdf]).dropna(subset=['s_pov', 'score'])

    sns.pointplot(data=bdf, x='s_pov', y='score', hue='writer', join=False, ax=axs[i][j], order=keep_order)
    # keep the legend entries before dropping the per-panel legend; reused for the figure legend
    handles, labels = axs[i][j].get_legend_handles_labels()
    axs[i][j].get_legend().remove()
    axs[i][j].grid(which='both')

    axs[i][j].set_title(dim.title(), fontsize=15)
    # only the left column carries a y-axis label
    if j == 0:
        axs[i][j].set_ylabel("score", fontsize=15)
    else:
        axs[i][j].set_ylabel("")

    axs[i][j].set_xlabel("")
    axs[i][j].xaxis.set_tick_params(labelsize=15)

    # advance through the grid row by row
    j += 1
    if j == 2:
        i += 1
        j = 0
fig.legend(handles, labels, bbox_to_anchor=(0.3, 0.99), loc='lower left', ncol=2, prop={'size': 12})
plt.tight_layout()

In [None]:
# overall story z-scores (not in paper)
# 3x2 grid, one panel per dimension: mean z-score per story PoV, split by writer.

fig, axs = plt.subplots(nrows = 3, ncols = 2, figsize=(6, 8), sharex=True)
i = 0
j = 0
for dim in DIMENSIONS:
    hdf = human_dfs[dim]['all']['avg']
    mdf = gpt_dfs[dim]['all']['avg']

    # dropna() returns a new frame, so the result must be kept for the filter to apply
    bdf = pd.concat([hdf, mdf]).dropna(subset=['s_pov', 'z_score'])

    sns.pointplot(data=bdf, x='s_pov', y='z_score', hue='writer', join=False, ax=axs[i][j], order=keep_order)
    # keep the legend entries before dropping the per-panel legend; reused for the figure legend
    handles, labels = axs[i][j].get_legend_handles_labels()
    axs[i][j].get_legend().remove()
    axs[i][j].grid(which='both')

    axs[i][j].set_title(dim.title(), fontsize=15)
    # only the left column carries a y-axis label
    if j == 0:
        axs[i][j].set_ylabel("z_score", fontsize=15)
    else:
        axs[i][j].set_ylabel("")
    axs[i][j].set_xlabel("")
    axs[i][j].xaxis.set_tick_params(labelsize=15)

    # advance through the grid row by row
    j += 1
    if j == 2:
        i += 1
        j = 0
fig.legend(handles, labels, bbox_to_anchor=(0.3, 0.99), loc='lower left', ncol=2, prop={'size': 12})
plt.tight_layout()

In [None]:
# protagonist attribute z-scores (not in paper)
# Same layout as the overall-score plots, but using the `best_attr` attributes and the
# per-dimension score method from `best_sm_map`.
fig, axs = plt.subplots(nrows = 3, ncols = 2, figsize=(6, 8), sharex=True)
i = 0
j = 0
for dim in DIMENSIONS:
    hdf = human_dfs[dim][best_attr][best_sm_map[dim]]
    mdf = gpt_dfs[dim][best_attr][best_sm_map[dim]]

    # dropna() returns a new frame, so the result must be kept for the filter to apply
    bdf = pd.concat([hdf, mdf]).dropna(subset=['s_pov', 'z_score'])

    sns.pointplot(data=bdf, x='s_pov', y='z_score', hue='writer', join=False, ax=axs[i][j], order=keep_order)
    # keep the legend entries before dropping the per-panel legend; reused for the figure legend
    handles, labels = axs[i][j].get_legend_handles_labels()
    axs[i][j].get_legend().remove()
    axs[i][j].grid(which='both')
    axs[i][j].set_title(dim.title(), fontsize=15)
    # only the left column carries a y-axis label
    if j == 0:
        axs[i][j].set_ylabel("z_score", fontsize=15)
    else:
        axs[i][j].set_ylabel("")

    axs[i][j].set_xlabel("")
    axs[i][j].xaxis.set_tick_params(labelsize=15)

    # advance through the grid row by row
    j += 1
    if j == 2:
        i += 1
        j = 0
fig.legend(handles, labels, bbox_to_anchor=(0.3, 0.99), loc='lower left', ncol=2, prop={'size': 12})
plt.tight_layout()

In [None]:
# correlations between avg and best_sm_method
# Spearman correlation of the human raw scores, per dimension and attribute method
rows = []
for dim in DIMENSIONS:
    best_sm = best_sm_map[dim]
    for am in ATTR_METHODS:
        by_score_method = human_dfs[dim][am]
        scores_1 = by_score_method['avg']['score'].tolist()
        scores_2 = by_score_method[best_sm]['score'].tolist()

        # pairs with a missing value are left out of the correlation
        corr = stats.spearmanr(scores_1, scores_2, nan_policy='omit')
        rows.append([dim, am] + [corr[0], corr[1]])

In [None]:
# one row per (dimension, attribute method): Spearman coefficient and its p-value
corr_df = pd.DataFrame(rows, columns=['dim', 'attr_method', 'sp_corr', 'sp_pval'])

In [None]:
# display the avg-vs-best correlation table
corr_df

In [None]:
# correlations between all, sub, comet
# Pairwise Spearman correlations between the three attribute methods, computed on the
# human z-scores for the last two dimensions.
# Start from an empty list: these 8-field rows must not be appended to the 4-field
# rows left in `rows` by the avg-vs-best correlation cell.
rows = []
for dim in DIMENSIONS[-2:]:
    for sm in SCORE_METHODS:
        # 'axis' scores are blanked for these two dimensions, so there is nothing to correlate
        if sm  == 'axis':
            continue
        scores_1 = human_dfs[dim]['all'][sm]['z_score'].tolist()
        scores_2 = human_dfs[dim]['sub'][sm]['z_score'].tolist()
        # z_score here as well, for consistency with the other two inputs (Spearman is
        # rank-based, so a positive rescaling of the scores leaves the result unchanged)
        scores_3 = human_dfs[dim]['comet'][sm]['z_score'].tolist()

        corr1 = stats.spearmanr(scores_1, scores_2, nan_policy='omit')
        corr2 = stats.spearmanr(scores_1, scores_3, nan_policy='omit')
        corr3 = stats.spearmanr(scores_2, scores_3, nan_policy='omit')

        rows.append([dim, sm, corr1[0], corr1[1], corr2[0], corr2[1],corr3[0], corr3[1]])

In [None]:
# sanity check on the number of rows collected so far
len(rows)

In [None]:
# Columns: '<a>-<b>-c' is the Spearman coefficient between attribute methods a and b,
# '<a>-<b>-p' its p-value.
# NOTE(review): the second column is labelled 'am', but the loop that fills `rows`
# stores the score method (sm) in that position.
a_corr_df = pd.DataFrame(rows, columns=['dim', 'am', 'all-sub-c', 'all-sub-p', 'all-comet-c', 'all-comet-p',\
                                       'sub-comet-c', 'sub-comet-p'])

In [None]:
# display the attribute-method correlation table
a_corr_df

#### Significance tests

In [None]:
# human vs gpt
# independent two-sample t-test on the story z-scores, per dimension and attribute method
rows = []
for dim in DIMENSIONS:
    score_method = best_sm_map[dim]
    for am in ATTR_METHODS:
        hscores = human_dfs[dim][am][score_method]['z_score'].dropna().tolist()
        mscores = gpt_dfs[dim][am][score_method]['z_score'].dropna().tolist()

        outcome = stats.ttest_ind(hscores, mscores)
        tt, pval = outcome

        rows.append([dim, am, tt, pval])

In [None]:
# columns (unnamed): dimension, attribute method, t statistic, p-value
pd.DataFrame(rows)

In [None]:
# matched
# human vs gpt
# paired t-test: z-scores are first averaged per prompt, then human and GPT means are
# paired on the prompts present in both
rows = []
for dim in DIMENSIONS:
    for am in ATTR_METHODS:
        prompt_means = []
        for frames in (human_dfs, gpt_dfs):
            frame = frames[dim][am][best_sm_map[dim]]
            agg = frame.groupby('p_ind')['z_score'].agg(np.nanmean).reset_index()
            prompt_means.append(agg.dropna(subset=['z_score']))
        hagg, magg = prompt_means

        # after the merge, z_score_x is the human mean and z_score_y the GPT mean
        merged = hagg.merge(magg, on='p_ind', how='inner')
        hscores = merged['z_score_x'].dropna().tolist()
        mscores = merged['z_score_y'].dropna().tolist()

        tt, pval = stats.ttest_rel(hscores, mscores)
        rows.append([dim, am, tt, pval])

In [None]:
# columns (unnamed): dimension, attribute method, paired t statistic, p-value
pd.DataFrame(rows)

In [None]:
# male vs female
# human stories only: independent t-test, TP-F z-scores against TP-M z-scores
rows = []
for dim in DIMENSIONS:
    for am in ATTR_METHODS:
        hdf = human_dfs[dim][am][best_sm_map[dim]]

        is_male = hdf['s_pov']=='TP-M'
        is_female = hdf['s_pov']=='TP-F'
        mscores = hdf[is_male]['z_score'].dropna().tolist()
        fscores = hdf[is_female]['z_score'].dropna().tolist()

        # female sample first, so a positive statistic means higher female scores
        tt, pval = stats.ttest_ind(fscores, mscores)
        rows.append([dim, am, tt, pval])

In [None]:
# columns (unnamed): dimension, attribute method, t statistic (TP-F vs TP-M), p-value
pd.DataFrame(rows)

In [None]:
# male vs female
# GPT stories only: independent t-test, TP-F z-scores against TP-M z-scores
rows = []
for dim in DIMENSIONS:
    for am in ATTR_METHODS:
        # the name hdf is kept from the human cell, but this is the GPT frame
        hdf = gpt_dfs[dim][am][best_sm_map[dim]]

        is_male = hdf['s_pov']=='TP-M'
        is_female = hdf['s_pov']=='TP-F'
        mscores = hdf[is_male]['z_score'].dropna().tolist()
        fscores = hdf[is_female]['z_score'].dropna().tolist()

        # female sample first, so a positive statistic means higher female scores
        tt, pval = stats.ttest_ind(fscores, mscores)
        rows.append([dim, am, tt, pval])

In [None]:
# columns (unnamed): dimension, attribute method, t statistic (TP-F vs TP-M), p-value
pd.DataFrame(rows)

## Protagonist groups (Figure 2, Table 3)

In [None]:
# restore matplotlib's default rc settings before drawing the paper figures
plt.rcParams.update(plt.rcParamsDefault)

In [None]:
# Figure 2
# 2x3 grid, one panel per dimension. Each panel overlays two point plots of the mean
# z-score per story PoV and writer: 'comet' attributes (larger markers) and 'sub'
# attributes (smaller, pastel, semi-transparent markers).
fig, axs = plt.subplots(figsize=(12, 6), nrows=2, ncols=3, sharey=True, sharex=True)
i = 0
j = 0

for dim in DIMENSIONS:
    # comet, best
    hdf = human_dfs[dim]['comet'][best_sm_map[dim]]
    mdf = gpt_dfs[dim]['comet'][best_sm_map[dim]]
    
    bdf = pd.concat([hdf, mdf])
    
    # bdf is a fresh concat result, so dropping in place does not touch hdf / mdf
    bdf.dropna(subset=['s_pov', 'z_score'], inplace=True)
    
    sns.pointplot(data=bdf, x='s_pov', y='z_score', hue='writer', order = keep_order,
                join=False,  markers="x", ax=axs[i][j],markersize=20, scale=2.0, errorbar='ci')
    
    
    # sub, best
    hdf = human_dfs[dim]['sub'][best_sm_map[dim]]
    mdf = gpt_dfs[dim]['sub'][best_sm_map[dim]]
    
    bdf = pd.concat([hdf, mdf])
    
    bdf.dropna(subset=['s_pov', 'z_score'], inplace=True)
    
    # drawn on the same axes as the comet plot above
    sns.pointplot(data=bdf, x='s_pov', y='z_score', hue='writer', order = keep_order,
                join=False, markers="x", markersize=17, ax=axs[i][j], \
                scale=1.5, alpha=0.5, palette='pastel')

    axs[i][j].set_title(dim.title(), fontsize=18)
    # only the bottom-middle panel carries the x-axis label
    if i == 1 and j == 1:
        axs[i][j].set_xlabel("story PoV", fontsize=18)
    
    else:
        axs[i][j].set_xlabel("")
    
    # only the left column carries the y-axis label
    if j == 0:
        axs[i][j].set_ylabel("score", fontsize=18)
        axs[i][j].tick_params(axis='y', labelsize=15)
    else:
        axs[i][j].set_ylabel("")
        
    if i == 1:
        axs[i][j].tick_params(axis='x', labelsize=15)

    # read after both plots are drawn, so the entries of both overlays are collected;
    # the values from the last panel are the ones used for the figure legend
    handles, labels = axs[i][j].get_legend_handles_labels()
    axs[i][j].get_legend().remove()
    axs[i][j].grid(which='both')
    # advance through the grid row by row
    j+=1
    if j==3:
        i += 1
        j = 0
    


# assumes four legend entries: two writers from the comet plot, then two from the sub plot
# NOTE(review): the 'sub' entries are labelled '-spacy' — presumably the sub attributes
# come from spaCy; confirm
new_labels = [labels[0] + '-comet', labels[1]+'-comet', labels[2]+'-spacy', labels[3]+'-spacy']
fig.legend(handles, new_labels, bbox_to_anchor=(.5, 1.), loc='lower center', ncol=2, prop={'size': 14})

plt.tight_layout()

In [None]:
# rebinds `order` without 'SP'; the Table 3 cells below use it for their columns
order = ['TP-M', 'TP-F', 'FP', 'Other']

In [None]:
# top-level keys of human_dfs are the dimensions
human_dfs.keys()

In [None]:
# table 3
# per dimension and writer: "mean (std)" of the z-scores for each story PoV in `order`
rows = []
for dim in DIMENSIONS:
    hdf = human_dfs[dim][best_attr][best_sm_map[dim]]
    mdf = gpt_dfs[dim][best_attr][best_sm_map[dim]]

    # human row first, then the gpt-3.5 row
    for writer, frame in (('human', hdf), ('gpt-3.5', mdf)):
        pov_means = frame.groupby('s_pov')['z_score'].agg(np.nanmean)
        pov_stds = frame.groupby('s_pov')['z_score'].agg(np.nanstd)

        cells = ["{:.2f} ({:.2f})".format(pov_means[o], pov_stds[o]) for o in order]
        rows.append([dim, writer] + cells)

In [None]:
# LaTeX source for Table 3. The cells are already formatted "mean (std)" strings, so
# float_format does not affect them.
print(pd.DataFrame(rows, columns=['Dim', 'Writer'] + order).to_latex(float_format='%.3f', index=False))

In [None]:
# table 3 but for sub attributes (not in paper)
# per dimension and writer: "mean (std)" of the z-scores for each story PoV in `order`
rows = []
for dim in DIMENSIONS:
    hdf = human_dfs[dim]['sub'][best_sm_map[dim]]
    mdf = gpt_dfs[dim]['sub'][best_sm_map[dim]]

    # human row first, then the gpt-3.5 row
    for writer, frame in (('human', hdf), ('gpt-3.5', mdf)):
        pov_means = frame.groupby('s_pov')['z_score'].agg(np.nanmean)
        pov_stds = frame.groupby('s_pov')['z_score'].agg(np.nanstd)

        cells = ["{:.2f} ({:.2f})".format(pov_means[o], pov_stds[o]) for o in order]
        rows.append([dim, writer] + cells)

In [None]:
# LaTeX source for the sub-attribute table. The cells are already formatted strings,
# so float_format does not affect them.
print(pd.DataFrame(rows, columns=['Dim', 'Writer'] + order).to_latex(float_format='%.2f', index=False))

## prompt-wise diffs (Figure 3)

In [None]:
import random

In [None]:
# sample n_samples stories for each (PoV, prompt) n_trials times, average mean and std
# Match by prompt AND pov label
def get_pov_prompt_scores(odf, n_samples=2, n_trials=5, rng=None):
    """Resampled mean and std of story z-scores per (story PoV, prompt).

    Rows with a missing ``p_ind``, ``s_pov`` or ``z_score`` are ignored. Stories are
    grouped by story PoV and prompt; groups with fewer than ``n_samples`` scored stories
    are skipped. For every remaining group, ``n_samples`` z-scores are drawn without
    replacement ``n_trials`` times, and the mean and population std (ddof=0) of each
    draw are averaged over the trials.

    Args:
        odf: DataFrame with the columns ``p_ind``, ``s_pov`` and ``z_score``.
        n_samples: number of stories drawn per trial.
        n_trials: number of independent draws per group.
        rng: optional ``random.Random`` instance to draw from, for reproducible
            results; the module-level ``random`` generator is used when omitted.

    Returns:
        DataFrame with the columns ``pov``, ``pid``, ``mean`` and ``std``, one row per
        retained group, in order of first appearance in ``odf``.
    """
    sampler = random if rng is None else rng

    df = odf.dropna(subset=['p_ind', 's_pov', 'z_score'])

    # pov -> prompt id -> z-scores of the stories in that group
    pov2p2scores = defaultdict(lambda: defaultdict(list))
    for pov, pid, score in zip(df['s_pov'], df['p_ind'], df['z_score']):
        pov2p2scores[pov][pid].append(score)

    rows = []
    for pov, p2scores in pov2p2scores.items():
        for pid, all_scores in p2scores.items():
            # too few stories to draw a sample from
            if len(all_scores) < n_samples:
                continue

            means = []
            stds = []
            # The size check above guarantees the draw is valid, so no error handling is
            # needed here (the former except branch re-appended the previous draw).
            for _ in range(n_trials):
                s_s = sampler.sample(all_scores, n_samples)
                means.append(np.nanmean(s_s))
                stds.append(np.nanstd(s_s))
            rows.append([pov, pid, np.nanmean(means), np.nanmean(stds)])
    return pd.DataFrame(rows, columns=['pov', 'pid', 'mean', 'std'])

In [None]:
# Column names after merging a human score frame (left, suffix _x) with a GPT score
# frame (right, suffix _y): map the merge suffixes to explicit writer prefixes.
rename_f = {
    'mean_x': 'human_mean',
    'std_x': 'human_std',
    'mean_y': 'gpt_mean',
    'std_y': 'gpt_std'
}


In [None]:
# Per dimension: resampled human and GPT statistics, matched on (story PoV, prompt).
matched_dfs = []
for dim in DIMENSIONS:
    h_score_df = get_pov_prompt_scores(human_dfs[dim][best_attr][best_sm_map[dim]]).dropna()
    m_score_df = get_pov_prompt_scores(gpt_dfs[dim][best_attr][best_sm_map[dim]]).dropna()

    # inner join keeps only the (pov, prompt) groups present for both writers;
    # rename_f turns the _x/_y merge suffixes into human_*/gpt_* column names
    matched_df = h_score_df.merge(m_score_df, on=['pov', 'pid'], how='inner')
    matched_df['dim'] = dim
    matched_df = matched_df.rename(rename_f, axis='columns')

    # human-minus-GPT differences; the summary and boxplot cells further down read
    # 'mean_diff', which was never created anywhere in the notebook
    matched_df['mean_diff'] = matched_df['human_mean'] - matched_df['gpt_mean']
    matched_df['std_diff'] = matched_df['human_std'] - matched_df['gpt_std']

    matched_dfs.append(matched_df)

In [None]:
# stack the per-dimension frames; the 'dim' column tells them apart
all_matched_pov_pid = pd.concat(matched_dfs)

In [None]:
# number of matched (dimension, pov, prompt) rows
len(all_matched_pov_pid)

In [None]:
# all_matched_pov_pid.to_csv('stats/matched_pov_pid_mean_std.csv', index=False)

# all_matched_pov_pid = pd.read_csv('stats/matched_pov_pid_mean_std.csv')

In [None]:
# matched_pid = h_score_df.merge(m_score_df, on = 'pid', how='inner')

In [None]:
# sample n_samples stories for each prompt n_trials times, average mean and std
# match by prompt only, ignore PoV
def get_prompt_scores(odf, n_samples=2, n_trials=5, rng=None):
    """Resampled mean and std of story z-scores per prompt, ignoring story PoV.

    Rows with a missing ``p_ind`` or ``z_score`` are ignored. Prompts with fewer than
    ``n_samples`` scored stories are skipped. For every remaining prompt, ``n_samples``
    z-scores are drawn without replacement ``n_trials`` times, and the mean and
    population std (ddof=0) of each draw are averaged over the trials.

    Args:
        odf: DataFrame with the columns ``p_ind`` and ``z_score``.
        n_samples: number of stories drawn per trial.
        n_trials: number of independent draws per prompt.
        rng: optional ``random.Random`` instance to draw from, for reproducible
            results; the module-level ``random`` generator is used when omitted.

    Returns:
        DataFrame with the columns ``pid``, ``mean`` and ``std``, one row per retained
        prompt, in order of first appearance in ``odf``.
    """
    sampler = random if rng is None else rng

    df = odf.dropna(subset=['p_ind', 'z_score'])

    # prompt id -> z-scores of its stories
    p2scores = defaultdict(list)
    for pid, score in zip(df['p_ind'], df['z_score']):
        p2scores[pid].append(score)

    rows = []
    for pid, all_scores in p2scores.items():
        # too few stories to draw a sample from
        if len(all_scores) < n_samples:
            continue

        means = []
        stds = []
        # The size check above guarantees the draw is valid, so no error handling is
        # needed here (the former except branch re-appended the previous draw).
        for _ in range(n_trials):
            s_s = sampler.sample(all_scores, n_samples)
            means.append(np.nanmean(s_s))
            stds.append(np.nanstd(s_s))
        rows.append([pid, np.nanmean(means), np.nanmean(stds)])

    return pd.DataFrame(rows, columns=['pid', 'mean', 'std'])

In [None]:
# Per dimension: resampled human and GPT statistics, matched on prompt only.
matched_dfs = []
for dim in DIMENSIONS:
    h_score_df = get_prompt_scores(human_dfs[dim][best_attr][best_sm_map[dim]]).dropna()
    m_score_df = get_prompt_scores(gpt_dfs[dim][best_attr][best_sm_map[dim]]).dropna()

    # inner join keeps only the prompts present for both writers;
    # rename_f turns the _x/_y merge suffixes into human_*/gpt_* column names
    matched_df = h_score_df.merge(m_score_df, on='pid', how='inner')
    matched_df['dim'] = dim
    matched_df = matched_df.rename(rename_f, axis='columns')

    # human-minus-GPT differences; the summary and boxplot cells further down read
    # 'mean_diff', which was never created anywhere in the notebook
    matched_df['mean_diff'] = matched_df['human_mean'] - matched_df['gpt_mean']
    matched_df['std_diff'] = matched_df['human_std'] - matched_df['gpt_std']

    matched_dfs.append(matched_df)

In [None]:
# stack the per-dimension prompt-matched frames; the 'dim' column tells them apart
all_matched_pid = pd.concat(matched_dfs)

In [None]:
# matched_dfs[-5]

In [None]:
# all_matched_pid.to_csv('stats/matched_pid_mean_std.csv', index=False)

# all_matched_pid = pd.read_csv('stats/matched_pid_mean_std.csv')

In [None]:
# mean and std of the per-prompt score difference, per dimension and story PoV
# (reads a 'mean_diff' column, i.e. human_mean - gpt_mean, from the frame)
all_matched_pov_pid.groupby(['dim', 'pov'])['mean_diff'].agg((np.nanmean, np.nanstd))

In [None]:
# mean and std of the per-prompt score difference, per dimension
# (reads a 'mean_diff' column, i.e. human_mean - gpt_mean, from the frame)
all_matched_pid.groupby('dim')['mean_diff'].agg((np.nanmean, np.nanstd))
                            

### boxplot of diffs (Fig 3a)

In [None]:
# all_matched_pov_pid['mean_diff'] = all_matched_pov_pid['human_mean'] - all_matched_pov_pid['gpt_mean']
# all_matched_pov_pid['std_diff'] = all_matched_pov_pid['human_std'] - all_matched_pov_pid['gpt_std']

In [None]:
# 2x3 grid, one panel per dimension: distribution of mean_diff for every story PoV
fig, axs = plt.subplots(nrows = 2, ncols = 3, figsize=(20, 8))

# axs.flat walks the grid row by row, matching the order of DIMENSIONS
for panel, dim in zip(axs.flat, DIMENSIONS):
    in_dim = all_matched_pov_pid['dim']==dim
    ddf = all_matched_pov_pid[in_dim]

    sns.boxplot(data=ddf, x='mean_diff', y='pov', ax = panel)
    panel.set_title(dim.title())
    panel.grid(which='both')
plt.tight_layout()

In [None]:
# horizontal boxplots of the prompt-matched score difference, one box per dimension
plt.figure(figsize=(7, 4))
sns.boxplot(data = all_matched_pid, x='mean_diff', y='dim')
plt.ylabel("Dimension", fontsize=14)
plt.xlabel("Score Difference by Prompt", fontsize=14)
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)
plt.grid()

### Fig 3b: human control

In [None]:
import random

In [None]:
# scratch call: shuffles a throwaway list in place and returns None, so nothing is
# kept; its only lasting effect is advancing the state of the global random generator
random.shuffle([1,2,3])

In [None]:
# split human into two groups
def split_into_groups(hdf):
    """Randomly split the scored stories of each prompt into two halves.

    Rows without a ``z_score`` are dropped first. Only prompts (``p_ind``) with at least
    four remaining stories take part: their story ids (``s_id``) are shuffled and cut at
    the midpoint, so the second half receives the extra story when the count is odd.

    Returns:
        (hdf1, hdf2): the rows of the filtered frame whose ``s_id`` fell into the first
        and the second half respectively.
    """
    scored = hdf.dropna(subset=['z_score'])

    # prompt id -> ids of its scored stories
    stories_by_prompt = defaultdict(list)
    for prompt_id, story_id in zip(scored['p_ind'], scored['s_id']):
        stories_by_prompt[prompt_id].append(story_id)

    first_half, second_half = [], []
    for story_ids in stories_by_prompt.values():
        if len(story_ids) < 4:
            continue
        random.shuffle(story_ids)
        cut = len(story_ids) // 2
        first_half += story_ids[:cut]
        second_half += story_ids[cut:]

    in_first = scored['s_id'].isin(first_half)
    in_second = scored['s_id'].isin(second_half)
    return scored[in_first], scored[in_second]

In [None]:
# Five random splits of the human stories. For each split and dimension, compare the
# per-prompt mean z-score of the first human half against (a) the second human half
# ('control') and (b) the GPT stories ('gpt-3.5').
rows = []
for trial in range(5):
    for dim in DIMENSIONS:
        score_method = best_sm_map[dim]
        hdf = human_dfs[dim]['comet'][score_method]
        hdf1, hdf2 = split_into_groups(hdf)

        mdf = gpt_dfs[dim]['comet'][score_method].dropna(subset=['z_score'])

        # per-prompt mean z-score for each of the three groups
        hagg1, hagg2, magg = [
            frame.groupby(['p_ind'])['z_score'].agg(np.nanmean).reset_index()
            for frame in (hdf1, hdf2, mdf)
        ]

        # the first human half is the reference in both comparisons
        for label, other in (('control', hagg2), ('gpt-3.5', magg)):
            matched = hagg1.merge(other, on=['p_ind'], how='inner')
            matched['mean_diff'] = matched['z_score_x'] - matched['z_score_y']
            rows.append([trial, dim, label, np.nanmean(matched['mean_diff'])])

In [None]:
# one row per (trial, dimension, comparison type) with the mean per-prompt difference
diff_df = pd.DataFrame(rows, columns=['trial', 'dim', 'type', 'diff'])

In [None]:
# Point plot of the mean difference per dimension, aggregated over the trials:
# human-vs-human control in red, human-vs-GPT in blue.
plt.figure(figsize=(7,4))
sns.pointplot(data=diff_df, x = 'diff', y='dim', hue='type', markers='x', join=False, palette={'control': 'red', 'gpt-3.5': 'blue'})
# sns.pointplot(data=diff_df, x = 'diff', y='dim', markers='X', join=False)
plt.grid(which='both')
plt.xlabel("Mean Difference", fontsize=14)
plt.ylabel("", fontsize=14)
plt.xticks(fontsize=14)
plt.legend(loc='best')
plt.yticks(fontsize=14)