Plot the results in a barplot with confidence bounds.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import numpy as np
# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()

# Default figure size, (width, height) in inches; individual figures may override it.
plt.rcParams.update({'figure.figsize': (8, 4)})

# Show figures inline in the notebook and request SVG output from the inline backend.
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# df0 = pd.read_csv("robustness_benchmark_large_sample/result_table_aggregated.csv")
# df1 = pd.read_csv("robustness_benchmark_large_sample/result_table_aggregated_new.csv")
# cols = df0.columns
# print(cols)
# print(len(df0.columns))
# print(len(df1.columns))
# df = pd.concat([df0[cols], df1[cols]]).reset_index(drop=True)
# print(df.head())
# df.to_csv("robustness_benchmark_large_sample/result_table_aggregated_updated.csv", index=False)

In [None]:
# Load the aggregated benchmark results and reduce them to the rows that are
# plotted: zero-shot runs, one designated template per card, and a fixed,
# ordered set of models with display names.
filename = "result_table_aggregated_updated"
df = pd.read_csv(f"../robustness_benchmark_large_sample/{filename}.csv")

# only zero-shot examples
df = df.loc[df['num_demos'] == 0]

# use the given template index for each card
card_prefix = 'cards.robustness.'
card_templates = [('cards.robustness.social_stigma', 0), ('cards.robustness.boolq', 1),
                  ('cards.robustness.pop_qa', 0), ('cards.robustness.social_IG_abuse', 0)]
condition = [np.logical_and(df['card'] == cc[0], df['template_card_index'] == cc[1]) for cc in card_templates]
# .copy(): the columns of this filtered frame are overwritten below; working on
# an explicit copy avoids pandas' chained-assignment (SettingWithCopy) warning.
df = df.loc[np.logical_or.reduce(condition)].copy()
# every retained card name starts with card_prefix; strip it
df['card'] = df['card'].str[len(card_prefix):]

# rename columns
df = df.rename(columns={"model_name_or_path": "model", "card": "dataset"})
# drop the first four characters of each model name
# (presumably a 'bam/' prefix -- TODO confirm against the CSV)
df['model'] = df['model'].str[4:]

# models grouped by provider; the nesting order defines the plotting order,
# the inner values are the display names
models_dict = {"IBM": {"ibm/granite-13b-chat-v2": "Granite-Chat (13B)", "ibm/granite-13b-instruct-v2": "Granite-Instruct (13B)"},
               "Meta": {"meta-llama/llama-2-13b-chat": "Llama2-Chat (13B)", "meta-llama/llama-3-70b-instruct": "Llama3-Instruct (70B)"},
               "MistralAI": {"mistralai/mixtral-8x7b-instruct-v01": "Mixtral-Instruct (8×7B)"},
               "Google": {"google/flan-t5-xxl": "Flan-T5-XXL (11B)", "google/flan-ul2": "Flan-UL2 (20B)"},
               }

# flatten to {raw model id: display name}, keeping the provider order
models_dict_rename = {raw_name: display_name
                      for provider_models in models_dict.values()
                      for raw_name, display_name in provider_models.items()}

print(models_dict_rename)

# raw ids to keep, and the display names in plotting order
models_to_keep = list(models_dict_rename)
model_levels = list(models_dict_rename.values())

print(models_to_keep)

# drop llama-3-8b and any other non-included model (copy for the same reason as above)
df = df.loc[df["model"].isin(models_to_keep)].copy()
# now rename
df["model"] = df["model"].map(models_dict_rename)
# ordered categorical so that sorting follows model_levels rather than the alphabet
df["model"] = pd.Categorical(df["model"], categories=model_levels, ordered=True)
df = df.sort_values(by="model")

# display names for the datasets
card_name = {"social_stigma": "social stigma", "social_IG_abuse": "SIGA", "pop_qa": "PopQA", "boolq": "BoolQ"}
df['dataset'] = df['dataset'].map(card_name)
# drop social stigma since no longer included
df = df.loc[df["dataset"] != "social stigma"]
print(df.head())

In [None]:
# Column-name templates of the metrics to plot; '{}' is replaced by the scoring
# function used for a dataset (accuracy / string_containment).
metric_prefix = ['metric_fixed_group_mean_original_{}',
                 'metric_fixed_group_mean_allvariants_{}',
                 'metric_fixed_group_norm_cohens_h_allvariants_{}',
                 'metric_fixed_group_absval_norm_cohens_h_allvariants_{}',
                 'metric_fixed_group_pdr_allvariants_{}']

# Score-agnostic short names: the first 19 characters ('metric_fixed_group_')
# are dropped and the placeholder is filled with the literal 'score'.
metric_prefix_short = [template[19:].format('score') for template in metric_prefix]
# Panel titles, in the same order as metric_prefix.
pretty_names = ["mean score (original)",
                "mean score (all perturbations)",
                "normalized Cohen's $h$",
                "absolute value normalized Cohen's $h$",
                "PDR"]

# One group per dataset.
dfg = df.groupby('dataset')
# Scoring function per dataset: string containment for PopQA, accuracy otherwise.
score = {kk: 'string_containment' if kk == 'PopQA' else 'accuracy' for kk in dfg.groups.keys()}

# Every metric comes as a value column plus lower/upper confidence-bound columns.
suffixes = ['', '_ci_low', '_ci_high']
# Unified target names; ordered suffix by suffix, metrics within each suffix.
target_cols = [template.format('score')[19:] + suff for suff in suffixes for template in metric_prefix]

# For each dataset keep only the columns of its own scoring function and rename
# them to the unified names so the per-dataset frames can be stacked.
selected = []
for card, dfgg in dfg:
    source_cols = [template.format(score[card]) + suff for suff in suffixes for template in metric_prefix]
    subset = dfgg[['dataset', 'model'] + source_cols]
    selected.append(subset.rename(columns=dict(zip(source_cols, target_cols))))

# Stack the per-dataset frames into one table with a fresh index.
df_sel = pd.concat(selected).reset_index(drop=True)


In [None]:
# Metrics summarised per model.
summary_cols = ['mean_original_score',
                'norm_cohens_h_allvariants_score',
                'absval_norm_cohens_h_allvariants_score']

# Mean of each summary metric per model, taken over that model's rows in df_sel.
df_sel_grp = df_sel.groupby('model')
model_averages_across_data = df_sel_grp[summary_cols].mean()

# https://stackoverflow.com/questions/65070070/highlight-the-best-value-of-each-row-in-python-pandas-to-latex
# print(model_averages_across_data.style.format(precision=3).highlight_min(axis=0, props="font-weight:bold;").to_latex(convert_css=True))

# Print the rows behind each model's averages for inspection.
for _, dfgg in df_sel_grp[summary_cols]:
    print(dfgg)
    
    # print(-0.2/np.pi <= dfgg <= 0.2/np.pi)

In [None]:
# https://stackoverflow.com/questions/75946037/custom-errorbars-for-catplot-with-grouped-bars-in-facets/75952848#75952848

# 3x2 grid of panels sharing the x axis; each panel keeps its own y axis.
fig, axes = plt.subplots(3, 2, figsize=(9, 10), sharex=True, sharey=False, dpi=300)
# Flatten the grid into a 1-D array of axes. An array, unlike the stateful
# `.flat` iterator, can be indexed and looped over several times without any
# dependence on how far an earlier loop advanced it.
axes = axes.ravel()

# number of distinct models and datasets present in the selected data
nmodels = df_sel['model'].nunique()
ncards = df_sel['dataset'].nunique()
# passed as `width=` to the bar plots
w = 0.8 #100 / (nmodels * ncards)

def spectral_palette(n):
    """Return ``n`` equally spaced colors of the "Spectral" palette as hex strings.

    Used to color-code ``n`` levels of a categorical variable.

    Args:
        n: Number of colors; converted with ``int(n)`` and required to lie
            in the range 1..256 (inclusive).

    Returns:
        The seaborn palette produced by ``sns.color_palette("Spectral", n)``
        converted with ``as_hex()``, i.e. a list-like of ``n`` hex color strings.

    Raises:
        ValueError: If ``n`` is outside 1..256.
    """
    n = int(n)
    # Explicit check instead of ``assert``: assertions are skipped under
    # ``python -O``, which would let an invalid n through unnoticed.
    if not 1 <= n <= 256:
        raise ValueError(f"n must be between 1 and 256 (inclusive), got {n}")

    # Function-local import, as in the original helper.
    import seaborn as sns

    return sns.color_palette(palette="Spectral", n_colors=n, as_cmap=False).as_hex()

# Number of models in each provider group of models_dict, in dict order.
family_sizes = [len(vv) for vv in models_dict.values()]

# One palette color per provider group, repeated once for every model in it,
# so all models of the same group share a color.
bar_colors = np.repeat(spectral_palette(len(models_dict)), repeats=family_sizes)

# Models inside a group are told apart by hatch: the first gets no hatch,
# the second '/', the third backslashes.
hatch_patterns_order = ['', '/', '\\\\\\']
hatches_per_card = np.concatenate([hatch_patterns_order[:size] for size in family_sizes])

# Repeat each model's hatch once per dataset so the sequence can be zipped with
# ax.patches (assumes the patches are ordered model by model, one bar per dataset).
hatches_per_cell = np.repeat(hatches_per_card, ncards)
                       
# hatches = np.repeat(('', '/', '','/','\\\\\\','','/','\\\\\\'), ncards)


# Draw one grouped-bar panel per metric: datasets on the x axis, one bar per
# model, and a red vertical line per bar spanning its confidence interval.
for ax, metric, pretty in zip(axes, metric_prefix_short, pretty_names):

    # value and confidence-bound columns of this metric, plus the grouping keys
    error_data = df_sel[[metric + suff for suff in ["", "_ci_low", "_ci_high"]] + ["dataset", "model"]].copy()

    # wide layout for plotting: one row per dataset, one column per model
    data = error_data[[metric, "dataset", "model"]].pivot(index='dataset', columns='model', values=metric)

    # sort the columns so bars are plotted in the same position in each axes
    data = data.sort_index(axis=1)

    # plot the bars for data
    data.plot(kind='bar', ax=ax, rot=0, yticks=np.arange(0, 1.1, .1), title=pretty, width=w,
              edgecolor='black', linewidth=0.2, color=bar_colors,
              fontsize='large')
    ax.title.set_size('large')
    # set hatches
    for bar, hatch in zip(ax.patches, hatches_per_cell):
        bar.set_hatch(hatch)

    # iterate through each bar container (one container per model)
    for c in ax.containers:
        # the container label is the model name
        label = c.get_label()

        # confidence bounds of this model
        eb = error_data[error_data.model.eq(label)]
        if eb.empty:
            continue

        # Put the bounds in the same dataset order as the rows of `data`, so that
        # entry i belongs to bar i of the container. Datasets without a row for
        # this model become NaN and are skipped below. Matching by label instead
        # of by position also keeps bars whose value is exactly 0, which a
        # "skip zero-height bars" filter would drop while their bounds remain.
        eb = eb.set_index("dataset").reindex(data.index)

        # horizontal center of every bar of this model
        x = np.array([bar.get_x() + bar.get_width() / 2 for bar in c])
        ci_low = eb[metric + "_ci_low"].to_numpy(dtype=float)
        ci_high = eb[metric + "_ci_high"].to_numpy(dtype=float)

        # draw the interval only where both bounds are available
        has_ci = np.isfinite(ci_low) & np.isfinite(ci_high)
        if has_ci.any():
            ax.vlines(x[has_ci], ymin=ci_low[has_ci], ymax=ci_high[has_ci], color='r')

    # change the yaxis to percent
    # ax.yaxis.set_major_formatter(PercentFormatter(1))

    # remove the top/right spines to match the catplot look
    ax.spines[['right', 'top']].set_visible(False)

    if metric == 'norm_cohens_h_allvariants_score':
        # Reference lines at +/-0.2/pi (presumably Cohen's "small effect" h = 0.2
        # on the pi-normalized scale -- TODO confirm), drawn only when they fall
        # inside the current y-limits so they do not rescale the panel.
        lims = ax.get_ylim()
        for tt in np.array([0.2, -0.2]) / np.pi:
            if lims[0] <= tt <= lims[1]:
                ax.axhline(y=tt, linewidth=0.5)
    
# Take the legend handles/labels from the first panel; they are reused for the
# single figure-level legend created below.
handles, labels = axes[0].get_legend_handles_labels()
#labels = [lab.split("/") for lab in labels]
#labels = [f'{lab[1]} ({lab[0]})' for lab in labels]

# Remove the per-axes legends so that only the figure-level legend remains.
for ax in axes:
    ax.legend().remove()
# Hide the last panel of the grid.
# https://stackoverflow.com/questions/44980658/remove-the-extra-plot-in-the-matplotlib-subplot
axes[-1].set_visible(False)
# axes[-3].xaxis.set_tick_params(which='both', labelbottom=True, labelsize='large')

# Add one figure-level legend titled 'Model', anchored at figure coordinates (0.95, 0.2).
fig.legend(handles, labels, title='Model', loc='outside right center', frameon=False, bbox_to_anchor=(0.95, 0.2), fontsize='large')
plt.tight_layout()
# Write the figure to disk, then display it in the notebook.
plt.savefig('../figures/metric_summary.png')
plt.show()
