Visualize the trends of the performance drop rate (PDR) and normalized Cohen's h over a grid of original and paraphrase accuracy values.

In [None]:
import sys
import os
import numpy as np
import seaborn as sns
# apply seaborn's default theme to every matplotlib figure created below
sns.set()
import matplotlib.pyplot as plt
import pandas as pd

# default figure size for all plots in this notebook
plt.rcParams.update({'figure.figsize': (8, 4)})

# render figures inline in the notebook, using the SVG format
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# clone unitxt from https://github.com/IBM/unitxt
# NOTE(review): the relative path below is resolved against the current working
# directory and expects the clone under ../../FMEVAL/unitxt — adjust to your checkout
unitxt_src_dir = os.path.realpath(os.path.join("..", "..", "FMEVAL", "unitxt", "src"))

# make the unitxt sources importable from this notebook
sys.path.append(unitxt_src_dir)
# import the metrics from there
from unitxt.metrics import normalized_cohens_h, performance_drop_rate, normalized_hedges_g

In [None]:
# Accuracy grids 0.0, 0.1, ..., 1.0: p1 is used for the "original" score and
# p2 for the "paraphrase" score in the cells below.
p1 = np.round(np.linspace(0, 1, 11), 1)
p2 = np.round(np.linspace(0, 1, 11), 1)

# Accumulators, filled with one entry per (p1, p2) combination.
pp1, pp2 = [], []
cohen, pdr, pstat = [], [], []
hedges, hedges_low, hedges_hi = [], [], []

def proportions_diff(p1, p2, n1, n2):
    """Return the pooled two-proportion test statistic for ``p2 - p1``.

    Args:
        p1: proportion observed in the first sample.
        p2: proportion observed in the second sample.
        n1: size of the first sample.
        n2: size of the second sample.

    Returns:
        ``(p2 - p1)`` divided by the standard error computed from the pooled
        proportion. The denominator is zero when the pooled proportion is
        exactly 0 or 1; that case is not guarded here.
    """
    # proportion of successes when both samples are pooled together
    pooled = (n1*p1 + n2*p2)/(n1 + n2)
    pooled_variance = pooled*(1-pooled)*(1/n1 + 1/n2)
    difference = p2 - p1
    return difference/np.sqrt(pooled_variance)

# Simulation settings for the loop below.
nparaphrase = 5  # number of paraphrase draws per single original draw
nreps = 200      # number of resampling repetitions used for Hedges' g

# Evaluate every metric on each (original accuracy, paraphrase accuracy) pair.
for ii in p1:
    for jj in p2:
        subgroup_scores_dict = {"original": [ii], "paraphrase": [jj]}
        pp1.append(ii)
        pp2.append(jj)
        cohen.append(normalized_cohens_h(subgroup_scores_dict, ["original"], ["paraphrase"]))
        pdr.append(performance_drop_rate(subgroup_scores_dict, ["original"], ["paraphrase"]))
        # one original observation vs. nparaphrase paraphrase observations;
        # use the setting rather than a literal so it stays in sync with the
        # resampling below
        pstat.append(proportions_diff(ii, jj, 1, nparaphrase))

        # for hedges, resample nreps times
        # assume a single original draw and nparaphrase draws with that proportion
        # NOTE: draws come from NumPy's global RNG, which is not seeded here, so
        # the hedges columns differ from run to run
        original = np.random.binomial(n=1, p=ii, size=(nreps,))
        paraphrases = np.random.binomial(n=1, p=jj, size=(nreps, nparaphrase))
        hedges_g_sample = np.array([
            normalized_hedges_g({"original": [aa], "paraphrase": bb.tolist()}, ["original"], ["paraphrase"])
            for aa, bb in zip(original, paraphrases)
        ])
        # mean and a 95% interval (2.5% / 97.5% quantiles) over the resamples
        hedges.append(np.mean(hedges_g_sample))
        hedges_low.append(np.quantile(hedges_g_sample, 0.025))
        hedges_hi.append(np.quantile(hedges_g_sample, 0.975))


# One row per (p1, p2) pair, one column per metric.
df = pd.DataFrame({"p1": pp1, "p2": pp2, "normalized Cohen's h": cohen, "PDR": pdr,
                   "proportion_diff_stat": pstat, "hedges_g": hedges, "hedges_hi": hedges_hi, "hedges_low": hedges_low})
# sign-flipped PDR, used on the plots alongside normalized Cohen's h
df["reverse PDR"] = -1 * df["PDR"]

In [None]:
# Relationship between the two metrics over the whole (p1, p2) grid.
sns.scatterplot(data=df, x="normalized Cohen's h", y="reverse PDR", marker="o")
plt.show()

# One figure per baseline accuracy p1: both metrics as a function of the
# perturbed accuracy p2 (used as the x-axis via the index).
for ii in p1:
    tmp = df.loc[df["p1"] == ii].set_index("p2")
    tmp[["normalized Cohen's h", "reverse PDR"]].plot.line(style=['.-', '.--'], fontsize='large', ms=12)
    plt.title(r'baseline accuracy ($score_i^o$) = ' + f'{ii}', fontsize='large')
    plt.xlabel(r'mean accuracy over perturbations ($score_i^p$)', fontsize='large')
    plt.ylabel('metric value', fontsize='large')
    plt.legend(fontsize='large')
    plt.tight_layout()
    # only the panel for a perfect baseline is written to disk; compare with a
    # tolerance rather than exact float equality
    if np.isclose(ii, 1.0):
        plt.savefig('../figures/pdr_h_comparison.png')
    plt.show()

    


# One figure per perturbed accuracy p2: both metrics as a function of the
# baseline accuracy p1 (used as the x-axis via the index). The title therefore
# reports the fixed perturbed accuracy and the x-axis the baseline accuracy.
for ii in p2:
    tmp = df.loc[df["p2"] == ii].set_index("p1")
    tmp[["normalized Cohen's h", "reverse PDR"]].plot.line(style=['.-', '.--'], fontsize='large', ms=12)
    plt.title(r'mean accuracy over perturbations ($p_{i,1}$) = ' + f'{ii}', fontsize='large')
    plt.legend(fontsize='large')
    plt.xlabel(r'baseline accuracy ($p_{i,0}$)', fontsize='large')
    plt.ylabel('metric value', fontsize='large')
    plt.show()


In [None]:
# Correlation between normalized Cohen's h and (1 - PDR) when the original
# score is fixed at 1.0 and the paraphrase score sweeps over [0, 1].
p = np.linspace(0, 1, num=1000)
grp_dicts = [{"original": [1.0], "paraphrase": [score]} for score in p]


def _metric_over_grid(metric):
    """Evaluate ``metric`` (original vs. paraphrase) on every dict in grp_dicts."""
    return np.array([metric(scores, ["original"], ["paraphrase"]) for scores in grp_dicts])


cohen = _metric_over_grid(normalized_cohens_h)
rev_pdr = 1 - _metric_over_grid(performance_drop_rate)
# last expression of the cell: the 2x2 correlation matrix is shown as output
np.corrcoef(cohen, rev_pdr)



In [None]:
# Worked example: normalized Cohen's h for one pair of scores.
subgroup_scores_dict = {"original": [0.8], "paraphrase": [0.1]}

# direction 0.8 -> 0.1 (original as the control, paraphrase as the comparison)
print(normalized_cohens_h(subgroup_scores_dict, control_subgroup_types := ["original"], ["paraphrase"]) if False else normalized_cohens_h(subgroup_scores_dict, ["original"], ["paraphrase"]))
# same value multiplied by pi
# NOTE(review): presumably this undoes a division by pi inside
# normalized_cohens_h — confirm against the unitxt implementation
print(np.pi * normalized_cohens_h(subgroup_scores_dict, ["original"], ["paraphrase"]))

# direction 0.1 -> 0.8 (roles of the two subgroups swapped)
print(normalized_cohens_h(subgroup_scores_dict, ["paraphrase"], ["original"]))