In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import os

In [None]:
# Experiment 1A: read the cross-task results, ordered by model, target class and sample count.
df = pd.read_csv('../outputs/results_exp1a_cross.csv')
df = df.sort_values(by=['model', 'target_class', 'num_samples'])

# Models named here are dropped from the frame further down.
excluded = ["conll", "mnli", "qnli", "squadv2", "sst2", "stsb", "xsum", "squad"]

# Divide both LM-judge score columns by 10.
cols = ["gpt-4_lm_score", "gpt-3.5-turbo_lm_score"]
df[cols] = df[cols] / 10

# Strip the "fractial_" prefix from the target-class labels.
df["target_class"] = df["target_class"].map(lambda name: name.replace("fractial_", ""))

# Keep only rows whose model is not excluded, then cast the sample counts to int.
df = df[~df["model"].isin(excluded)]
df["num_samples"] = df["num_samples"].map(int)
df

In [None]:
# GPT-4 score for the rows with num_samples == 1000, laid out as a model x target-class grid.
rows_1000 = df.loc[df["num_samples"] == 1000, ["model", "target_class", "gpt-4_lm_score"]]
heatmap_1000 = rows_1000.pivot(index="model", columns="target_class", values="gpt-4_lm_score")

# Annotated heatmap of that grid
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(heatmap_1000, ax=ax, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

In [None]:
# GPT-4 score for the rows with num_samples == 0, laid out as a model x target-class grid.
rows_0 = df.loc[df["num_samples"] == 0, ["model", "target_class", "gpt-4_lm_score"]]
heatmap_0 = rows_0.pivot(index="model", columns="target_class", values="gpt-4_lm_score")

# Annotated heatmap of that grid, also written to disk
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(heatmap_0, ax=ax, annot=True, fmt=".2f", cmap="coolwarm")
plt.savefig("../outputs/plots/exp1a_heatmap_heldout.pdf")
plt.show()

In [None]:
# Heatmap of the per-cell score difference between the num_samples == 1000 grid
# and the num_samples == 0 grid. Cells missing from either grid are NaN after
# the subtraction and are drawn as 0.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap((heatmap_1000 - heatmap_0).fillna(0), annot=True, fmt=".2f", cmap="Spectral", ax=ax)
# Set the labels through the Axes. Assigning strings to plt.xlabel / plt.ylabel
# replaces the pyplot functions with strings and labels nothing.
ax.set_xlabel("Target class")
ax.set_ylabel("Delta Models")
plt.savefig("../outputs/plots/exp1a_heatmap_diff.pdf")
plt.show()

In [None]:
# Metric columns used by the Experiment 1A tables and correlation plot.
target_col = [
    "gpt-4_lm_score",
    "gpt-3.5-turbo_lm_score",
    "rouge1",
    "bertscore",
    "sbertscore",
    "softmaxed_reward_model_score",
]
# Names filtered out of both `model` and `target_class` in the following cells.
excluded = [
    "conll",
    "mnli",
    "qnli",
    "squadv2",
    "sst2",
    "stsb",
    "xsum",
    "squad_v2",
]
df

In [None]:
# (number of rows, number of distinct target classes)
df.shape[0], df["target_class"].nunique(dropna=False)

In [None]:
# Mean of every metric per target class, ignoring excluded models and excluded target classes.
keep = ~df["model"].isin(excluded) & ~df["target_class"].isin(excluded)
df[keep].groupby("target_class")[target_col].mean()

In [None]:
# Spearman correlation between the metrics, computed inside each target class and
# then averaged over target classes; expressed in percent.
kept_rows = df[~df["model"].isin(excluded) & ~df["target_class"].isin(excluded)]
per_class_corr = kept_rows.groupby("target_class")[target_col].corr(method="spearman")
corr = (
    per_class_corr
    .reset_index()
    .groupby("level_1")[target_col]
    .mean()
    .reindex(target_col)[target_col]
    * 100
)
# Non-zero entries strictly above the diagonal are hidden in the plot.
matrix = np.triu(corr, k=1)

display(corr)

# Rows and columns must be in the same order for the shared tick labels to be valid.
assert (corr.columns == corr.index).all()
clean_cols = ["GPT4", "GPT3.5", "ROUGE", "BScore", "SBERT", "RM"]
print(corr.columns, clean_cols )
corr.index.name = ""

# Temporarily enlarge the default font for this figure only.
plt.rcParams.update({'font.size': 22})
ax = sns.heatmap(
    corr,
    xticklabels=clean_cols,
    yticklabels=clean_cols,
    mask=matrix,
    annot=True,
    fmt=".0f",
    vmin=0,
    vmax=100,
    cmap="coolwarm",
)
plt.xticks(rotation=45)
plt.yticks(rotation=0)

text_items = [ax.title, ax.xaxis.label, ax.yaxis.label, *ax.get_xticklabels(), *ax.get_yticklabels()]
for item in text_items:
    item.set_fontsize(22)

plt.savefig("../outputs/plots/exp1a_corr.pdf", bbox_inches='tight')
# Restore the smaller default font size for later figures.
plt.rcParams.update({'font.size': 10})

In [None]:
# Number of distinct (model, num_samples) combinations in the frame.
print(len(df.groupby(["model", "num_samples"]).count()))

# Mean of each metric per target class over non-excluded target classes with fewer
# than 3000 samples, best GPT-4 score first. Both conditions go into ONE mask:
# indexing the filtered frame with a mask built from the unfiltered `df` relies
# on pandas re-aligning a mismatched boolean key.
exp1a_rows = df[~df.target_class.isin(excluded) & (df.num_samples < 3000)]
exp1a = (
    exp1a_rows
    .groupby(["target_class"])[target_col]
    .mean()
    .sort_values("gpt-4_lm_score", ascending=False)
)

# Rename by column NAME rather than by position. A positional list of display
# names raises a length-mismatch error whenever `target_col` does not contain
# exactly those columns, and silently mislabels them if the order changes.
# Columns absent from `target_col` are simply not renamed.
exp1a = exp1a.rename(columns={
    "gpt-4_lm_score": "GPT-4",
    "gpt-3.5-turbo_lm_score": "GPT-3.5",
    "rouge1": "ROUGE-1",
    "bertscore": "BERTScore",
    "sbertscore": "SBERT",
    "reward_model_score": "RM",
    "softmaxed_reward_model_score": "Soft RM",
    "rougeL": "ROUGE-L",
})
exp1a.transpose()

In [None]:
# Compact view of the Experiment 1A table: metrics as rows, a fixed selection of
# target classes as columns, values rounded to two decimals.
metric_rows = ["ROUGE-1", "GPT-4", "GPT-3.5", "BERTScore", "SBERT", "RM", "Soft RM"]
class_cols = ["logic", "code", "rewrite", "extract", "memoryanswer", "write"]
exp1a[metric_rows].T[class_cols].apply(lambda column: round(column, 2))

In [None]:
# LaTeX source for the compact Experiment 1A table (same selection as the view above it).
latex_metric_rows = ["ROUGE-1", "GPT-4", "GPT-3.5", "BERTScore", "SBERT", "RM", "Soft RM"]
latex_class_cols = ["logic", "code", "rewrite", "extract", "memoryanswer", "write"]
compact_table = exp1a[latex_metric_rows].T[latex_class_cols].apply(lambda column: round(column, 2))
print(compact_table.to_latex(float_format="%.2f"))

In [None]:
# LaTeX source for the full Experiment 1A table, two decimals per value.
exp1a_latex = exp1a.to_latex(float_format="%.2f")
print(exp1a_latex)

# Experiment 1B: Custom datasets

In [None]:
from matplotlib.lines import Line2D
# Black proxy artists for a shared legend: solid, dashed and dotted stand for the three metrics.
line1 = Line2D(xdata=[0], ydata=[0], color='k', label='Exact Match')
line2 = Line2D(xdata=[0], ydata=[0], color='k', linestyle='dashed', label='Custom Score')
line3 = Line2D(xdata=[0], ydata=[0], color='k', linestyle='dotted', label='GPT-4 Score')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment 1B: per-dataset results, ordered by model and sample count.
df2 = pd.read_csv('../outputs/results_exp1b_standard_datasets.csv').sort_values(by=['model', 'num_samples'])

# target_class is a plain copy of the model column.
df2["target_class"] = df2.model

# Divide both LM-judge score columns by 10.
cols = ["gpt-4_lm_score", "gpt-3.5-turbo_lm_score"]
for col in cols:
    df2[col] = df2[col]/10

# Normalise num_samples BEFORE filtering on it. The "only" marker is mapped to
# 50000 so that the numeric comparison below is well defined; done after the
# filter, the mapping could never take effect.
df2.num_samples = df2.num_samples.apply(lambda x: int(x) if x != "only" else 50000)
df2 = df2[df2.num_samples < 5000]

# Drop the rows whose file name contains "mnli_0_0" or "mnli_0_1".
df2 = df2[df2.file.apply(lambda x: "mnli_0_0" not in x )]
# .copy() gives later cells an independent frame instead of a view-like slice.
df2 = df2[df2.file.apply(lambda x: "mnli_0_1" not in x )].copy()

In [None]:
# Display the prepared Experiment 1B frame.
df2

In [None]:
# Line colour per Experiment 1B dataset; the plotting loops draw one subplot per key.
label2col = dict(
    conll="red",
    mnli="blue",
    sst2="yellow",
    qnli="green",
    squadv2="orange",
    xsum="gray",
)

# Metric columns of interest for Experiment 1B.
target_col = [
    "rouge1", "gpt-4_lm_score", "gpt-3.5-turbo_lm_score",
    "bertscore", "exact_match", "sbertscore",
    "reward_model_score", "softmaxed_reward_model_score", "custom_score",
]

# Figure 1: one panel per dataset with exact match (solid), custom score (dashed)
# and GPT-4 score (dotted) against the number of samples.
fig, axs = plt.subplots(ncols=3, nrows=2)
fig.tight_layout()

metric_styles = (
    ("exact_match", None),
    ("custom_score", 'dashed'),
    ("gpt-4_lm_score", 'dotted'),
)

for i, (cat, colour) in enumerate(label2col.items()):
    panel = axs[i // 3][i % 3]
    data_df = df2[(df2.model == cat) & (df2.target_class == cat)].sort_values(by="num_samples")

    for metric, dash in metric_styles:
        if dash is None:
            # Only the solid line carries the dataset label.
            sns.lineplot(x="num_samples", y=metric, data=data_df, label=cat, color=colour, ax=panel)
        else:
            sns.lineplot(x="num_samples", y=metric, data=data_df, linestyle=dash, color=colour, ax=panel)

    panel.set_xscale('symlog')
    panel.set(xlabel=cat, ylabel='')
    panel.get_legend().remove()

# Snapshot of the Experiment 1B frame; `df2` is rebound by later cells.
df2_save = df2.copy()

# Line-style legend built from the proxy artists.
handles = [line1, line2, line3]
plt.legend(handles=handles, loc='upper left')

# NOTE(review): the pyplot calls below act on the current axes only (the last
# panel created), not on every panel -- confirm that this is intended.
plt.xscale('symlog')
plt.ylim((0, 1))
plt.savefig("../outputs/plots/exp1b_1.pdf", bbox_inches='tight')
plt.show()

# Figure 2: same grid as figure 1 but only custom score (dashed) and GPT-4 score (dotted).
fig, axs = plt.subplots(ncols=3, nrows=2)
fig.tight_layout()

for i, (cat, colour) in enumerate(label2col.items()):
    panel = axs[i // 3][i % 3]
    cat_rows = df2[(df2.model == cat) & (df2.target_class == cat)].sort_values(by="num_samples")

    sns.lineplot(x="num_samples", y="custom_score", data=cat_rows,
                 label=cat, linestyle='dashed', color=colour, ax=panel)
    sns.lineplot(x="num_samples", y="gpt-4_lm_score", data=cat_rows,
                 linestyle='dotted', color=colour, ax=panel)

    panel.set_xscale('symlog')
    panel.set(xlabel=cat, ylabel='')
    panel.get_legend().remove()

# Line-style legend built from the proxy artists.
handles = [line3, line2]
plt.legend(handles=handles, loc='upper left')

# NOTE(review): plt.ylim only affects the current (last) panel.
plt.ylim((0, 1))
plt.savefig("../outputs/plots/exp1b_2.pdf")
plt.show()

In [None]:
# Shape of the subplot grid returned by the most recent plt.subplots call.
axs.shape

In [None]:
# (column name, display label) pairs for the Experiment 1B metric-correlation plots.
# Keeping each column next to its label prevents the two lists from drifting apart.
metric_labels = [
    ("rouge1", "ROUGE-1"),
    ("gpt-4_lm_score", "GPT4"),
    ("gpt-3.5-turbo_lm_score", "GPT3.5"),
    ("bertscore", "BScore"),
    ("sbertscore", "SBert"),
    ("softmaxed_reward_model_score", "RM"),
    ("custom_score", "Human"),
]
target_col = [column for column, _ in metric_labels]
clean_target_col = [label for _, label in metric_labels]

# Spearman correlation between the metrics over all rows of df2.
df2[target_col].corr(method="spearman")

In [None]:
# Ten random rows, restricted to the ROUGE-1 and custom-score columns.
df2[["rouge1", "custom_score"]].sample(10)

In [None]:
# NOTE: despite its name, `excluded` lists the datasets that are KEPT in this cell.
excluded = ["conll", "mnli", "qnli", "sst2", "stsb", "squadv2"]
in_scope = df2["model"].isin(excluded) & df2["target_class"].isin(excluded)

# Spearman correlation between the metrics over all selected rows.
corr = df2.loc[in_scope, target_col].corr(method="spearman")
# Non-zero entries on and above the diagonal are hidden in the plot.
matrix = np.triu(corr)

display(corr)
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    mask=matrix,
    vmin=0,
    vmax=1,
    cmap="coolwarm",
)

In [None]:
# NOTE: despite its name, `excluded` lists the datasets that are KEPT in this cell.
excluded = ["conll", "mnli", "qnli", "sst2", "stsb", "squadv2"]
few_sample_rows = df2[
    df2["model"].isin(excluded)
    & df2["target_class"].isin(excluded)
    & (df2["num_samples"] < 100)
]

# --- Plot 1: correlation over all selected rows pooled together ---
corr = few_sample_rows[target_col].corr(method="spearman").reindex(target_col)[target_col]
matrix = np.triu(corr, k=1)  # hide non-zero entries strictly above the diagonal
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

sns.heatmap(
    corr,
    xticklabels=clean_target_col,
    yticklabels=clean_target_col,
    annot=True,
    mask=matrix,
    vmin=0,
    vmax=1,
    cmap="coolwarm",
)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.savefig("../outputs/plots/exp1b_metrics_corr_total.pdf")
plt.show()

# --- Plot 2: correlation computed per target class, then averaged ---
plt.figure()
per_class_corr = few_sample_rows.groupby("target_class")[target_col].corr(method="spearman")
corr = (
    per_class_corr
    .reset_index()
    .groupby("level_1")[target_col]
    .mean()
    .reindex(target_col)[target_col]
)
matrix = np.triu(corr, k=1)
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

sns.heatmap(
    corr,
    xticklabels=clean_target_col,
    yticklabels=clean_target_col,
    annot=True,
    mask=matrix,
    vmin=0,
    vmax=1,
    cmap="coolwarm",
)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.savefig("../outputs/plots/exp1b_metrics_corr_mean.pdf")
plt.show()

plt.figure()

In [None]:
# NOTE: despite its name, `excluded` lists the datasets that are KEPT in this cell.
# NOTE(review): both savefig calls below write to the same paths as the cell that
# plots the num_samples < 100 correlations, so whichever cell runs last wins --
# confirm which figures are meant to be kept.
excluded = ["conll", "mnli", "qnli", "sst2", "stsb", "squadv2"]
zero_sample_rows = df2[
    df2["model"].isin(excluded)
    & df2["target_class"].isin(excluded)
    & (df2["num_samples"] == 0)
]

# --- Plot 1: correlation over all selected rows pooled together ---
corr = zero_sample_rows[target_col].corr(method="spearman").reindex(target_col)[target_col]
matrix = np.triu(corr, k=1)  # hide non-zero entries strictly above the diagonal
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

sns.heatmap(
    corr,
    xticklabels=clean_target_col,
    yticklabels=clean_target_col,
    annot=True,
    mask=matrix,
    vmin=0,
    vmax=1,
    cmap="coolwarm",
)
plt.savefig("../outputs/plots/exp1b_metrics_corr_total.pdf")
plt.show()

# --- Plot 2: correlation computed per target class, then averaged ---
plt.figure()
per_class_corr = zero_sample_rows.groupby("target_class")[target_col].corr(method="spearman")
corr = (
    per_class_corr
    .reset_index()
    .groupby("level_1")[target_col]
    .mean()
    .reindex(target_col)[target_col]
)
matrix = np.triu(corr, k=1)
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

sns.heatmap(
    corr,
    xticklabels=clean_target_col,
    yticklabels=clean_target_col,
    annot=True,
    mask=matrix,
    vmin=0,
    vmax=1,
    cmap="coolwarm",
)
plt.xticks(rotation=45)
plt.savefig("../outputs/plots/exp1b_metrics_corr_mean.pdf")
plt.show()

plt.figure()

In [None]:
# How many per-target-class correlation values exist for each metric pair at num_samples == 0.
zero_rows_selected = df2[
    df2["model"].isin(excluded)
    & df2["target_class"].isin(excluded)
    & (df2["num_samples"] == 0)
]
(
    zero_rows_selected
    .groupby("target_class")[target_col]
    .corr(method="spearman")
    .reset_index()
    .groupby("level_1")[target_col]
    .count()
)

In [None]:
target_col = [
    "rouge1", "gpt-4_lm_score", "gpt-3.5-turbo_lm_score", "bertscore",
    "sbertscore", "reward_model_score", "softmaxed_reward_model_score", "custom_score",
]

# NOTE: despite its name, `excluded` lists the datasets that are KEPT here.
excluded = ["sst2", "mnli", "qnli"]
selected = df2["model"].isin(excluded) & df2["target_class"].isin(excluded)

# Per-target-class metric means at num_samples == 0 (a) and num_samples == 1000 (b).
a = df2[selected & (df2.num_samples == 0)].groupby("target_class")[target_col].mean()
b = df2[selected & (df2.num_samples == 1000)].groupby("target_class")[target_col].mean()

b

In [None]:
# Relative change from a to b for every metric, averaged over target classes.
relative_change = ((b - a) / a).mean(axis=0)
# The custom-score change is kept separately as a reference value.
ref = relative_change["custom_score"]

relative_change

In [None]:
# Xsum only: Spearman correlation between the metrics on the xsum rows.
target_col = ["rouge1",
              "gpt-4_lm_score",
              "gpt-4_mean_score_ratio",
              "gpt-3.5-turbo_lm_score",
              "bertscore",
              "sbertscore",
              "reward_model_score",
              "softmaxed_reward_model_score"
              ]
# NOTE: despite its name, `excluded` lists the datasets that are KEPT here.
excluded = ["xsum"]
corr = df2[(df2.model.apply(lambda x: x  in excluded)) & (df2.target_class.apply(lambda x: x in excluded)) & (df2.num_samples>-1)][target_col].corr(method="spearman").reindex(target_col)[target_col]
# Non-zero entries on and above the diagonal are hidden in the plot.
matrix = np.triu(corr)

sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.index,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")
# A filter fragment had been pasted after this call; plt.show() returns None,
# so that trailing expression was never a meaningful filter and is removed.
plt.show()

In [None]:
# Pearson correlation between the metrics for num_samples < 50, computed per target
# class and then averaged. `excluded` holds the datasets that are kept.
under_50_rows = df2[
    df2["model"].isin(excluded)
    & df2["target_class"].isin(excluded)
    & (df2["num_samples"] < 50)
]
(
    under_50_rows
    .groupby("target_class")[target_col]
    .corr(method="pearson")
    .reset_index()
    .groupby("level_1")[target_col]
    .mean()
)

In [None]:
target_col = [
    "rouge1", "gpt-4_lm_score", "gpt-3.5-turbo_lm_score",
    "bertscore", "exact_match", "sbertscore",
    "reward_model_score", "softmaxed_reward_model_score", "custom_score",
]

# One Spearman-correlation heatmap per dataset, using its num_samples == 1000 rows.
# NOTE: the loop variable rebinds the global `excluded` to a single dataset name.
excludeds = ["conll", "mnli", "qnli", "squadv2", "sst2", "stsb", "xsum"]
for excluded in excludeds:
    own_rows = (df2.model == excluded) & (df2.target_class == excluded) & (df2.num_samples == 1000)
    corr = df2.loc[own_rows, target_col].corr(method="spearman")
    # Non-zero entries on and above the diagonal are hidden.
    matrix = np.triu(corr)
    plt.figure(figsize=(10, 10))

    print("Excluded:", excluded)
    sns.heatmap(
        corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns,
        annot=True,
        mask=matrix,
        cmap="coolwarm",
    )
    plt.show()

In [None]:
# Mean of every metric in target_col per target class, over all rows of df2.
df2.groupby("target_class")[target_col].mean()

In [None]:
# Print the "gpt-3.5-turbo_responses" value of the first xsum row.
print(df2[df2.target_class=="xsum"]["gpt-3.5-turbo_responses"].iloc[0])

In [None]:
# Scatter of custom score against GPT-4 score, one point per row of df2.
sns.scatterplot(x="custom_score", y="gpt-4_lm_score", data=df2)

In [None]:
# Number of rows for each distinct (custom_score, gpt-4_lm_score) pair.
df_counts = df2.groupby(['custom_score', 'gpt-4_lm_score']).size().reset_index(name='count')
plt.figure()
# Bubble plot: marker size encodes how many rows share the pair.
sns.scatterplot(df_counts, x='custom_score', y='gpt-4_lm_score', size="count", sizes=(50, 200))

plt.figure()
# NOTE(review): the density is estimated from the distinct pairs in df_counts;
# the 'count' column is not passed as a weight -- confirm this is intended.
sns.kdeplot(data=df_counts,  x='custom_score', y='gpt-4_lm_score', fill=True, levels=10)

In [None]:
# Display the current df2 frame.
df2

In [None]:
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from importlib import reload
# Re-execute the pyplot module. NOTE(review): presumably this restores pyplot
# attributes that were overwritten earlier in the session -- confirm.
plt=reload(plt)

# Synthetic-dataset results, ordered by model and sample count.
df = pd.read_csv('../outputs/results_exp1b_synthetic_datasets.csv').sort_values(by=['model', 'num_samples'])

# Base datasets, in the order of the subplot columns drawn later in this cell.
datasets  = ["sst2", "mnli", "conll"]

def task_mapper(x):
    """Translate a raw task name into its data-source label.

    The base dataset is looked up with `mapper(x)`; `x` must then be exactly one
    of "<dataset>", "synth<dataset>", "synth<dataset>rand" or
    "synth<dataset>bootstrapped", which map to "Manual", "Synthetic", "Random"
    and "Bootstrapped" respectively.

    Raises:
        ValueError: propagated from `mapper` when no known dataset is in `x`.
        KeyError: when `x` is not one of the four recognised spellings.
    """
    dataset = mapper(x)
    recognised_names = (
        dataset,
        "synth" + dataset,
        "synth" + dataset + "rand",
        "synth" + dataset + "bootstrapped",
    )
    source_labels = ("Manual", "Synthetic", "Random", "Bootstrapped")
    return dict(zip(recognised_names, source_labels))[x]

def mapper(x):
    """Return the base dataset whose name occurs inside `x`.

    The candidates are checked in the order "sst2", "conll", "mnli" and the
    first one found as a substring of `x` is returned.

    Raises:
        ValueError: when none of the candidates occurs in `x`; `x` is printed
            first so the offending value is visible.
    """
    for dataset in ("sst2", "conll", "mnli"):
        if dataset in x:
            return dataset
    print(x)
    raise ValueError

# Derive the base dataset ("model") and the data-source label ("task") from the raw task name.
raw_task = df["task"]
df["model"] = raw_task.map(mapper)
df["task"] = raw_task.map(task_mapper)

# target_class is a plain copy of the model column.
df["target_class"] = df["model"]

# Divide the GPT-4 judge score by 10.
cols = ["gpt-4_lm_score"]
df[cols] = df[cols] / 10

# Two rows of panels (filled by the plotting loop below), one column per dataset.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(6, 3))
fig.tight_layout()

# Line colour per data source.
label2col = dict(
    Manual="black",
    Synthetic="red",
    Random="blue",
    Bootstrapped="green",
)

def font_up(ax):
    """Set the title and both axis labels of `ax` to font size 13."""
    for item in (ax.title, ax.xaxis.label, ax.yaxis.label):
        item.set_fontsize(13)


# One column of panels per dataset: GPT-4 score on top, exact match below,
# one line per data source (task).
for i, model in enumerate(datasets):
    top_ax, bottom_ax = axs[0][i], axs[1][i]

    # NOTE: this rebinds the notebook-level name `df2` to the rows of one dataset.
    df2 = df[df.model == model]
    zero_rows = df2[df2.num_samples == 0]
    # Reference levels at num_samples == 0: mean GPT-4 score over the
    # non-bootstrapped tasks, and mean exact match of the bootstrapped task.
    base = zero_rows[zero_rows.task != "Bootstrapped"]["gpt-4_lm_score"].mean()
    base_boot = zero_rows[zero_rows.task == "Bootstrapped"]["exact_match"].mean()
    # x position of the vertical marker: 50 for conll, 100 otherwise.
    yline = 100 if model != "conll" else 50
    print(base, base_boot)

    for task in df2.task.unique():
        # Work on a copy so that the .loc assignment below writes to this frame
        # and not to a slice of df2.
        tmp_df = df2[df2.task == task].copy()

        if task == "Bootstrapped":
            # xmin/xmax and ymin/ymax are axes fractions in [0, 1]; the defaults
            # already span the whole panel.
            top_ax.axhline(y=base, color='black', linestyle="dashdot", alpha=0.3)
            # Use yline directly: the old `yline if not "conll" else 50` tested a
            # non-empty string literal, so it always evaluated to 50.
            top_ax.axvline(x=yline, color='black', linestyle="dashdot", alpha=0.3)

        # Every task's curve starts from the shared zero-sample reference score.
        tmp_df.loc[tmp_df.num_samples == 0, "gpt-4_lm_score"] = base
        sns.lineplot(data=tmp_df, x="num_samples", y="gpt-4_lm_score",
                     color=label2col[task], legend=None, errorbar=None, ax=top_ax)

        top_ax.set_ylim(0.5, 1)
        top_ax.set_xlim(0, 1000)
        top_ax.set_xscale('symlog')
        top_ax.set(xlabel="", ylabel="GPT4 Score" if i==0 else "")

        if task == "Bootstrapped":
            bottom_ax.axhline(y=base_boot, color='black', linestyle="dashdot", alpha=0.3)
            bottom_ax.axvline(x=yline, color='black', linestyle="dashdot", alpha=0.3)

        sns.lineplot(data=tmp_df, x="num_samples", y="exact_match",
                     color=label2col[task], legend=None, errorbar=None, ax=bottom_ax)

        bottom_ax.set_xlim(0, 1000)
        bottom_ax.set_ylim(-0.02, 1)
        bottom_ax.set_xscale('symlog')
        bottom_ax.set(xlabel=r'$N$', ylabel="Exact Match" if i==0 else "")
        top_ax.set_title(model)

        font_up(top_ax)
        font_up(bottom_ax)


from matplotlib.lines import Line2D

# Shared legend above the top-left panel. The proxy colours follow label2col:
# black = Manual (H), red = Synthetic (S), blue = Random (R), green = Bootstrapped (S+H).
legend_ax = axs[0][0]
handles, labels = legend_ax.get_legend_handles_labels()
line1 = Line2D(xdata=[0], ydata=[0], color='black', label='H')
line2 = Line2D(xdata=[0], ydata=[0], color='red', label='S')
line3 = Line2D(xdata=[0], ydata=[0], color='blue', label='R')
line4 = Line2D(xdata=[0], ydata=[0], color='green', label='S+H')
handles += [line1, line2, line3, line4]
legend_ax.legend(
    handles=handles,
    fontsize=14,
    ncol=4,
    loc='lower left',
    bbox_to_anchor=(0.2, 1.13),
    borderaxespad=0,
    frameon=False,
)

plt.savefig("../outputs/plots/sym_exp1.pdf", bbox_inches='tight')

In [None]:
# GPT-4 score, sample count and dataset for the rows whose task label is "Manual".
df[df.task == "Manual"][["gpt-4_lm_score", "num_samples", "model"]]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# "neg" variant of the standard-dataset results, ordered by model and sample count.
df2 = pd.read_csv('../outputs/results_exp1b_standard_datasets_neg.csv')
df2 = df2.sort_values(by=['model', 'num_samples'])

# target_class is a plain copy of the model column.
df2["target_class"] = df2["model"]

# Divide the GPT-4 judge score by 10 and cast the sample counts to int.
cols = ["gpt-4_lm_score"]
df2[cols] = df2[cols] / 10
df2["num_samples"] = df2["num_samples"].map(int)

# GPT-4 score against the number of samples, one line per model.
neg_ax = sns.lineplot(data=df2, x="num_samples", y="gpt-4_lm_score", hue="model")
neg_ax.set_xscale("symlog")
neg_ax.set_ylim((0, 1))
neg_ax.figure.savefig("../outputs/plots/sym_exp1_neg.pdf")

In [None]:
# Display the frame loaded from the "neg" results file.
df2

In [None]:
# Mean GPT-4 score at num_samples == 1000 per model (rows) and negtask (columns).
rows_at_1000 = df2[df2.num_samples == 1000]
tmp = (
    rows_at_1000
    .groupby(["model", "num_samples", "negtask"])["gpt-4_lm_score"]
    .mean()
    .reset_index()
    .pivot(index="model", columns="negtask", values="gpt-4_lm_score")
)
# Extra columns: per-model mean at num_samples == 0, and per-model mean at 1000 over all negtasks.
tmp["Base"] = df2[df2.num_samples == 0].groupby(["model"])["gpt-4_lm_score"].mean()
tmp["Average"] = rows_at_1000.groupby(["model"])["gpt-4_lm_score"].mean()

table_columns = ["Base", "sst2", "conll", "mnli", "Average"]
print(tmp[table_columns].to_latex(float_format="%.2f"))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Standard-dataset results for all models, ordered by model and sample count.
df2 = pd.read_csv('../outputs/results_exp1b_standard_datasets_all.csv').sort_values(by=['model', 'num_samples'])

# target_class is a plain copy of the model column.
df2["target_class"] = df2.model

# Divide the GPT-4 judge score by 10.
cols = ["gpt-4_lm_score"]
for col in cols:
    df2[col] = df2[col]/10

# Normalise num_samples BEFORE filtering on it. The "only" marker is mapped to
# 50000 so that the numeric comparison below is well defined; done after the
# filter, the mapping could never take effect.
df2.num_samples = df2.num_samples.apply(lambda x: int(x) if x != "only" else 50000)
# .copy() gives later cells an independent frame instead of a view-like slice.
df2 = df2[df2.num_samples < 5000].copy()

In [None]:
# Display the frame loaded from the all-models results file.
df2

In [None]:
# Datasets drawn as subplot columns (the colour values are not used by the grid loop below).
label2col = dict(conll="red", mnli="blue", sst2="green")

# Line colour per base model; each model gets one subplot row.
model2col = dict(llama="red", falcon="blue", bloom="green", pythia="orange")

target_col = ["rouge1", "gpt-4_lm_score", "exact_match", "custom_score"]

# 4 model rows x 3 dataset columns.
fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(6, 6))
fig.tight_layout()

def font_up(ax):
    """Set the title and both axis labels of `ax` to font size 14."""
    for item in (ax.title, ax.xaxis.label, ax.yaxis.label):
        item.set_fontsize(14)


# Rows are models, columns are datasets. Each panel shows exact match (solid),
# custom score (dashed) and GPT-4 score (dotted) in the model's colour.
for j, model in enumerate(["pythia", "falcon", "bloom", "llama"]):
    for i, cat in enumerate(label2col.keys()):
        panel = axs[j][i]
        colour = model2col[model]
        data_df = df2[(df2.task == cat) & (df2.model == model)].sort_values(by="num_samples")

        # Horizontal reference: mean GPT-4 score at num_samples == 0.
        ref_h = data_df[data_df.num_samples == 0]["gpt-4_lm_score"].mean()
        panel.axhline(y=ref_h, color='black', linestyle='dashdot', alpha=0.3)

        sns.lineplot(x="num_samples", y="exact_match", data=data_df,
                     label=cat, color=colour, ax=panel)
        sns.lineplot(x="num_samples", y="custom_score", data=data_df,
                     linestyle='dashed', color=colour, ax=panel)
        sns.lineplot(x="num_samples", y="gpt-4_lm_score", data=data_df,
                     linestyle='dotted', color=colour, ax=panel)

        panel.set_xlim(0, 1000)
        panel.set_xscale('symlog')
        # Only the bottom row gets an x label and only the first column a y label.
        panel.set(xlabel=r'$N$' if j==3 else '', ylabel=model.capitalize() if i==0 else '')
        if j==0:
            panel.set_title(cat)
        panel.get_legend().remove()

        font_up(panel)

# if False:
# # old llama data
#     label2col = {
#         "conll": "red",
#         "mnli": "blue",
#         "sst2": "green",
#         "qnli": "yellow",
#         "squadv2": "orange",
#         # "stsb": "black",
#         "xsum": "gray"
#     }
#     for i, cat in enumerate(["squadv2", "qnli", "xsum"]):
#
#         ref_h = df2_save[(df2_save.model == cat) & (df2_save.target_class == cat) & (df2_save.num_samples == 0)]["gpt-4_lm_score"].mean()
#
#         axs[4][i%3].axhline(y=ref_h, color='black', linestyle='doshdat')
#
#
#         sns.lineplot(x="num_samples", y="exact_match",
#                      data=df2_save[(df2_save.model == cat) & (df2_save.target_class == cat)].sort_values(by="num_samples"), label=cat, color=label2col[cat], ax=axs[4][i%3])
#
#
#         sns.lineplot(x="num_samples", y="custom_score",
#                      data=df2_save[(df2_save.model == cat) & (df2_save.target_class == cat)].sort_values(by="num_samples"), label=cat, linestyle='dashed', color=label2col[cat], ax=axs[4][i%3])
#
#         sns.lineplot(x="num_samples", y="gpt-4_lm_score",
#                      data=df2_save[(df2_save.model == cat) & (df2_save.target_class == cat)].sort_values(by="num_samples"), linestyle='dotted', color=label2col[cat], ax=axs[4][i%3])
#
#
#
#         axs[4][i%3].set_xlim(0, 1000)
#         axs[4][i%3].set_xscale('symlog')
#         axs[4][i%3].set(xlabel=cat.upper(), ylabel=model.capitalize() if i==0 else '')
#         axs[4][i%3].get_legend().remove()


from matplotlib.lines import Line2D

# Line-style legend placed above the top-left panel.
line1 = Line2D(xdata=[0], ydata=[0], color='k', label='Exact Match')
line2 = Line2D(xdata=[0], ydata=[0], color='k', linestyle='dashed', label='Human Score')
line3 = Line2D(xdata=[0], ydata=[0], color='k', linestyle='dotted', label='GPT4 Score')

handles = [line1, line2, line3]
axs[0][0].legend(
    handles=handles,
    fontsize=12,
    ncol=3,
    loc='lower left',
    bbox_to_anchor=(-0.1, 1.26),
    borderaxespad=0.,
    frameon=False,
)

# NOTE(review): these two pyplot calls act on the current axes only, not on
# every panel of the grid -- confirm that this is intended.
plt.xscale('symlog')
plt.ylim((0, 1))
plt.savefig("../outputs/plots/exp1b_all.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Mean GPT-4 score at num_samples == 0 in the saved Experiment 1B frame for dataset `cat`.
# NOTE: `cat` is whatever value the last plotting loop left behind.
saved_zero_rows = df2_save[
    (df2_save.model == cat)
    & (df2_save.target_class == cat)
    & (df2_save.num_samples == 0)
]
saved_zero_rows["gpt-4_lm_score"].mean()

# Experiment 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Experiment 2: custom score against num_bins, one line per test set.
df = pd.read_csv("../outputs/results_exp2_ins_diversity.csv")
exp2_ax = sns.lineplot(x="num_bins", y="custom_score", hue="test_data", data=df)
exp2_ax.set_xscale("log")
exp2_ax.set_ylim((0, 1))
exp2_ax.figure.savefig("../outputs/plots/exp2.pdf")

In [None]:
# Summary statistics of the custom score per test set.
df.groupby("test_data")["custom_score"].describe()

In [None]:
# Distribution of the custom score per test set.
ax = sns.boxplot(x="test_data", y="custom_score", data=df)
ax.set_xlabel('Task')
ax.set_ylabel('Mean Score')
ax.set_ylim((0, 0.9))
ax.figure.savefig("../outputs/plots/exp2_boxplot.pdf")

In [None]:
# Pearson correlation between the custom score and num_bins.
df[["custom_score", "num_bins"]].corr("pearson")

# Experiment 3

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Scaling results. NOTE(review): originally annotated as "SST2 results (0-shot)";
# confirm which tasks the file actually contains.
PATH = "../outputs/results_exp3_scaling.csv"
df = pd.read_csv(PATH) # all rows are kept; no per-bin aggregation is applied here
# Optional outlier trimming per num_samples bin is defined below but not applied in this cell.

# Helper for trimming outliers inside one group of rows. With the defaults it
# keeps the central 80% (10th to 90th percentile) of `custom_score`.
def filter_quantiles(group, lower=0.10, upper=0.90, column='custom_score'):
    """Keep the rows of `group` whose `column` value lies between two quantiles.

    Args:
        group: DataFrame holding the rows of one group.
        lower: Lower quantile in [0, 1]; rows below it are dropped.
        upper: Upper quantile in [0, 1]; rows above it are dropped.
        column: Name of the column the quantiles are computed on.

    Returns:
        The subset of `group` with `column` inside the inclusive range
        [quantile(lower), quantile(upper)].
    """
    values = group[column]
    low_cut = values.quantile(lower)
    high_cut = values.quantile(upper)
    return group[(values >= low_cut) & (values <= high_cut)]

# Mean custom score per num_samples bin (no quantile trimming is applied).
# NOTE(review): the following cell saves to the same path and will overwrite this figure.
exp3_bar_ax = sns.barplot(x="num_samples", y="custom_score", data=df, label="custom_score")
exp3_bar_ax.set_ylim((0.6, 0.9))
exp3_bar_ax.figure.savefig("../outputs/plots/exp3.pdf")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# TODO: add another category
# Scaling results. NOTE(review): originally annotated as "SST2 results (0-shot)",
# but the plot below groups by task -- confirm the file's contents.
PATH = "../outputs/results_exp3_scaling.csv"
df = pd.read_csv(PATH) # all rows are kept; no per-bin aggregation is applied here
# Divide the GPT-4 judge score by 10.
df["gpt-4_lm_score"] /= 10
# Optional outlier trimming per num_samples bin is defined below but not applied in this cell.
# Helper for trimming outliers inside one group of rows. With the defaults it
# keeps the central 80% (10th to 90th percentile) of `custom_score`.
def filter_quantiles(group, lower=0.10, upper=0.90, column='custom_score'):
    """Keep the rows of `group` whose `column` value lies between two quantiles.

    Args:
        group: DataFrame holding the rows of one group.
        lower: Lower quantile in [0, 1]; rows below it are dropped.
        upper: Upper quantile in [0, 1]; rows above it are dropped.
        column: Name of the column the quantiles are computed on.

    Returns:
        The subset of `group` with `column` inside the inclusive range
        [quantile(lower), quantile(upper)].
    """
    values = group[column]
    low_cut = values.quantile(lower)
    high_cut = values.quantile(upper)
    return group[(values >= low_cut) & (values <= high_cut)]

def _pick_score(row):
    """Return the row's custom score, or its GPT-4 score when the custom score is -1."""
    return row["gpt-4_lm_score"] if row["custom_score"] == -1 else row["custom_score"]


# Unified score column (no quantile trimming is applied).
df["score"] = df.apply(_pick_score, axis=1)

# Leave out the 1000 / 5000 / 10000 sample bins and the xsum task.
dropped = df.num_samples.isin([1000, 5000, 10000]) | df.task.isin(["xsum"])
df = df[~dropped]

# Score distribution per num_samples bin, split by task.
ax = sns.boxplot(x="num_samples", y="score", hue="task", data=df, width=0.5)
ax.set_xlabel('# of training samples')
ax.set_ylabel('Mean Score')
ax.set_ylim((0.6, 1))
# NOTE(review): this overwrites the bar-plot version saved to the same path by the previous cell.
ax.figure.savefig("../outputs/plots/exp3.pdf")


In [None]:
# Non-null counts per (task, num_samples), with xsum rows left out.
df[~df.task.isin(["xsum"])].groupby(["task", "num_samples"]).count()

In [None]:
# Non-null counts of every column per num_samples bin.
df.groupby("num_samples").count()

In [None]:
# Number of non-null custom scores per num_samples bin, as a flat table.
df.groupby(["num_samples"])["custom_score"].count().reset_index()

In [None]:
# Rows with num_samples == 1000. NOTE: the Experiment 3 box-plot cell removes the
# 1000 / 5000 / 10000 bins from df, so this is empty when the notebook is run in order.
df[df.num_samples == 1000]

# Experiment 4

In [None]:
from typing import List

def smooth(scalars: List[float], weight: float) -> List[float]:  # Weight between 0 and 1
    """Exponentially smooth a sequence of values.

    Each output is ``previous_output * weight + (1 - weight) * current_input``, with
    the first input used as the initial "previous output" (so the first output
    equals the first input).

    Args:
        scalars: Values in plotting order. Any iterable works, including a pandas
            Series whose index does not start at 0; the values are read by position.
        weight: Smoothing weight; 0 returns the input unchanged, values closer to 1
            smooth more strongly.

    Returns:
        A new list with one smoothed value per input value; ``[]`` for empty input.
    """
    # Materialise first: positional access below must not depend on index labels.
    values = list(scalars)
    if not values:
        return []

    last = values[0]  # Anchor on the first value (first timestep)
    smoothed = []
    for point in values:
        last = last * weight + (1 - weight) * point  # Blend previous smoothed value with the new point
        smoothed.append(last)

    return smoothed

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

# Load the Exp 4 results and keep only the sst2 task.
PATH = "../outputs/results_exp4_ift_noift.csv"
df = pd.read_csv(PATH)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the frame returned by read_csv (SettingWithCopyWarning).
df = df[df.task == "sst2"].copy()
df["gpt-4_lm_score"] = df["gpt-4_lm_score"]/10
# Drop the first character of every model name.
df.model = df.model.apply(lambda x: x[1:])
# Quick look: solid lines for ift == "ift", dashed lines for ift == "no_ift".
sns.lineplot(data=df[df.ift == "ift"], x="num_samples", y="gpt-4_lm_score",  hue="model")
sns.lineplot(data=df[df.ift == "no_ift"], x="num_samples", y="gpt-4_lm_score",  hue="model", linestyle="dashed")
# symlog keeps num_samples == 0 on the axis.
plt.xscale("symlog")
plt.show()


# Line colour per model name; the Exp 4 figures look it up as label2col[cat],
# where cat iterates over df.model.unique().
# NOTE(review): "xsum" is used as a task name elsewhere in this notebook; presumably
# this entry is a leftover -- verify before removing.
label2col = {
"llama": "red",
"falcon": "blue",
"bloom": "green",
"pythia": "orange",
"opt": "yellow",
# "stsb": "black",
"xsum": "gray"
}


# Imported here so the cell runs on a fresh kernel: the legend proxies below need Line2D.
from matplotlib.lines import Line2D

fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(4.5, 3))
fig.tight_layout()

# (value of df.ift, linestyle). Must stay in sync with the legend proxies below:
# solid = 'Alpaca', dotted = 'Shelf', dashed = 'Base'.
variant_styles = [("ift", "solid"), ("iftr", "dotted"), ("no_ift", "dashed")]

for i, cat in enumerate(df.model.unique()):
    ax = axs[i//2][i%2]  # 2x2 grid filled row by row; assumes at most four models

    # Horizontal reference: mean score of the "ift" variant at num_samples == 0.
    ref_h = df[(df.model == cat) & (df.ift == "ift") &  (df.num_samples == 0)]["gpt-4_lm_score"].mean()
    ax.axhline(y=ref_h, color='black', linestyle='dashdot', alpha=0.3)

    for variant, linestyle in variant_styles:
        sns.lineplot(x="num_samples", y="gpt-4_lm_score",
                     data=df[(df.model == cat) & (df.ift == variant)].sort_values(by="num_samples"),
                     label=cat, color=label2col[cat], linestyle=linestyle, errorbar=None, ax=ax)

    ax.set_xscale('symlog')
    # axs[i//2][i%2].set_xlim((0, 100))
    ax.set(xlabel=cat.capitalize(), ylabel='GPT-4 Score' if i%2 == 0 else '')
    # Per-panel legends are replaced by the single shared legend built below.
    ax.get_legend().remove()

# Proxy artists: the legend encodes linestyle only, not model colour.
line1 = Line2D([0], [0], label='Alpaca', color='k')
line2 = Line2D([0], [0], label='Base', color='k', linestyle='dashed')
line3 = Line2D([0], [0], label='Shelf', color='k', linestyle='dotted')

handles= [line1, line2, line3]
axs[0][0].legend(handles=handles, loc='lower right', bbox_to_anchor=(1.0, 0.0), frameon=False)

plt.savefig("../outputs/plots/exp4.pdf", bbox_inches='tight')

In [None]:
# Imported here so the cell runs on a fresh kernel: the legend proxies below need Line2D.
from matplotlib.lines import Line2D

fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(4.5, 3))
fig.tight_layout()

smoothing_factor = 0.1
# (value of df.ift, linestyle). The "iftr" curve is drawn 'dotted' so that it matches
# its legend entry ('Shelf', dotted) defined below.
variant_styles = [("ift", "solid"), ("iftr", "dotted"), ("no_ift", "dashed")]

for i, cat in enumerate(df.model.unique()):
    ax = axs[i//2][i%2]  # 2x2 grid filled row by row; assumes at most four models

    for variant, linestyle in variant_styles:
        # Median score per training-set size, then exponentially smoothed along num_samples.
        tdf = df[(df.model == cat) & (df.ift == variant)].groupby("num_samples")["gpt-4_lm_score"].median().reset_index().sort_values(by="num_samples")
        tdf["gpt-4_lm_score"] = smooth(tdf["gpt-4_lm_score"], smoothing_factor)

        sns.lineplot(x="num_samples", y="gpt-4_lm_score",
                     data=tdf, label=cat, color=label2col[cat], linestyle=linestyle, errorbar=None, ax=ax)

    ax.set_xscale('symlog')
    # axs[i//2][i%2].set_ylim(0, 1)
    # axs[i//2][i%2].set_xlim((0, 100))
    ax.set(xlabel=cat.capitalize(), ylabel='')
    # Per-panel legends are replaced by the single shared legend built below.
    ax.get_legend().remove()


# Proxy artists: the legend encodes linestyle only, not model colour.
line1 = Line2D([0], [0], label='Alpaca', color='k')
line2 = Line2D([0], [0], label='Base', color='k', linestyle='dashed')
line3 = Line2D([0], [0], label='Shelf', color='k', linestyle='dotted')

handles= [line1, line2, line3]
axs[0][0].legend(handles=handles, loc='lower right', bbox_to_anchor=(1.0, 0.0), frameon=False)

plt.savefig("../outputs/plots/exp4_all_old.pdf", bbox_inches='tight')



# Imported here so the cell runs on a fresh kernel: the legend proxies below need Line2D.
from matplotlib.lines import Line2D


def font_up(ax):
    """Set the title and both axis labels of ``ax`` to font size 12."""
    for item in (ax.title, ax.xaxis.label, ax.yaxis.label):
        item.set_fontsize(12)


fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(4.5, 3))
fig.tight_layout()

smoothing_factor = 0.1
for i, cat in enumerate(df.model.unique()):
    ax = axs[i//2][i%2]  # 2x2 grid filled row by row; assumes at most four models

    # "ift" variant: median per training-set size, exponentially smoothed.
    tdf = df[(df.model == cat) & (df.ift == "ift")].groupby("num_samples")["gpt-4_lm_score"].median().reset_index().sort_values(by="num_samples")
    tdf["gpt-4_lm_score"] = smooth(tdf["gpt-4_lm_score"],smoothing_factor)

    # Horizontal reference: smoothed "ift" score at num_samples == 0
    # (raises IndexError if this model has no row with num_samples == 0).
    ref_h = tdf[tdf["num_samples"] == 0]["gpt-4_lm_score"].values[0]
    ax.axhline(y=ref_h, color='black', linestyle='dashdot', alpha=0.3)

    sns.lineplot(x="num_samples", y="gpt-4_lm_score",
                 data=tdf, label=cat, color=label2col[cat], linestyle='solid', errorbar=None, ax=ax)

    # "no_ift" variant, processed the same way and drawn dashed.
    tdf = df[(df.model == cat) & (df.ift == "no_ift")].groupby("num_samples")["gpt-4_lm_score"].median().reset_index().sort_values(by="num_samples")
    tdf["gpt-4_lm_score"] = smooth(tdf["gpt-4_lm_score"], smoothing_factor)

    sns.lineplot(x="num_samples", y="gpt-4_lm_score",
                 data=tdf, label=cat, color=label2col[cat], linestyle='dashed', errorbar=None, ax=ax)

    ax.set_xscale('symlog')
    # axs[i//2][i%2].set_ylim(0, 1)
    # axs[i//2][i%2].set_xlim((0, 100))
    ax.get_legend().remove()
    # Only the bottom row gets an x label; the shared y label is added with fig.text below.
    ax.set(xlabel=r'$N$' if i//2==1 else '', ylabel='')

    ax.set_title(cat.capitalize())
    font_up(ax)

# Shared y-axis label for the whole figure.
o = fig.text(-0.01, 0.5, 'GPT4 Score', va='center', rotation='vertical')
o.set_fontsize(12)
# Proxy artists: the legend encodes linestyle only, not model colour.
line1 = Line2D([0], [0], label='Alpaca', color='k')
line2 = Line2D([0], [0], label='Base Model', color='k', linestyle='dashed')

handles= [line1, line2]
# axs[0][0].legend(handles=handles, loc='lower right', bbox_to_anchor=(1.0, 0.0), frameon=False)
axs[0][0].legend(fontsize=12, handles=handles, loc='lower left', bbox_to_anchor=(0.25, 1.13), ncol=2, borderaxespad=0., frameon=False)

plt.savefig("../outputs/plots/exp4_all.pdf", bbox_inches='tight')

In [None]:
tdf[tdf["num_samples"] == 0]["gpt-4_lm_score"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

# Alpaca vs. LLaMA results, sst2 task only: custom_score against training-set size.
PATH = "../outputs/results_exp4_alpaca_llama.csv"
df = pd.read_csv(PATH)
# df["gpt-4_lm_score"] = df["gpt-4_lm_score"]/10
# .copy() so the column assignment below does not write into a slice of the loaded frame.
df = df[df.task == "sst2"].copy()
# Drop the first character of every model name.
df.model = df.model.apply(lambda x: x[1:])
sns.lineplot(data=df, x="num_samples", y="custom_score",  hue="model")
# sns.lineplot(data=df, x="num_samples", y="gpt-4_lm_score",  hue="model", linestyle="dashed")
# sns.lineplot(data=df, x="num_samples", y="exact_match", label="exact_match")

plt.xscale("log")
# NOTE(review): several cells in this notebook save to this same path, so the file
# on disk is whatever the last executed cell produced.
plt.savefig("../outputs/plots/exp4.pdf")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

# Alpaca vs. LLaMA results, squadv2 task only: custom_score against training-set size.
PATH = "../outputs/results_exp4_alpaca_llama.csv"
df = pd.read_csv(PATH)
# df["gpt-4_lm_score"] = df["gpt-4_lm_score"]/10
# .copy() so the column assignment below does not write into a slice of the loaded frame.
df = df[df.task == "squadv2"].copy()
# Drop the first character of every model name.
df.model = df.model.apply(lambda x: x[1:])
sns.lineplot(data=df, x="num_samples", y="custom_score",  hue="model")
# sns.lineplot(data=df, x="num_samples", y="gpt-4_lm_score",  hue="model", linestyle="dashed")
# sns.lineplot(data=df, x="num_samples", y="exact_match", label="exact_match")

plt.xscale("log")
# NOTE(review): several cells in this notebook save to this same path, so the file
# on disk is whatever the last executed cell produced.
plt.savefig("../outputs/plots/exp4.pdf")

In [None]:
# Backup results file, xsum task only: rouge1 (solid) and GPT-4 score / 10 (dashed).
PATH = "../outputs/results_exp4_alpaca_llama_backup.csv"
df = pd.read_csv(PATH)
df["gpt-4_lm_score"] = df["gpt-4_lm_score"]/10
# .copy() so the column assignment below does not write into a slice of the loaded frame.
df = df[df.task == "xsum"].copy()
# Drop the first character of every model name.
df.model = df.model.apply(lambda x: x[1:])
sns.lineplot(data=df, x="num_samples", y="rouge1",  hue="model")
sns.lineplot(data=df, x="num_samples", y="gpt-4_lm_score",  hue="model", linestyle="dashed")
# sns.lineplot(data=df, x="num_samples", y="exact_match", label="exact_match")

plt.xscale("log")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

# Box plot of custom_score per training-set size, one box per model, over all rows of the file.
PATH = "../outputs/results_exp4_alpaca_llama.csv"
df = pd.read_csv(PATH)
# Drop the first character of every model name and capitalise the rest.
df.model = df.model.apply(lambda x: x[1:].capitalize())
ax = sns.boxplot(data=df, x="num_samples", y="custom_score",  hue="model")
# plt.legend(handles=handles, loc='upper left')
ax.set(xlabel='# of training samples', ylabel='Mean Score')
# sns.lineplot(data=df, x="num_samples", y="exact_match", label="exact_match")
# NOTE(review): other cells in this notebook save to this same path.
plt.savefig("../outputs/plots/exp4.pdf")

# plt.xscale("log")

In [None]:
df.groupby(["num_samples", "model"]).count()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

PATH = "../outputs/results_exp4_alpaca_llama.csv"
df = pd.read_csv(PATH)
# Drop the first character of every model name and capitalise the rest.
df.model = df.model.apply(lambda x: x[1:].capitalize())

# One panel per task, side by side.
fig, axs = plt.subplots(ncols=3, figsize=(15, 3))
fig.tight_layout()

for panel, task_name in zip(axs, ("sst2", "xsum", "squadv2")):
    sns.boxplot(data=df[df.task == task_name], x="num_samples", y="custom_score",
                hue="model", ax=panel, width=0.6)
    # The task name doubles as the x-axis label; no y label and no per-panel legend.
    panel.set_xlabel(task_name)
    panel.set_ylabel('')
    panel.get_legend().remove()

plt.savefig("../outputs/plots/exp4.pdf")

# Exp 0-a

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

# Load the Exp 0-a summarisation results and display the frame.
PATH = "../outputs/results_exp0_sum.csv"
df = pd.read_csv(PATH)
df

In [None]:
# Metric columns of df that are compared in the Exp 0-a correlation analysis.
target_col = ["rouge1",
              "gpt-4_lm_score",
              # "gpt-4_mean_score_ratio",
              # "gpt-3.5-turbo_lm_score",
              "bertscore",
              "sbertscore",
              # "reward_model_score",
              # "softmaxed_reward_model_score",
              "litepyramid_recall"]

# Display labels for the heatmap ticks; must stay in the same order (and length) as target_col.
clean_target_col = ["ROUGE-1",
              "GPT-4",
              # "gpt-4_mean_score_ratio",
              # "GPT-3.5",
              "BERTScore",
              "SBert",
              # "RM",
              # "Soft RM",
              "Human"]

In [None]:
# Spearman rank correlation between every pair of metrics in target_col, over all rows.
corr = df[target_col].corr(method="spearman")
# Upper triangle above the diagonal; passed as the heatmap mask below so that
# (non-zero) entries there are hidden and each pair is shown once.
matrix = np.triu(corr, k=1)
# plot the heatmap
# Rows and columns must be in the same order because one label list is used for both axes.
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
k = 15
df.iloc[k].model_summary

In [None]:
df.iloc[k][target_col]

In [None]:
sns.scatterplot(df, x='litepyramid_recall', y='gpt-4_lm_score')

In [None]:
df.groupby("model").describe()[target_col]

In [None]:
# Pick one random row with litepyramid_recall > 0.8 and show its metric values and texts.
# NOTE(review): no random_state is passed, so a different row is shown on every run.
sample = df[df.litepyramid_recall > 0.8].sample(1).iloc[0]
display(sample[target_col])
print(sample["model_summary"])
print(sample["ref_summary"])
print("-----------")
print(sample["source"])

# Exp 0B

In [None]:
# Metric columns of df compared in the Exp 0B correlation analyses (11 entries, human_score last).
target_col = ["rouge1",
              "rougeL",
              "gpt-4_lm_score",
              "gpt-4_mean_score_ratio",
              "gpt-3.5-turbo_lm_score",
              "gpt-3.5-turbo_mean_score_ratio",
              "bertscore",
              "sbertscore",
              "reward_model_score",
              "softmaxed_reward_model_score",
              "human_score"]

# Display labels for the heatmap ticks; must stay in the same order (and length) as target_col.
clean_target_col = ["ROUGE-1", "ROUGE-L",
              "GPT-4",
              "GPT-4 Ratio",
              "GPT-3.5",
              "GPT-3.5 Ratio",
              "BERTScore",
              "SBert",
              "RM",
              "Soft RM",
              "Human"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

# Load the Exp 0B summarisation results (replaces the df loaded for Exp 0-a).
PATH = "../outputs/results_exp0b_sum.csv"
df = pd.read_csv(PATH)

In [None]:
df

In [None]:
# Number of rows for each (human_score, gpt-4_lm_score) combination.
df_counts = df.groupby(['human_score', 'gpt-4_lm_score']).size().reset_index(name='count')
plt.figure()
# Marker size encodes how many rows share that score combination.
sns.scatterplot(df_counts, x='human_score', y='gpt-4_lm_score', size="count", sizes=(50, 200))

plt.figure()
# NOTE(review): this density is estimated from the distinct score combinations in
# df_counts and does not use the "count" column -- confirm whether an estimate
# over df (or weighted by count) was intended.
sns.kdeplot(data=df_counts,  x='human_score', y='gpt-4_lm_score', fill=True, levels=10)


In [None]:
# Spearman rank correlation between every pair of metrics in target_col, over all rows.
corr = df[target_col].corr(method="spearman")
# Upper triangle above the diagonal; passed as the heatmap mask below so that
# (non-zero) entries there are hidden and each pair is shown once.
matrix = np.triu(corr, k=1)
# plot the heatmap
# Rows and columns must be in the same order because one label list is used for both axes.
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
import random

policies = list(df.policy.unique())
# df["random_group"] = df.apply(lambda x: f"{x.policy}_{x.id[-1]}", axis=1)
# 10,000 draws of 10 distinct rows each; one Spearman matrix per draw.
# NOTE(review): no random seed is set, so the averaged matrix differs between runs.
l = []
for i in range(10000):
    l.append(df.sample(10, replace=False)[target_col].corr("spearman"))

# Stack all matrices, then average the rows that belong to the same metric
# and restore the target_col ordering on both axes.
corr = pd.concat(l)
corr.index.name = "scorer"
corr = corr.groupby("scorer").mean().reindex(target_col)[target_col]

# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col)
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
            xticklabels=clean_target_col,
            yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
# Summary table: column "s1" holds each metric's correlation with human_score,
# taken from whatever `corr` currently is (the matrix computed by the last executed heatmap cell).
s0 = pd.DataFrame()
s0["s1"] = corr[["human_score"]]
s0

In [None]:
import random

policies = list(df.policy.unique())
# df["random_group"] = df.apply(lambda x: f"{x.policy}_{x.id[-1]}", axis=1)
# 1,000 draws: pick one policy at random, sample 10 of its rows with replacement,
# and compute one Spearman matrix per draw.
# NOTE(review): no random seed is set, so the averaged matrix differs between runs.
l = []
for i in range(1000):
    pol = random.choice(policies)
    l.append(df[df.policy == pol].sample(10, replace=True)[target_col].corr("spearman"))

# Stack all matrices, then average the rows that belong to the same metric
# and restore the target_col ordering on both axes.
corr = pd.concat(l)
corr.index.name = "scorer"
corr = corr.groupby("scorer").mean().reindex(target_col)[target_col]

# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col)
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
            xticklabels=clean_target_col,
            yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
# Column "s3": correlation of each metric with human_score from the current `corr`.
s0["s3"] = corr[["human_score"]]

In [None]:
# One Spearman matrix per policy, then the element-wise mean of those matrices.
# After reset_index() the metric-name level of the index becomes the column "level_1".
corr = df.groupby("policy")[target_col].corr(method="spearman").reset_index().groupby("level_1")[target_col].mean().reindex(target_col)[target_col]
# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col)
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
# Average every metric per id first, then correlate those per-id means.
corr = df.groupby("id")[target_col].mean().corr(method="spearman")
# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
# One Spearman matrix per policy, then the element-wise mean of those matrices.
# NOTE(review): this computes the same `corr` expression as an earlier cell in this section.
corr = df.groupby("policy")[target_col].corr(method="spearman").reset_index().groupby("level_1")[target_col].mean().reindex(target_col)[target_col]
# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
            xticklabels=clean_target_col,
            yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
# One Spearman matrix per id (across that id's rows), then the element-wise mean of those matrices.
corr = df.groupby("id")[target_col].corr(method="spearman").reset_index().groupby("level_1")[target_col].mean().reindex(target_col)[target_col]
# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
# Column "s2": correlation of each metric with human_score from the current `corr`.
s0["s2"] = corr[[ "human_score"]]
s0

In [None]:
import random

# df["random_group"] = df.apply(lambda x: f"{x.policy}_{x.id[-1]}", axis=1)
# 100 repetitions: assign every row to a random sub-group of its policy
# ("<policy>_<1..10>"), average the metrics per sub-group, and compute one
# Spearman matrix over those sub-group means.
# NOTE(review): no random seed is set, so the averaged matrix differs between runs.
l = []
for i in range(100):
    df["random_group"] = df.apply(lambda x: f"{x.policy}_{random.randint(1, 100//10)}", axis=1)

    l.append(df.groupby("random_group")[target_col].mean().corr(method="spearman"))

# Stack all matrices, then average the rows that belong to the same metric
# and restore the target_col ordering on both axes.
corr = pd.concat(l)
corr.index.name = "scorer"
corr = corr.groupby("scorer").mean().reindex(target_col)[target_col]
# `corr` is already the averaged correlation matrix here; it is plotted as is
# (correlating it a second time would show correlations between matrix columns,
# not between the metrics).
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
import random
policies = list(df.policy.unique())
# df["random_group"] = df.apply(lambda x: f"{x.policy}_{x.id[-1]}", axis=1)
# 1,000 draws: each draw is the mean of every metric over 10 rows sampled with replacement.
# `corr` starts as the list of those mean vectors and is turned into a DataFrame below.
# NOTE(review): no random seed is set, so the result differs between runs.
corr = []
for i in range(1000):
    # df["random_group"] = df.apply(lambda x: f"{x.policy}_{random.randint(1, 100//10)}", axis=1)
    # df["random_group"] = df.apply(lambda x: f"_{x.id[-2:]}", axis=1)
    # corr.append(df[df.policy == policies[random.randint(0, len(policies) - 1)]].sample(replace=True, n=20)[target_col].mean())
    corr.append(df.sample(replace=True, n=10)[target_col].mean())
corr = pd.DataFrame(corr)
# Spearman correlation between metrics across the 1,000 bootstrap means.
corr = corr.corr(method="spearman")
# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
policies

In [None]:
pd.DataFrame(corr)

In [None]:
s0

In [None]:
# df["random_group"] = df.apply(lambda x: f"_{x.id[-2:]}", axis=1)
# Average every metric per policy first, then correlate those per-policy means
# (one data point per policy).
corr = df.groupby("policy")[target_col].mean().corr(method="spearman")
# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
import random
policies = list(df.policy.unique())
# df["random_group"] = df.apply(lambda x: f"{x.policy}_{x.id[-1]}", axis=1)
doc_ids = df.id.unique()  # loop-invariant, so computed once
# 100 repetitions: resample 100 document ids with replacement, average every metric
# per policy over the resampled rows, and correlate the metrics across the first
# four policies (one data point per policy).
# NOTE(review): no random seed is set, so the averaged matrix differs between runs.
l = []
for i in range(100):
    sampled_docs = random.choices(doc_ids, k=100)
    # Build the resampled frame with a single concat; an id drawn several times
    # contributes its rows several times.
    tmp_df = pd.concat([df[df.id.isin([doc])] for doc in sampled_docs])
    corr = [tmp_df[tmp_df.policy == policies[j]][target_col].mean() for j in range(4)]
    l.append(pd.DataFrame(corr).corr(method="spearman"))
# Stack all matrices, then average the rows that belong to the same metric
# and restore the target_col ordering on both axes.
corr = pd.concat(l)
corr.index.name = "scorer"
corr = corr.groupby("scorer").mean().reindex(target_col)[target_col]

# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
import random
policies = list(df.policy.unique())
# df["random_group"] = df.apply(lambda x: f"{x.policy}_{x.id[-1]}", axis=1)
doc_ids = df.id.unique()  # loop-invariant, so computed once
# l[j] collects, for the j-th of the first four policies, one vector of metric
# means per bootstrap repetition.
# NOTE(review): no random seed is set, so the result differs between runs.
l = [[], [], [], []]
for i in range(1000):
    sampled_docs = random.choices(doc_ids, k=100)
    # Build the resampled frame with a single concat; an id drawn several times
    # contributes its rows several times.
    tmp_df = pd.concat([df[df.id.isin([doc])] for doc in sampled_docs])
    for j in range(4):
        l[j].append((tmp_df[tmp_df.policy == policies[j]][target_col].mean()))

# One Spearman matrix per policy (across its 1,000 bootstrap means), then the
# element-wise mean of the four matrices in target_col order.
o = [pd.DataFrame(l[j]).corr(method="spearman") for j in range(4)]
corr = pd.concat(o)
corr.index.name = "scorer"
corr = corr.groupby("scorer").mean().reindex(target_col)[target_col]

# Upper triangle above the diagonal, used as the heatmap mask.
matrix = np.triu(corr, k=1)
# plot the heatmap
assert (corr.columns == corr.index).all()
print(corr.columns, clean_target_col )
corr.index.name = ""

# display(corr)
# plot the heatmap
sns.heatmap(corr,
        xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, mask=matrix, vmin=0, vmax=1,
            cmap="coolwarm")

plt.show()

In [None]:
corr

In [None]:
from datasets import load_dataset

# Load the "test" split of the "axis" configuration of openai/summarize_from_feedback
# as a DataFrame (this replaces the Exp 0B results frame held in `df`).
df = load_dataset("openai/summarize_from_feedback", "axis")["test"].to_pandas()

# Flatten the nested "summary" and "info" records into plain columns.
df["human_score"] = df["summary"].apply(lambda x: x["axes"]["overall"])
df["source"] = df["info"].apply(lambda x: x["article"])
df["title"] = df["info"].apply(lambda x: x["title"])
df["id"] = df["info"].apply(lambda x: x["id"])
df["model_summary"] = df["summary"].apply(lambda x: x["text"])
df["policy"] = df["summary"].apply(lambda x: x["policy"])
# Keep one row (the first) per (id, policy, worker) combination.
df = df.groupby(["id", "policy", "worker"]).first().reset_index()


In [None]:
df.groupby(["id", "policy"]).count().sort_values(by="info", ascending=False).head(368)

In [None]:
from itertools import product
# Inter-annotator agreement: for every ordered pair of distinct workers, take the
# (id, policy) items rated by the pair and correlate the two workers' human_score
# on one random sample of 10 such items.
# NOTE(review): both (w1, w2) and (w2, w1) are visited, and no random seed is set.
workers = df.worker.unique()
l = []
for w1, w2 in product(workers, workers):
    if w1 != w2:
        tmp_df = df[df.worker.isin([w1, w2])].copy()
        tmp_df["polid"] = tmp_df.policy + tmp_df.id
        # (id, policy) groups that have more than one row within this worker pair.
        a = tmp_df.groupby((["id", "policy"])).count().reset_index()
        a = a[a["info"]>1]
        # Vectorised membership test on the policy+id key.
        b = tmp_df[tmp_df.polid.isin(a.policy + a.id)]
        b = b.groupby(["id", "policy", "worker"]).first().reset_index()
        b1 = b[b.worker == w1][["human_score", "polid"]]
        b2 = b[b.worker == w2][["human_score", "polid"]]
        # Columns become human_score_x (w1) and human_score_y (w2).
        b = pd.merge(b1, b2, on="polid", how="inner").drop("polid", axis=1)
        # print(len(b))
        if len(b) >= 10:
            # display(b)
            l.append(b.sample(10).corr("spearman"))

# Average the 2x2 matrices of all pairs, row by row.
tmp_df = pd.concat(l)
tmp_df.index.name = "scorer"
tmp_df = tmp_df.groupby("scorer").mean()
tmp_df

In [None]:
from itertools import product

# Inter-annotator agreement on sample means: for every ordered pair of distinct
# workers with at least 10 shared (id, policy) items, draw 100 samples of 10 items
# and record the two workers' mean human_score for each sample; the recorded means
# are then correlated across all samples and pairs.
# NOTE(review): both (w1, w2) and (w2, w1) are visited, and no random seed is set.
workers = df.worker.unique()
tmp_df2 = []
for w1, w2 in product(workers, workers):
    if w1 != w2:
        tmp_df = df[df.worker.isin([w1, w2])].copy()
        tmp_df["polid"] = tmp_df.policy + tmp_df.id
        # (id, policy) groups that have more than one row within this worker pair.
        a = tmp_df.groupby((["id", "policy"])).count().reset_index()
        a = a[a["info"] > 1]
        # Vectorised membership test on the policy+id key.
        b = tmp_df[tmp_df.polid.isin(a.policy + a.id)]
        b = b.groupby(["id", "policy", "worker"]).first().reset_index()
        b1 = b[b.worker == w1][["human_score", "polid"]]
        b2 = b[b.worker == w2][["human_score", "polid"]]
        # Columns become human_score_x (w1) and human_score_y (w2).
        b = pd.merge(b1, b2, on="polid", how="inner").drop("polid", axis=1)
        # print(len(b))
        if len(b) >= 10:
            for i in range(100):
                # Keep the paired frame `b` intact: the sample mean goes into its own
                # variable so that every one of the 100 iterations draws a fresh sample.
                sample_means = b.sample(10).mean()
                tmp_df2.append([sample_means.human_score_x, sample_means.human_score_y])

tmp_df2 = pd.DataFrame(tmp_df2)
tmp_df2.index.name = "scorer"
tmp_df = tmp_df2.corr("spearman")
tmp_df

In [None]:
from itertools import product
# Inter-annotator agreement restricted to the policies in `policies`: for every
# ordered pair of distinct workers with at least 10 shared (id, policy) items,
# draw 100 samples of 10 items and compute one Spearman matrix per sample.
# NOTE(review): both (w1, w2) and (w2, w1) are visited, and no random seed is set.
workers = df.worker.unique()
l = []
# Note: this narrows `df` itself for all later cells.
df = df[df.policy.isin(policies)]

for w1, w2 in product(workers, workers):
    if w1 != w2:
        tmp_df = df[df.worker.isin([w1, w2])].copy()
        tmp_df["polid"] = tmp_df.policy + tmp_df.id
        # (id, policy) groups that have more than one row within this worker pair.
        a = tmp_df.groupby((["id", "policy"])).count().reset_index()
        a = a[a["info"]>1]
        # Vectorised membership test on the policy+id key.
        b = tmp_df[tmp_df.polid.isin(a.policy + a.id)]
        b = b.groupby(["id", "policy", "worker"]).first().reset_index()
        b1 = b[b.worker == w1][["human_score", "polid"]]
        b2 = b[b.worker == w2][["human_score", "polid"]]
        # Columns become human_score_x (w1) and human_score_y (w2).
        b = pd.merge(b1, b2, on="polid", how="inner").drop("polid", axis=1)
        # print(len(b))
        if len(b) >= 10:
            # display(b)
            for _ in range(100):
                l.append(b.sample(10).corr("spearman"))

# Average the 2x2 matrices of all samples and pairs, row by row.
tmp_df = pd.concat(l)
tmp_df.index.name = "scorer"
tmp_df = tmp_df.groupby("scorer").mean()
tmp_df

In [None]:
human_score = tmp_df.iloc[0].human_score_y

In [None]:
df

In [None]:
s0[["s1", "s2", "s3"]]

In [None]:
s0

In [None]:
# Final Exp 0B summary: one row per metric, one column per correlation setting.
# .copy() so the row assignment below works on an independent frame.
s0 = s0[["s1", "s2", "s3"]].copy()
# s0.index = clean_target_col
s0.index.name = ""
# The human_score row shows inter-annotator agreement in "s1"; "s2"/"s3" are placeholders.
s0.loc["human_score", :] = [human_score, 0, 0]
# Hide the two placeholder cells. The mask is derived from s0 itself so that it
# always has the same shape as the data, whatever the number of metrics.
mask = np.zeros(s0.shape, dtype=bool)
mask[s0.index.get_loc("human_score"), 1:] = True

# display(corr)
# plot the heatmap
plt.figure()

ax = sns.heatmap(s0,
        # xticklabels=clean_target_col,
        yticklabels=clean_target_col,
            annot=True, vmin=0.0, vmax=0.6, mask=mask, cbar=False,
            cmap="coolwarm")
plt.tight_layout()
plt.savefig("../outputs/plots/exp0b.pdf")
plt.show()

In [None]:
# Relabel the rows with the display names and export the summary table as LaTeX.
s0.index = clean_target_col
s0.index.name = ""
# Replace the two placeholder cells of the "Human" row with "-".
# NOTE(review): this writes strings into columns that held numbers -- confirm the
# resulting dtypes are acceptable for the pandas version in use.
s0.loc["Human", :] = [human_score, "-", "-"]
# Print with 2 decimals printed in latex

print(s0.round(2).to_latex(float_format='%.2f'))

In [None]:
²