In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import matplotlib.pylab as pylab

# Apply the seaborn theme FIRST. `sns.set_context` rewrites the font-size
# rcParams (axes.labelsize, axes.titlesize, xtick/ytick.labelsize,
# legend.fontsize), so calling it after `rcParams.update` would silently
# replace the sizes chosen below with seaborn's "paper" defaults.
sns.set_style("whitegrid")
sns.set_context("paper")

# Notebook-wide matplotlib defaults: figure size and 16pt text.
_DEFAULT_PARAMS = {
    'legend.fontsize': '16',
    'figure.figsize': (8, 5),
    'axes.labelsize': '16',
    'axes.titlesize': '16',
    'xtick.labelsize': '16',
    'ytick.labelsize': '16',
}
pylab.rcParams.update(_DEFAULT_PARAMS)

In [None]:
res_dir = "results/q2_2"

In [None]:
sorted(os.listdir(res_dir))

In [None]:
# Collect every "results.csv" found in a (possibly nested) sub-directory of
# `res_dir`.
# `os.walk` already descends recursively, so one walk visits each directory
# exactly once. The previous walk-inside-a-walk re-visited deeper directories
# once per ancestor and therefore appended the same path several times.
csv_paths = []
for dirpath, _, filenames in os.walk(res_dir):
    # Only sub-directories are searched: a results.csv placed directly in
    # `res_dir` itself was not collected before and is still skipped.
    if dirpath != res_dir and "results.csv" in filenames:
        csv_paths.append(os.path.join(dirpath, "results.csv"))

In [None]:
# De-duplicate AND sort. A bare `set` iterates in arbitrary order (string
# hashing is randomised per interpreter session), so `list(set(...))` could
# hand the files to the loading cell in a different order after every kernel
# restart.
csv_paths = sorted(set(csv_paths))
csv_paths

In [None]:
# Read every per-run results file, then concatenate ONCE.
# Calling `pd.concat` inside the loop copies all previously loaded rows again
# on every iteration (quadratic in the number of files).
run_frames = [pd.read_csv(csv_file, sep=",") for csv_file in csv_paths]

# `pd.concat` raises on an empty list, so fall back to the empty frame the
# loop-based version produced when no results.csv was found.
main_df = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame()

In [None]:
len(csv_paths)  # 230823: 3 runs x 4 num_shots x 3 models

In [None]:
print(len(main_df))
main_df.info()

In [None]:
# Mean completion recall for each (model, num_shots, n_possible_completions) group.
recall_group_keys = ["model", "num_shots", "n_possible_completions"]
main_df.groupby(recall_group_keys)["recall_compl"].mean()

In [None]:
# Mean completion precision for each (model, num_shots) group.
precision_group_keys = ["model", "num_shots"]
main_df.groupby(precision_group_keys)["precision_compl"].mean()

In [None]:
# Leave the "davinci" rows out of the aggregation.
df = main_df.loc[main_df["model"] != "davinci"]

# Every score column is summarised with the same two statistics.
score_columns = [
    "precision_compl", "recall_compl",
    "precision_expl", "recall_expl",
]

# Mean and standard deviation of each score per (num_shots, model) group.
pivot_df = pd.pivot_table(
    df,
    index=["num_shots", "model"],
    values=score_columns,
    aggfunc={column: ["mean", "std"] for column in score_columns},
)

In [None]:
pivot_df

In [None]:
pivot_df.plot()

In [None]:
ungrouped_df = pivot_df.reset_index()
ungrouped_df.head()

In [None]:
# Persist the aggregated table. The (num_shots, model) group keys live in the
# index of `pivot_df`, so the index has to be written: with `index=False` the
# file held only the statistics and its rows could no longer be attributed to
# a model or shot count.
pivot_df.to_csv(os.path.join(res_dir, "1018_agg3runs.csv"), index=True, header=True)

In [None]:
pivot_df.head()

In [None]:
ungrouped_df = pivot_df.reset_index()
ungrouped_df.head()

In [None]:
ungrouped_df.columns

In [None]:
# Flatten the two-level (score column, statistic) header into single names such
# as "precision_completion_mean".
# The names are derived from the column labels themselves instead of being
# assigned by position: `pivot_table` returns its columns sorted alphabetically
# (precision_compl, precision_expl, recall_compl, recall_expl), so the former
# hard-coded list labelled the precision_expl statistics "recall_completion_*"
# and the recall_compl statistics "precision_explanation_*".
_RESPONSE_TYPE_NAMES = {"compl": "completion", "expl": "explanation"}


def _flat_column_name(column):
    """Return the flat name for one ``(name, stat)`` column label.

    Columns restored from the index by ``reset_index`` have an empty second
    level and keep their name unchanged; score columns become
    ``<metric>_<response type>_<stat>``.
    """
    name, stat = column
    if not stat:
        return name
    metric, response_type = name.rsplit("_", 1)
    return "_".join([metric, _RESPONSE_TYPE_NAMES[response_type], stat])


# Rebuild from `pivot_df` so the cell can be re-run safely. The previous bare
# `ungrouped_df.reset_index()` discarded its result and had no effect.
ungrouped_df = pivot_df.reset_index()
ungrouped_df.columns = [_flat_column_name(column) for column in ungrouped_df.columns]

In [None]:
ungrouped_df["model"].unique()

In [None]:
# replace full model names with simple names
ungrouped_df["model"] = ungrouped_df["model"].replace({"gpt-3.5-turbo-0301": "gpt-3.5-turbo", "gpt-4-0314": "gpt-4"})

In [None]:
# Convert the wide table (one column per score/statistic) to long format:
# each row keeps num_shots and model, names the source column in
# 'response_type_tmp' and stores its number in 'value'.
value_vars = [
    "precision_completion_mean", "precision_completion_std",
    "recall_completion_mean", "recall_completion_std",
    "precision_explanation_mean", "precision_explanation_std",
    "recall_explanation_mean", "recall_explanation_std",
]

melted_df = ungrouped_df.melt(
    id_vars=['num_shots', 'model'],
    value_vars=value_vars,
    var_name='response_type_tmp',
    value_name='value',
)

In [None]:
melted_df["response_type_tmp"].unique()

In [None]:
# The variable names have the form "<metric>_<response type>_<stat>": split
# them once and pick the three parts out by position.
name_parts = melted_df['response_type_tmp'].str.split('_')

# Selecting the final columns by name fixes their order and, because
# 'response_type_tmp' is not listed, also leaves that helper column behind.
new_columns_order = ['num_shots', 'model', 'response_type', 'metric_type', "stat", "value"]
melted_df = melted_df.assign(
    response_type=name_parts.str[1],
    metric_type=name_parts.str[0],
    stat=name_parts.str[2],
)[new_columns_order]

In [None]:
melted_df.head()

In [None]:
# Split the long table by statistic; the "stat" column is constant within each
# part, so it is dropped.
is_mean = melted_df["stat"] == "mean"
is_std = melted_df["stat"] == "std"

df_mean = melted_df.loc[is_mean].drop(columns="stat")
df_std = melted_df.loc[is_std].drop(columns="stat")

In [None]:
df_mean.head()

In [None]:
df = df_mean[df_mean["metric_type"] == "recall"]

In [None]:
df.head()

In [None]:
df[df["value"] < 0.1]

In [None]:
df.groupby(["num_shots", "model", "response_type"])["value"].mean()

In [None]:
## LINEPLOT
# Mean score against the number of shots for ONE metric: one colour per model,
# one line style per response type.
metric = "precision"
df = df_mean[df_mean["metric_type"] == metric]

# Explicit figure/axes rather than pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(8, 5))

# Preferred model order.
# NOTE(review): this list is not passed to seaborn (it never was), so seaborn
# chooses the hue order itself - pass `hue_order=hue_order` if a fixed legend
# order is wanted.
hue_order = ['text-davinci-003', 'gpt-3.5-turbo', 'gpt-4']

# Fixed colour for each model label.
custom_palette = {
    'text-davinci-003': "blue",
    'gpt-3.5-turbo': "green",
    'gpt-4': "orange",
}

sns.lineplot(
    data=df,
    x='num_shots',
    y="value",
    hue='model',
    style="response_type",
    palette=custom_palette,
    marker="o",
    ax=ax,
)

# Labels and title. The title is built from `metric` so it cannot go stale
# when the cell is re-run with metric = "recall".
ax.set_xlabel('Number of Shots')
ax.set_xticks(df["num_shots"].unique())
ax.set_ylabel('Score')
ax.set_title(f"{metric.capitalize()} over Verbalized Answers")
fig.tight_layout()

In [None]:
# Split `melted_df` by statistic: `df_mean` keeps the rows whose "stat" is
# "mean", `df_std` those whose "stat" is "std"; the "stat" column itself is
# dropped from both.
df_mean = melted_df[melted_df["stat"] == "mean"].drop("stat", axis=1)
df_std = melted_df[melted_df["stat"] == "std"].drop("stat", axis=1)

In [None]:
df = df_mean[df_mean["metric_type"] == "recall"]
df.head()

In [None]:
import matplotlib.pylab as pylab

# Theme first, explicit font sizes second: `sns.set_context` resets the
# font-size rcParams, so the update below has to come after it.
sns.set_style("whitegrid")
sns.set_context("paper")

# Figure-wide settings for the two-panel plot (precision on top, recall below).
fontsize = 16
params = {
    'legend.fontsize': fontsize,
    "legend.title_fontsize": "16",
    'figure.figsize': (8, 10),
    'axes.labelsize': '16',
    'axes.titlesize': '16',
    'xtick.labelsize': '16',
    'ytick.labelsize': '16',
}
pylab.rcParams.update(params)

fig, axes = plt.subplots(nrows=2, ncols=1)

df = df_mean

# Preferred model order.
# NOTE(review): not passed to seaborn (it never was); seaborn picks the hue
# order itself.
hue_order = ['text-davinci-003', 'gpt-3.5-turbo', "gpt-4"]

# Fixed colour for each model label.
custom_palette = {
    'text-davinci-003': "tab:green",
    'gpt-3.5-turbo': "tab:blue",
    "gpt-4": "tab:orange",
}

for metric, ax in zip(["precision", "recall"], axes):
    # Only the recall panel (the second one) draws the legend.
    legend = metric == "recall"

    # Give seaborn ONE consistent frame for this metric. Previously `data` was
    # the unfiltered frame while `y` was a filtered Series, so the plot was
    # only correct because pandas aligned the two on their index and the
    # unmatched rows ended up with a missing y value.
    metric_df = df[df["metric_type"] == metric]

    sns.lineplot(
        data=metric_df,
        x='num_shots',
        y="value",
        hue='model',
        style="response_type",
        palette=custom_palette,
        marker="o",
        ax=ax,
        legend=legend,
    )
    ax.title.set_text(metric)
    ax.set_xticks(df["num_shots"].unique())
    ax.set_xlabel("Number of Shots", fontdict={"fontsize": fontsize})
    ax.set_ylabel("Score", fontdict={"fontsize": fontsize})

st = fig.suptitle("Precision & Recall of Verbalized Alternatives by Model", fontsize=16)

plt.tight_layout()
fig.savefig("1018_verbalize_precision.pdf", format='pdf', bbox_inches='tight')

In [None]:
# Row mask selecting every model except "davinci".
conditions = main_df["model"].ne("davinci")

# Half the number of non-null "sequence" entries among the selected rows.
main_df.loc[conditions, "sequence"].count() / 2

In [None]:
main_df.columns

In [None]:
main_df[conditions]["n_possible_completions"].value_counts()

In [None]:
main_df[conditions]["n_possible_explanations"].value_counts() # divide by n_runs = 2