In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Root directory of the q2_1 experiment outputs; walked below to discover per-run CSV files.
res_dir = "./results/q2_1/"

In [None]:
# Show what is stored directly under res_dir, sorted by name.
sorted(os.listdir(res_dir))

In [None]:
# Collect every "results.csv" that lives in a sub-directory of res_dir (at any depth).
# os.walk already descends recursively, so a single pass visits each directory exactly
# once; the top-level directory itself is skipped, matching the previous behaviour of
# only looking inside sub-directories. Sorting keeps the load order reproducible.
csv_paths = sorted(
    os.path.join(dirpath, "results.csv")
    for dirpath, _, filenames in os.walk(res_dir)
    if dirpath != res_dir and "results.csv" in filenames
)

In [None]:
# Drop duplicate paths and sort them so the files are always loaded in the same order
# (iterating a bare set gives no ordering guarantee between runs).
csv_paths = sorted(set(csv_paths))
csv_paths

In [None]:
# Read every discovered results file and stack them into one frame.
# All frames are read first and concatenated once: growing a DataFrame with
# pd.concat inside the loop copies the accumulated data on every iteration.
result_frames = [pd.read_csv(csv_file, sep=",") for csv_file in csv_paths]

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
main_df = (
    pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
)

In [None]:
# Number of result files that were discovered.
len(csv_paths)

In [None]:
# Sanity check: total row count followed by the column/dtype summary.
print(len(main_df))
main_df.info()

In [None]:
# Number of rows whose invalid_fn_type is "random".
len(main_df[main_df["invalid_fn_type"] == "random"])

In [None]:
# Non-null test_passing_completion entries per (num_shots, invalid_fn_type) group.
main_df.groupby(["num_shots", "invalid_fn_type"])["test_passing_completion"].count()

In [None]:
# Non-null num_invalid entries per invalid_fn_type.
main_df.groupby(["invalid_fn_type"])["num_invalid"].count()

In [None]:
# Aggregate per (num_shots, invalid_fn_type): sum the two test-passing columns and
# count the org_func entries (that count is used later as the number of examples
# in each group).
pivot_df = main_df.pivot_table(
    index=["num_shots", "invalid_fn_type"], 
    values=[
        "test_passing_completion", "test_passing_explanation", 
        "org_func",
        ], 
    aggfunc={
        "test_passing_completion": "sum",
        "test_passing_explanation": "sum",
        "org_func": "count",
    })

In [None]:
# Display the aggregated sums and counts.
pivot_df

In [None]:
# Convert the summed test-passing columns into rates by dividing by the per-group
# example count, expose that count as "n_examples", and drop the helper column.
n_per_group = pivot_df["org_func"]
final_df = pivot_df.assign(
    test_passing_completion=pivot_df["test_passing_completion"] / n_per_group,
    test_passing_explanation=pivot_df["test_passing_explanation"] / n_per_group,
    n_examples=n_per_group,
).drop(columns=["org_func"])

In [None]:
# Display the per-group pass rates.
final_df

In [None]:
# Persist the aggregated pass rates as CSV (the group index is written as columns).
final_df.to_csv("./results/q2_1/0711_q2_1_agg_ns4,6,8,10.csv")

In [None]:
# Flatten the (num_shots, invalid_fn_type) index back into ordinary columns for plotting.
ungrouped_df = final_df.reset_index()
ungrouped_df.head()

In [None]:
# Line plot of the test-passing rate against the number of shots: one panel per
# metric, one line per invalid-function type.
fig, axis = plt.subplots(1, ncols=2, figsize=(12, 6), sharey="row")

cols = ["test_passing_completion", "test_passing_explanation"]

# (invalid_fn_type value in the data, legend label) for each plotted line
series_specs = [
    ("exclude_class", "exclude_class"),
    ("same_class", "same_class"),
    ("random", "random_class"),
]

# Evenly spaced x positions, one per distinct num_shots value in ascending order
shot_values = sorted(ungrouped_df['num_shots'].unique())
shot_to_pos = {shot: pos for pos, shot in enumerate(shot_values)}
x = np.arange(len(shot_values))

for idx, col in enumerate(cols):
    ax = axis[idx]

    for fn_type, label in series_specs:
        subset = ungrouped_df[ungrouped_df['invalid_fn_type'] == fn_type]
        # Each point is placed at the tick of its own num_shots value, so a type that is
        # missing some shot count is still drawn at the right ticks instead of raising
        # a shape mismatch against a fixed-length x vector.
        ax.plot(subset['num_shots'].map(shot_to_pos), subset[col], label=label, marker='o')

    ax.set_xlabel('Number of Shots')

    # The y axis is shared across the row, so label it (and show the legend) only once
    if idx == 0:
        ax.set_ylabel('Test Passing Rate')
        ax.legend(loc='lower right')
    ax.set_xticks(x)
    ax.set_xticklabels(shot_values)
    ax.title.set_text(col)

st = fig.suptitle("Test Passing by Number of Shots and Invalid Function Type", fontsize="x-large")


# Display the plot
plt.show()

## Logprob Distribution

In [None]:
# Collect every "logprobs.csv" that lives in a sub-directory of res_dir (at any depth).
# os.walk already descends recursively, so a single pass visits each directory exactly
# once; the top-level directory itself is skipped, matching the previous behaviour of
# only looking inside sub-directories. Sorting keeps the load order reproducible.
csv_paths = sorted(
    os.path.join(dirpath, "logprobs.csv")
    for dirpath, _, filenames in os.walk(res_dir)
    if dirpath != res_dir and "logprobs.csv" in filenames
)

In [None]:
# Drop duplicate paths and sort them so the files are always loaded in the same order
# (iterating a bare set gives no ordering guarantee between runs).
csv_paths = sorted(set(csv_paths))
csv_paths

In [None]:
# Read every discovered log-probability file and stack them into one frame.
logprob_frames = []

for csv_file in csv_paths:
    try:
        logprob_frames.append(pd.read_csv(csv_file, sep=","))
    except (OSError, ValueError) as err:
        # pandas' EmptyDataError / ParserError and decoding errors are ValueError
        # subclasses; missing or unreadable files raise OSError. A file that fails is
        # reported and skipped, so it can no longer cause the previously read frame
        # to be appended a second time.
        print("Error reading file: {} ({})".format(csv_file, err))

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was read.
main_df = (
    pd.concat(logprob_frames, ignore_index=True) if logprob_frames else pd.DataFrame()
)

In [None]:
# Sanity check: total row count followed by the column/dtype summary.
print(len(main_df))
main_df.info()

In [None]:
# Save the concatenated log-probability rows as a single CSV.
main_df.to_csv("./results/q2_1/0718_q2_1_logprobs_agg_ns4,6,8,10.csv")

In [None]:
# Non-null logprob entries per (num_shots, invalid_fn_type, response_type, valid) group.
main_df.groupby(["num_shots", "invalid_fn_type", "response_type", "valid"])["logprob"].count()

In [None]:

# Summary statistics of logprob per (num_shots, invalid_fn_type, response_type, valid).
# Aggregations are given by name: passing the numpy callable np.mean to a groupby
# aggregation is deprecated in recent pandas, while "mean" yields the same column label.
pivot_df = main_df.pivot_table(
    index=["num_shots", "invalid_fn_type", "response_type", "valid"],
    values=["logprob"],
    aggfunc={
        "logprob": ["mean", "std", "count", "min", "max"],
    })

# Flatten the two-level (column, aggregation) header into names such as "mean_logprob".
pivot_df.columns = [f'{aggfunc}_{column}' for column, aggfunc in pivot_df.columns]

In [None]:
# Display the per-group logprob statistics.
pivot_df

### Completion

In [None]:
# Overlay one log-probability histogram (with KDE curve) per distinct value of the
# "valid" column, restricted to rows of the chosen response type.
response_type = "completion"
df = main_df[(main_df['response_type'] == response_type)]

plt.figure(figsize=(10, 6))

# Distinct labels present in the "valid" column of the filtered rows
valid_types = df['valid'].unique()

# One histogram per label, all drawn onto the current axes
for valid_type in valid_types:
    valid_df = df[df['valid'] == valid_type]
    
    # stat="count" plots raw bin counts, so larger groups appear taller
    
    sns.histplot(valid_df['logprob'], kde=True, label=valid_type, stat="count", common_norm=False, 
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')
# No explicit y label is set here; the axis shows counts (see stat above).
plt.title('Log Probability Distribution by Validity for "{}"'.format(response_type))
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# The four 0/1 indicator columns that combine validity with whether the answer was predicted.
# NOTE: later cells read `cols` from the kernel state, so the order of execution matters.
cols = ['valid_and_pred', 'invalid_and_not_pred', 'valid_and_not_pred', 'invalid_and_pred']

In [None]:
# Density histograms of logprob for one response type, shot count and model,
# split by the indicator columns listed in `cols`.
# NOTE: `cols` is not defined in this cell; it is whatever an earlier cell last assigned.

num_shots = 10
model = "text-davinci-003"
response_type = "completion"

# Rows labelled "pred" in the "valid" column are excluded from the plot.
df = main_df[(main_df['response_type'] == response_type) & (main_df["valid"] != "pred") & (main_df['num_shots'] == num_shots) & (main_df["model"] == model)]

plt.figure(figsize=(10, 6))

# One histogram per indicator column, using the rows where that indicator equals 1
for col in cols:
    valid_df = df[df[col] == 1]
    
    # stat="density" normalises each histogram on its own, so groups of different size are comparable
    sns.histplot(valid_df['logprob'], kde=True, label=col, stat="density", common_norm=False, 
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')
# No explicit y label is set here; the axis shows density (see stat above).
plt.title('Log Probability Distribution by Validity for "{}" (num_shots = {})'.format(response_type, num_shots))
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Indicator columns to plot in this grid ('invalid_and_pred' is left out here).
cols = ['valid_and_pred', 'valid_and_not_pred', 'invalid_and_not_pred', ]

# 2x2 grid: one panel per num_shots value
fig, axis = plt.subplots(2, 2, figsize=(20, 12), sharex="col")
ax_loc = [(0, 0), (0, 1), (1, 0), (1, 1)]

response_type = "completion"
shots = sorted(main_df["num_shots"].unique())

model = "text-davinci-003"
# Keep rows of this response type and model, excluding rows labelled "pred"
df = main_df[(main_df['response_type'] == response_type) & (main_df["valid"] != "pred") & (main_df["model"] == model)]

# zip stops at the shorter input, so at most the four smallest shot counts get a panel
for n_shot, loc in zip(shots, ax_loc):
    ax = axis[loc]

    # One density histogram per indicator column
    for col in cols:
        
        # Rows for this shot count where the indicator equals 1
        valid_df = df[(df["num_shots"] == n_shot) & (df[col] == 1)]
        
        # Drawn onto this panel's axes via ax=ax
        sns.histplot(valid_df['logprob'], kde=True, label=col, stat="density", common_norm=False, 
                alpha=0.2, linewidth=.15, ax=ax)

    # Only the bottom row carries the x label (x axes are shared per column)
    if loc[0] == 1:
        ax.set_xlabel('Log Probability')
    
    ax.title.set_text(f"num_shots = {n_shot}")
    ax.legend(loc='upper left')

# Place the figure title slightly below the top edge and leave room for it above the panels
st = fig.suptitle(f"Log Probability Distribution by Validity for '{response_type}' across num_shots", fontsize="x-large")
st.set_y(0.95)
fig.subplots_adjust(top=0.85)

# Display the plot
plt.show()

In [None]:
# Count histograms of logprob for one response type, shot count and model, split by
# indicator column. The columns are set explicitly so the cell does not depend on
# whichever earlier cell last assigned `cols`.
cols = ['valid_and_pred', 'valid_and_not_pred', 'invalid_and_not_pred', ]

num_shots = 4
model = "text-davinci-003"
response_type = "completion"

# NOTE(review): unlike the sibling plots, this filter does not drop rows whose "valid"
# label is "pred" - confirm whether that is intended.
df = main_df[(main_df['response_type'] == response_type) & (main_df['num_shots'] == num_shots) & (main_df["model"] == model)]

plt.figure(figsize=(10, 6))

# One histogram per indicator column, using the rows where that indicator equals 1
for col in cols:
    valid_df = df[df[col] == 1]

    sns.histplot(valid_df['logprob'], kde=True, label=col, stat="count", common_norm=False, 
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')
# stat="count" plots raw bin counts, so the axis is labelled accordingly
plt.ylabel('Count')
plt.title('Log Probability Distribution by Validity for "{}" (num_shots = {})'.format(response_type, num_shots))
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Indicator columns to plot ('invalid_and_pred' is left out here).
cols = ['valid_and_pred', 'valid_and_not_pred', 'invalid_and_not_pred',]

# Count histograms across all shot counts for a single invalid-function type
model = "text-davinci-003"
response_type = "completion"
invalid_fn = "random"

# Keep rows of this response type, invalid-function type and model, excluding "pred" rows
df = main_df[(main_df['response_type'] == response_type) & (main_df['invalid_fn_type'] == invalid_fn) & (main_df["valid"] != "pred") & (main_df["model"] == model)]

plt.figure(figsize=(10, 6))

# One histogram per indicator column, using the rows where that indicator equals 1
for col in cols:
    valid_df = df[df[col] == 1]
    
    # stat="count" plots raw bin counts
    sns.histplot(valid_df['logprob'], kde=True, label=col, stat="count", common_norm=False, 
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')
# NOTE: the title does not mention the invalid_fn_type filter applied above.
plt.title('Log Probability Distribution by Validity for "{}" '.format(response_type))
plt.legend()

plt.tight_layout()
plt.show()


### Explanation

In [None]:
# Histogram of the "valid" labels among explanation rows.
main_df[main_df['response_type'] == "explanation"]["valid"].hist()

In [None]:
# Row counts for the explanation response type: overall, then per indicator column.
# The explanation rows are selected once instead of rebuilding the mask for each line.
expl_rows = main_df[main_df['response_type'] == "explanation"]

print("total explanation: {}".format(len(expl_rows)))
for flag_col in ["valid_and_not_pred", "valid_and_pred", "invalid_and_not_pred", "invalid_and_pred"]:
    # Number of explanation rows where this indicator equals 1
    print("{}: {}".format(flag_col, int((expl_rows[flag_col] == 1).sum())))

In [None]:
def _get_valid_and_pred_entries(
    entry: dict, pred_val: str, valid_vals,
) -> dict:
    """Determine variations of whether this entry was valid and predicted or not."""

    # update logprob entry
    if entry["valid"] in ["valid", "invalid"]:
        entry["valid_and_pred"] = (
            1
            if entry["valid"] == "valid"
            and entry["answer"] == pred_val
            else 0
        )
        entry["valid_and_not_pred"] = (
            1
            if entry["valid"] == "valid"
            and entry["answer"] != pred_val
            else 0
        )
        entry["invalid_and_pred"] = (
            1
            if entry["valid"] == "invalid"
            and entry["answer"] == pred_val
            else 0
        )
        entry["invalid_and_not_pred"] = (
            1
            if entry["valid"] == "invalid"
            and entry["answer"] != pred_val
            else 0
        )
    else:
        entry["valid_and_pred"] = (
            1 if entry["valid"] == "pred" and pred_val in valid_vals else 0
        )
        entry["valid_and_not_pred"] = 0
        entry["invalid_and_pred"] = (
            1 if entry["valid"] == "pred" and pred_val not in valid_vals else 0
        )
        entry["invalid_and_not_pred"] = 0

    return entry


# Set to True to recompute the indicator columns for explanation rows and rewrite the CSV.
NEED_UPDATE = False

# The explanation rows are used by the plotting cells that follow whether or not the
# flags are recomputed, so they are selected unconditionally; otherwise those cells
# fail with a NameError on a fresh kernel while NEED_UPDATE is False.
df_expl = main_df[main_df['response_type'] == "explanation"]

if NEED_UPDATE:

    updated_records = []

    # NOTE(review): rows are grouped by "sequence" only; if the same sequence occurs
    # under several num_shots / invalid_fn_type / model settings, their rows share one
    # group - confirm this is intended.
    for _, group in df_expl.groupby(["sequence"]):
        group_records = group.to_dict(orient='records')

        # Answers labelled "valid" in this group, and the answer of its "pred" row
        # (-1 when the group has no "pred" row).
        valid_options = []
        pred_val = -1
        for elem in group_records:
            if elem["valid"] == "valid":
                valid_options.append(elem["answer"])
            elif elem["valid"] == "pred":
                pred_val = elem["answer"]

        # Recompute the four indicator keys of every record in the group
        updated_records.extend(
            _get_valid_and_pred_entries(elem, pred_val, valid_options)
            for elem in group_records
        )

    df_expl = pd.DataFrame(updated_records)
    # info() prints its report itself and returns None, so it is not wrapped in print()
    df_expl.info()

    print("total: {}".format(len(df_expl)))
    expl_only = df_expl[df_expl['response_type'] == "explanation"]
    for flag_col in ["valid_and_not_pred", "valid_and_pred", "invalid_and_not_pred", "invalid_and_pred"]:
        print("{}: {}".format(flag_col, int((expl_only[flag_col] == 1).sum())))

    df_compl = main_df[main_df["response_type"] == "completion"]
    # ignore_index gives the combined frame unique row labels; df_expl was rebuilt with
    # a fresh 0..n-1 index that would otherwise collide with labels kept by df_compl.
    main_df = pd.concat([df_expl, df_compl], ignore_index=True)
    main_df.info()
    # Store the updated log-probability frame
    main_df.to_csv("./results/q2_1/0718_q2_1_logprobs_agg_ns4,6,8,10.csv")


In [None]:
# Overlay one log-probability histogram (with KDE curve) per distinct value of the
# "valid" column, using the explanation rows held in df_expl.

plt.figure(figsize=(10, 6))

# Distinct labels present in the "valid" column
valid_types = df_expl['valid'].unique()

# One histogram per label, all drawn onto the current axes
for valid_type in valid_types:
    valid_df = df_expl[df_expl['valid'] == valid_type]

    sns.histplot(valid_df['logprob'], kde=True, label=valid_type, stat="count", common_norm=False, 
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')
# stat="count" plots raw bin counts, so the axis is labelled accordingly
plt.ylabel('Count')
plt.title('Log Probability Distribution by Validity for "explanation"')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Density histograms of logprob for explanation rows at one shot count and model,
# split by indicator column.
num_shots = 4
model = "text-davinci-003"
response_type = "explanation"

# Every condition is evaluated on df_expl itself: a mask built from another frame
# (main_df) is aligned by row label and is not guaranteed to describe the same rows.
df = df_expl[(df_expl['num_shots'] == num_shots) & (df_expl["valid"] != "pred") & (df_expl["model"] == model)]

plt.figure(figsize=(10, 6))

# Indicator columns to plot
cols = ['valid_and_pred', 'valid_and_not_pred', 'invalid_and_pred', 'invalid_and_not_pred']

# One histogram per indicator column, using the rows where that indicator equals 1
for col in cols:
    valid_df = df[df[col] == 1]

    # stat="density" normalises each histogram on its own
    sns.histplot(valid_df['logprob'], kde=True, label=col, stat="density", common_norm=False, 
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')

plt.title('Log Probability Distribution by Validity for "{}" (num_shots = {})'.format(response_type, num_shots))
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
def get_valid_and_pred_ratio(df, response_type):
    """Print how the four validity/prediction indicators are distributed.

    Only rows whose ``response_type`` column equals ``response_type`` are used.
    Each indicator count (rows where the indicator equals 1) is reported as a
    share of the rows whose ``valid`` label is not ``"pred"``. Afterwards
    ``valid_and_pred`` is reported relative to the rows labelled ``"valid"`` and
    ``invalid_and_pred`` relative to the rows labelled ``"invalid"``.

    A share whose denominator is zero is printed as ``nan`` instead of raising
    ``ZeroDivisionError``, so the function also works for a response type that
    has no matching rows.

    Args:
        df: Frame with the columns ``response_type``, ``valid`` and the four
            indicator columns.
        response_type: Value of the ``response_type`` column to report on.

    Returns:
        None. The report is written with ``print``.
    """
    def _share(count, denom):
        # Empty selections yield nan rather than a division error.
        return count / denom if denom else float("nan")

    df_res = df[df["response_type"] == response_type]

    flag_cols = ("valid_and_pred", "valid_and_not_pred", "invalid_and_pred", "invalid_and_not_pred")
    flag_counts = {col: len(df_res[df_res[col] == 1]) for col in flag_cols}

    # "pred" rows are left out of the denominator
    total = len(df_res[df_res["valid"] != "pred"])
    print("response type: {}".format(response_type))
    print("total: {}".format(total))
    for col in flag_cols:
        print("{}: {:.2f}".format(col, _share(flag_counts[col], total)))

    total_valid = len(df_res[df_res["valid"] == "valid"])
    total_invalid = len(df_res[df_res["valid"] == "invalid"])
    print("total valid: {}".format(total_valid))
    print("total invalid: {}".format(total_invalid))
    print("valid_and_pred out of valid: {:.2f}".format(_share(flag_counts["valid_and_pred"], total_valid)))
    print("invalid_and_pred out of invalid: {:.2f}".format(_share(flag_counts["invalid_and_pred"], total_invalid)))

In [None]:
# 2x2 grid of density histograms for explanation rows: one panel per num_shots value
fig, axis = plt.subplots(2, 2, figsize=(20, 12), sharex="col")
ax_loc = [(0, 0), (0, 1), (1, 0), (1, 1)]

response_type = "explanation"
shots = sorted(df_expl["num_shots"].unique())

model = "text-davinci-003"
# Keep rows of this model, excluding rows labelled "pred"
df = df_expl[(df_expl["model"] == model) & (df_expl["valid"] != "pred")]

# Indicator columns to plot
cols = ['valid_and_pred', 'valid_and_not_pred', 'invalid_and_not_pred', 'invalid_and_pred']

# zip stops at the shorter input, so at most the four smallest shot counts get a panel
for n_shot, loc in zip(shots, ax_loc):
    ax = axis[loc]

    # One density histogram per indicator column
    for col in cols:
        
        # Rows for this shot count where the indicator equals 1
        valid_df = df[(df["num_shots"] == n_shot) & (df[col] == 1)]
        
        # Drawn onto this panel's axes via ax=ax
        sns.histplot(valid_df['logprob'], kde=True, label=col, stat="density", common_norm=False, 
                alpha=0.2, linewidth=.15, ax=ax)

    # Only the bottom row carries the x label (x axes are shared per column)
    if loc[0] == 1:
        ax.set_xlabel('Log Probability')
    ax.title.set_text(f"num_shots = {n_shot}")
    ax.legend(loc='upper left')

# Place the figure title slightly below the top edge and leave room for it above the panels
st = fig.suptitle(f"Log Probability Distribution by Validity for '{response_type}' across num_shots", fontsize="x-large")
st.set_y(0.95)
fig.subplots_adjust(top=0.85)

# Display the plot
plt.show()

In [None]:
# Density histograms of logprob for explanation rows of one model across all shot
# counts, split by indicator column.

model = "text-davinci-003"
response_type = "explanation"

# Every condition is evaluated on df_expl itself: a mask built from another frame
# (main_df) is aligned by row label and is not guaranteed to describe the same rows.
df = df_expl[(df_expl["valid"] != "pred") & (df_expl["model"] == model)]

plt.figure(figsize=(10, 6))

# Indicator columns to plot
cols = ['valid_and_pred', 'valid_and_not_pred', 'invalid_and_pred', 'invalid_and_not_pred']

# One histogram per indicator column, using the rows where that indicator equals 1
for col in cols:
    valid_df = df[df[col] == 1]

    # stat="density" normalises each histogram on its own
    sns.histplot(valid_df['logprob'], kde=True, label=col, stat="density", common_norm=False, 
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')

# The title has a single placeholder; this plot is not restricted to one shot count,
# so no num_shots value is passed.
plt.title('Log Probability Distribution by Validity for "{}"'.format(response_type))
plt.legend()

plt.tight_layout()
plt.show()


#### Out of the predicted values, how many are valid and invalid?

In [None]:
# Restrict to the rows labelled "pred" and sum the four indicator columns per
# (num_shots, invalid_fn_type, response_type) group.

pivot_df = main_df[(main_df["valid"] == "pred")].pivot_table(
    index=["num_shots", "invalid_fn_type", "response_type"], 
    values=[
        'valid_and_pred', 'invalid_and_not_pred', 'valid_and_not_pred', 'invalid_and_pred'
        ], 
    aggfunc="sum")

# Per-group total of all four indicator sums; used as the denominator for the ratios
pivot_df["total"] = pivot_df['valid_and_pred'] + pivot_df['valid_and_not_pred'] + pivot_df['invalid_and_pred'] + pivot_df['invalid_and_not_pred']
pivot_df.head()

In [None]:
# Convert the two "*_and_pred" sums into shares of the per-group total, rounded to 3 decimals.
# NOTE: the columns are overwritten in place, so running this cell twice divides twice;
# re-run the pivot cell first if this cell needs to be executed again.
cols = ['valid_and_pred',  'invalid_and_pred']
for col in cols:
    pivot_df[col] = round(pivot_df[col] / pivot_df["total"], 3)

pivot_df.head()

In [None]:
# Drop the two "*_and_not_pred" columns and export the remaining table as LaTeX.
df = pivot_df.drop(['invalid_and_not_pred', 'valid_and_not_pred'], axis=1)

df.to_latex("results/q2_1/0719_pred_valid_and_invalid_ratio.tex")