In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [None]:
import matplotlib.pylab as pylab

# Shared figure defaults (font sizes and figure size) reused by the plotting cells below.
_DEFAULT_PARAMS = {'legend.fontsize': '16',
          'figure.figsize': (8, 5),
         'axes.labelsize': '16',
         'axes.titlesize':'16',
         'xtick.labelsize':'16',
         'ytick.labelsize':'16'}

# Apply the seaborn theme first: set_context("paper") rewrites the font-size
# rcParams, so our defaults must be applied afterwards to actually take effect.
sns.set_style("whitegrid")
sns.set_context("paper")
pylab.rcParams.update(_DEFAULT_PARAMS)

In [None]:
# Directory that is scanned for per-run CSV files and that receives the aggregated CSVs.
res_dir = "results/q2_1"

In [None]:
# Show the entries directly under res_dir, sorted by name.
sorted(os.listdir(res_dir))

In [None]:
# Collect every "results.csv" found in a sub-directory of res_dir.
# A single os.walk pass already visits the whole tree; nesting a second walk
# inside it re-visits each sub-tree once per ancestor and yields duplicates.
# The root folder itself is skipped, matching the original sub-directory-only scan.
csv_paths = sorted(
    os.path.join(dirpath, "results.csv")
    for dirpath, _, filenames in os.walk(res_dir)
    if dirpath != res_dir and "results.csv" in filenames
)

In [None]:
# De-duplicate the paths; sorting makes the load order (and therefore the row
# order of the combined frame) reproducible instead of hash-dependent.
csv_paths = sorted(set(csv_paths))
csv_paths

In [None]:
# Read every results file once and concatenate in a single call; growing the
# frame with pd.concat inside the loop copies all previous rows each iteration.
frames = [pd.read_csv(csv_file, sep=",") for csv_file in csv_paths]

# pd.concat raises on an empty list, so fall back to an empty frame when no files were found.
main_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Number of CSV files collected in csv_paths.
len(csv_paths)

In [None]:
# Row count followed by the column / dtype summary of the combined frame.
print(len(main_df))
main_df.info()

In [None]:
# Number of rows whose invalid_fn_type is "random".
len(main_df[main_df["invalid_fn_type"] == "random"])

In [None]:
# Non-null test_passing_completion entries per (num_shots, invalid_fn_type) group.
main_df.groupby(["num_shots", "invalid_fn_type"])["test_passing_completion"].count()

In [None]:
# Non-null num_invalid entries per invalid_fn_type.
main_df.groupby(["invalid_fn_type"])["num_invalid"].count()

In [None]:
# Per (num_shots, invalid_fn_type): number of passing completions / explanations
# and the number of examples (non-null org_func entries).
group_keys = ["num_shots", "invalid_fn_type"]
aggregations = {
    "test_passing_completion": "sum",
    "test_passing_explanation": "sum",
    "org_func": "count",
}

pivot_df = main_df.pivot_table(
    index=group_keys,
    values=list(aggregations),
    aggfunc=aggregations,
)

In [None]:
# Display the aggregated sums and counts.
pivot_df

In [None]:
# Turn the per-group pass counts into pass rates and keep the group size as n_examples.
example_counts = pivot_df["org_func"]
final_df = (
    pivot_df
    .drop(columns=["org_func"])
    .div(example_counts, axis=0)          # both pass-count columns divided row-wise by the group size
    .assign(n_examples=example_counts)
)

In [None]:
# Display the per-group pass rates.
final_df

In [None]:
# Unweighted mean of each rate column over the rows of final_df.
print(final_df["test_passing_completion"].mean())
print(final_df["test_passing_explanation"].mean())


In [None]:
# final_df is indexed by (num_shots, invalid_fn_type). Writing it with index=False
# would drop those group keys from the file, so move them into columns first.
final_df.reset_index().to_csv(os.path.join(res_dir, "1018_q2_1_agg_ns4,6,8,10.csv"), index=False)

In [None]:
# Move the index levels of final_df into ordinary columns for plotting, then preview.
ungrouped_df = final_df.reset_index()
ungrouped_df.head()

In [None]:
# Two side-by-side line charts (completion / explanation pass rate) with one
# line per invalid function type, plotted against the number of shots.

pylab.rcParams.update(_DEFAULT_PARAMS)

fig, axis = plt.subplots(1, ncols=2, figsize=(12, 6), sharey="row")

cols = ["test_passing_completion", "test_passing_explanation"]

# invalid_fn_type value -> legend label, in drawing order
series_labels = {
    "exclude_class": "exclude_class",
    "same_class": "same_class",
    "random": "random_class",
}

# Evenly spaced x positions, one per distinct num_shots value
shot_values = sorted(ungrouped_df["num_shots"].unique())
shot_position = {shots: pos for pos, shots in enumerate(shot_values)}
x = np.arange(len(shot_values))

for idx, col in enumerate(cols):
    ax = axis[idx]

    for invalid_type, label in series_labels.items():
        subset = ungrouped_df[ungrouped_df["invalid_fn_type"] == invalid_type]
        # Place each point at the position of its own num_shots value rather than
        # assuming every type has a row for every shot count.
        ax.plot(subset["num_shots"].map(shot_position), subset[col], label=label, marker="o")

    ax.set_xlabel('Number of Shots')

    # The y axis is shared, so only the left panel carries the label and legend
    if idx == 0:
        ax.set_ylabel('Test Passing Rate')
        ax.legend(loc='lower right')
    ax.set_xticks(x)
    ax.set_xticklabels(shot_values)
    ax.title.set_text(col)
    ax.set_ylim(bottom=0.2, top=1)

fig.suptitle("Test Passing by Number of Shots and Invalid Function Type", fontsize="x-large")

plt.tight_layout()
fig.savefig("1018_logprob_rate.pdf", format='pdf', bbox_inches='tight')

In [None]:
# Column names available in the combined frame.
main_df.columns

In [None]:
# Per (num_shots, invalid_fn_type): passing completions and number of examples.
completion_aggs = {
    "test_passing_completion": "sum",
    "org_func": "count",
}

pivot_df = main_df.pivot_table(
    index=["num_shots", "invalid_fn_type"],
    values=list(completion_aggs),
    aggfunc=completion_aggs,
)

pivot_df.head()

In [None]:
# Completion pass rate per group (3 decimals) plus the group size as n_examples.
n_examples = pivot_df["org_func"]
final_df = pivot_df.drop(columns=["org_func"]).assign(
    test_passing_completion=lambda frame: (frame["test_passing_completion"] / n_examples).round(3),
    n_examples=n_examples,
)

In [None]:
# Flatten the grouped rates into ordinary columns and preview the first rows.
df = final_df.reset_index()
df.head()

In [None]:

## LINE Plot
# Completion pass rate across number of shots, one line per invalid_fn_type.
# NOTE: `df` is rebound to whatever `main_df` currently holds (no copy is made).
df = main_df

fig = plt.figure(figsize=_DEFAULT_PARAMS.get('figure.figsize'))

# Order in which the hue levels (invalid_fn_type values) are drawn / listed
hue_order = ["random", "exclude_class", "same_class"]

# Fixed color per invalid_fn_type value
custom_palette = {
    'random': "blue",
    'exclude_class': "green",
    'same_class': "orange",

}

# Draw test_passing_completion against num_shots, one line per hue level.
# NOTE(review): errorbar=("sd", 0.0) presumably scales the sd band to zero width — confirm against the seaborn docs.
g = sns.lineplot(data=df, x='num_shots',
             y="test_passing_completion",
             hue='invalid_fn_type',
             hue_order=hue_order,
             palette=custom_palette,
             marker="o",
             errorbar=("sd", 0.0),
             #legend=False,
             )

# Add labels and title; x ticks and y range are hard-coded for this experiment
plt.xlabel('Number of Shots')
plt.xticks([4, 6, 8, 10])
plt.ylabel('Rate')
plt.ylim(bottom=0.72, top=0.90)
plt.title('Rate for Correct Completions Assigned Consistently Non-trivial Mass')
leg = plt.legend(fontsize=16, title="invalid func type")
plt.setp(leg.get_title(), fontsize='x-large')
plt.tight_layout()
fig.savefig("1018_logprob_rate_completions.pdf", format='pdf', bbox_inches='tight')

# Logprob Distribution

In [None]:
# Collect every "logprobs.csv" found in a sub-directory of res_dir.
# One os.walk pass covers the whole tree; the nested walk visited each sub-tree
# once per ancestor and produced duplicate paths. The root folder itself is
# skipped, matching the original sub-directory-only scan.
csv_paths = sorted(
    os.path.join(dirpath, "logprobs.csv")
    for dirpath, _, filenames in os.walk(res_dir)
    if dirpath != res_dir and "logprobs.csv" in filenames
)

In [None]:
# De-duplicate the paths; sorting keeps the load order reproducible across kernel restarts.
csv_paths = sorted(set(csv_paths))
csv_paths

In [None]:
# Read every logprobs file and combine them into one frame.
frames = []

for csv_file in csv_paths:
    try:
        frames.append(pd.read_csv(csv_file, sep=","))
    except (OSError, ValueError):
        # Missing/unreadable files raise OSError; parser, empty-file and decoding
        # errors are ValueError subclasses. Report and skip the file: falling
        # through here would re-append the previously loaded frame.
        print("Error reading file: {}".format(csv_file))

# pd.concat raises on an empty list, so fall back to an empty frame.
main_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Number of logprobs CSV files collected in csv_paths.
len(csv_paths)

In [None]:
# Row count followed by the column / dtype summary of the combined logprobs frame.
print(len(main_df))
main_df.info()

In [None]:
# Persist the combined logprobs frame inside res_dir (row index omitted).
main_df.to_csv(os.path.join(res_dir, "1018_q2_1_logprobs_agg_ns4,6,8,10.csv"), index=False)

In [None]:
# Count how often each combination of the four indicator columns occurs.
main_df[["valid_and_pred", "invalid_and_pred","valid_and_not_pred", "invalid_and_not_pred"]].value_counts()

In [None]:
# Rename the "valid"/"invalid" vocabulary to "correct"/"incorrect": the new columns
# are appended at the end and the original validity columns are dropped.
validity = main_df["valid"]
main_df = main_df.assign(
    # any value other than "valid" / "invalid" is carried over unchanged
    correct=validity.mask(validity == "valid", "correct").mask(validity == "invalid", "incorrect"),
    correct_and_pred=main_df["valid_and_pred"],
    incorrect_and_pred=main_df["invalid_and_pred"],
    correct_and_not_pred=main_df["valid_and_not_pred"],
    incorrect_and_not_pred=main_df["invalid_and_not_pred"],
).drop(columns=["valid", "valid_and_pred", "invalid_and_pred", "valid_and_not_pred", "invalid_and_not_pred"])

In [None]:
# Derive "class_label" from the four indicator columns: the first indicator
# (in the order below) that equals 1 names the row's class; rows where none
# equals 1 get None.
choices = ['correct_and_pred', 'incorrect_and_not_pred', 'correct_and_not_pred', 'incorrect_and_pred']

conditions = [main_df[indicator] == 1 for indicator in choices]

main_df.loc[:, 'class_label'] = np.select(conditions, choices, default=None)

In [None]:
# Count each combination of the four indicator columns within every num_shots group.
main_df.groupby(["num_shots"])[['correct_and_pred', 'correct_and_not_pred', 'incorrect_and_pred', 'incorrect_and_not_pred',]].value_counts()

In [None]:
# Persist the frame including the class_label column inside res_dir (row index omitted).
main_df.to_csv(os.path.join(res_dir, "1018_q2_1_logprobs_w_class_labels.csv"), index=False)

In [None]:
# Non-null logprob entries per (num_shots, invalid_fn_type, response_type, correct) group.
main_df.groupby(["num_shots", "invalid_fn_type", "response_type", "correct"])["logprob"].count()

In [None]:
# Frequency of each num_valid value.
main_df["num_valid"].value_counts()

In [None]:
# Percentage share of each class label within every num_shots group, restricted
# to completion rows of one invalid function type.
fn_type = "random"
conditions = (main_df["response_type"] == "completion") & (main_df["num_shots"] > 2) & (main_df["invalid_fn_type"] == fn_type) & (main_df["correct"] != "pred")

# Filter and group once; the raw value_counts() statement that used to sit here
# was never displayed (only the last expression of a cell is shown).
labels_by_shots = main_df[conditions].groupby(["num_shots"])["class_label"]
round(labels_by_shots.value_counts() / labels_by_shots.count() * 100, 2)

In [None]:
# Percentage share of each `correct` value within every num_shots group (2 decimals),
# for the rows selected by the current `conditions` mask.
round(main_df[conditions].groupby(["num_shots"])["correct"].value_counts() / main_df[conditions].groupby(["num_shots"])["correct"].count() * 100, 2)

In [None]:

# Summary statistics of logprob per (num_shots, invalid_fn_type, response_type, correct).
# Aggregations are given by name: passing the np.mean callable is deprecated in
# recent pandas, which asks for the string "mean" instead.
pivot_df = main_df.pivot_table(
    index=["num_shots", "invalid_fn_type", "response_type", "correct"],
    values=[
        "logprob",
        ],
    aggfunc={
        "logprob": ["mean", "std", "count", "min", "max"],
    })

# Flatten the (column, aggfunc) MultiIndex into names such as "mean_logprob"
pivot_df.columns = [f'{aggfunc}_{column}' for column, aggfunc in pivot_df.columns]

In [None]:
# Display the logprob summary statistics.
pivot_df

### Completion

In [None]:
## DENSITY Plot
# Log-probability density per class label for completions at one num_shots
# value, pooled over all invalid function types.
params = dict(_DEFAULT_PARAMS)
pylab.rcParams.update(params)

num_shots = 8
model = "text-davinci-003"
response_type = "completion"

conditions = (main_df['response_type'] == response_type) & (main_df["correct"] != "pred") & (main_df['num_shots'] == num_shots) & (main_df["model"] == model)
df = main_df[conditions]

fig = plt.figure(figsize=params.get('figure.figsize'))

# Order of the hue levels (class labels) in the plot and its legend
hue_order = ['correct_and_not_pred', 'incorrect_and_not_pred', 'correct_and_pred', 'incorrect_and_pred',]

# Fixed color per class label
custom_palette = {
    'correct_and_not_pred': 'green',
    'correct_and_pred': 'blue',
    'incorrect_and_not_pred': 'orange',
    'incorrect_and_pred': 'red',
}

# Per-class density histograms with KDE overlay. hue_order is passed explicitly
# so the level order no longer depends on the order rows appear in the data.
ax = sns.histplot(data=df, x='logprob', hue='class_label', hue_order=hue_order, palette=custom_palette,
                  bins=15, common_norm=False, kde=True, stat='density',
                  alpha=0.2, linewidth=.15)

# Add labels and title
plt.xlabel('Log Probability')
plt.ylim(top=0.55)
plt.xlim(left=-35)
plt.ylabel('Normalized Density')

# Restyle the legend seaborn built instead of calling plt.legend(labels=...):
# that call pairs labels with artists purely by position, so a change in data
# order silently attaches the wrong label to a color.
if ax.get_legend() is not None:
    sns.move_legend(ax, "upper left", title="class label", fontsize=16, title_fontsize='x-large')

plt.title('Distribution of Log Probabilities by Class Label for Completion (num_shots = {})'.format(num_shots))
plt.tight_layout()
fig.savefig("1018_logprob_distribution.pdf", format='pdf', bbox_inches='tight')

In [None]:
# Median logprob per class label in the current `df` subset.
df.groupby(["class_label"])["logprob"].median()

In [None]:
## DENSITY Plot
# Log-probability density of correct vs. incorrect completions for one
# num_shots value and one invalid function type.
num_shots = 8
model = "text-davinci-003"
response_type = "completion"
fn_type = "random"

conditions = (main_df['num_shots'] == num_shots) & (main_df['invalid_fn_type'] == fn_type) & (main_df['response_type'] == response_type) & (main_df["correct"] != "pred") & (main_df["model"] == model)
df = main_df[conditions]

fig = plt.figure(figsize=_DEFAULT_PARAMS.get('figure.figsize'))

# Order of the hue levels in the plot and its legend
hue_order = ['correct', 'incorrect']

# Fixed color per correctness value
custom_palette = {
    'correct': 'blue',
    'incorrect': 'orange',
}

# Jointly normalized density histograms with KDE overlay. hue_order is passed
# explicitly so the level order does not depend on row order in the data.
ax = sns.histplot(data=df, x='logprob', hue='correct', hue_order=hue_order, palette=custom_palette,
                  bins=50, common_norm=True, kde=True, stat='density',
                  alpha=0.2, linewidth=.15)

# Add labels and title
plt.xlabel('Log Probability')
plt.ylabel('Normalized Density')
plt.title('Normalized Distribution of Log Probabilities by Correctness for Completion (num_shots = {})'.format(num_shots))
plt.ylim(top=0.185)

# Keep seaborn's own legend (labels are bound to the hue levels) and only drop
# its title; plt.legend(labels=[...]) assigns labels to artists by position and
# can swap "correct" / "incorrect" when the data order changes.
if ax.get_legend() is not None:
    sns.move_legend(ax, "best", title="")

# Lay out after all limits and the legend are final
plt.tight_layout()
fig.savefig("1018_normalized_logprob_distribution.pdf", format='pdf', bbox_inches='tight')


In [None]:
# Indicator columns; each one is drawn as its own histogram on every panel
cols = ["correct_and_pred", "incorrect_and_not_pred", "correct_and_not_pred", "incorrect_and_pred",]

# 2x2 grid of panels, one per num_shots value (columns share their x axis)
fig, axis = plt.subplots(2, 2, figsize=(20, 12), sharex="col")
ax_loc = [(0, 0), (0, 1), (1, 0), (1, 1)]

response_type = "completion"
shots = sorted(main_df["num_shots"].unique())

model = "text-davinci-003"
df = main_df[(main_df['response_type'] == response_type) & (main_df["correct"] != "pred") & (main_df["model"] == model)]

# zip() stops at the shorter input, so at most four num_shots values get a panel
for n_shot, loc in zip(shots, ax_loc):
    ax = axis[loc]

    # For each indicator column, create a histogram
    for col in cols:

        # select rows for n_shot where this indicator equals 1
        valid_df = df[(df["num_shots"] == n_shot) & (df[col] == 1)]

        # NOTE(review): no emptiness check is performed; the histogram is drawn for whatever rows were selected
        sns.histplot(valid_df['logprob'], kde=True, label=col, stat="density", common_norm=False,
                alpha=0.2, linewidth=.15, ax=ax)

    # Only the bottom row carries an x label
    if loc[0] == 1:
        ax.set_xlabel('Log Probability')

    ax.title.set_text(f"num_shots = {n_shot}")
    ax.legend(loc='upper left')

# Figure-level title, with room reserved above the panels
st = fig.suptitle(f"Log Probability Distribution by Validity for '{response_type}' across num_shots", fontsize="x-large")
st.set_y(0.95)
fig.subplots_adjust(top=0.85)

# Display the plot
plt.show()

In [None]:
## DENSITY Plot
# Completion log-probability density per class label for a single num_shots
# value and a single invalid function type
num_shots = 10
model = "text-davinci-003"
response_type = "completion"
invalid_fn = "random"
# & (main_df['invalid_fn_type'] == invalid_fn)
df = main_df[(main_df['response_type'] == response_type) & (main_df["correct"] != "pred") & (main_df['num_shots'] == num_shots)  & (main_df["model"] == model) & (main_df['invalid_fn_type'] == invalid_fn)]

plt.figure(figsize=(8, 5))

# Intended order of the hue levels (class labels)
# NOTE(review): hue_order is defined here but not passed to sns.histplot below
#hue_order = df['class_label'].unique().tolist()
hue_order = ['correct_and_pred', 'correct_and_not_pred', 'incorrect_and_pred', 'incorrect_and_not_pred', ]

# Fixed color per class label
custom_palette = {
    'correct_and_not_pred': 'green',
    'correct_and_pred': 'blue',
    'incorrect_and_not_pred': 'orange',
    'incorrect_and_pred': 'red',

}

# Density histograms with KDE overlay, each class normalized on its own (common_norm=False)
sns.histplot(data=df, x='logprob', hue='class_label', palette=custom_palette, bins=60, common_norm=False, kde=True, stat='density',
            alpha=0.2, linewidth=.15)

# Add labels and title
plt.xlabel('Log Probability')
plt.ylabel('Density')
plt.ylim(top=0.5)
#plt.legend(title="class label")
plt.title('Distribution of Log Probabilities by Class Label for Completion (num_shots = {})'.format(num_shots))
plt.tight_layout()

In [None]:
# Same subset as the preceding density cell, but normalized jointly across
# classes (common_norm=True). Relies on `df`, `custom_palette` and `num_shots`
# as they currently stand in the kernel.
plt.figure(figsize=(10, 6))

# Density histograms with KDE overlay, colored by class label
sns.histplot(data=df, x='logprob', hue='class_label', palette=custom_palette, bins=60, common_norm=True, kde=True, stat='density',
            alpha=0.2, linewidth=.15)

# Add labels and title (axis label spelling corrected from 'Noramlized')
plt.xlabel('Log Probability')
plt.ylabel('Normalized Density')
plt.title('Normalized Distribution of Log Probabilities by Class Label for Completion (num_shots = {})'.format(num_shots))
plt.tight_layout()

In [None]:
## DENSITY Plot
# Completion log-probability density per class label, pooled over every
# num_shots value and every invalid function type
# num_shots = 10
model = "text-davinci-003"
response_type = "completion"
# invalid_fn = "random"
# & (main_df['invalid_fn_type'] == invalid_fn)
df = main_df[(main_df['response_type'] == response_type) & (main_df["correct"] != "pred") & (main_df["model"] == model)]

plt.figure(figsize=(8, 5))

# Intended order of the hue levels (class labels)
# NOTE(review): hue_order is defined here but not passed to sns.histplot below
hue_order = ['correct_and_pred', 'correct_and_not_pred', 'incorrect_and_pred', 'incorrect_and_not_pred', ]

# Fixed color per class label
custom_palette = {
    'correct_and_not_pred': 'green',
    'correct_and_pred': 'blue',
    'incorrect_and_not_pred': 'orange',
    'incorrect_and_pred': 'red',

}

# Density histograms with KDE overlay, each class normalized on its own (common_norm=False)
sns.histplot(data=df, x='logprob', hue='class_label', palette=custom_palette, bins=80, common_norm=False, kde=True, stat='density',
            alpha=0.2, linewidth=.15)

# Add labels and title
plt.xlabel('Log Probability')
plt.ylabel('Density')
plt.ylim(top=0.65)
plt.title('Distribution of Log Probabilities by Class Label for Completion (num_shots = {})'.format("all"))
plt.tight_layout()

In [None]:
## COUNT Plot
# Raw-count histograms of completion log probabilities, one per indicator column

cols = ["correct_and_pred", "incorrect_and_not_pred", "correct_and_not_pred", "incorrect_and_pred",]

# Select completion rows for one num_shots value and one model
# (unlike the density cells, rows with correct == "pred" are not filtered out here)
num_shots = 10
model = "text-davinci-003"
response_type = "completion"

df = main_df[(main_df['response_type'] == response_type) & (main_df['num_shots'] == num_shots) & (main_df["model"] == model)]

plt.figure(figsize=(10, 6))

# For each indicator column, create a histogram of the rows where it equals 1
for col in cols:
    valid_df = df[df[col] == 1]

    # NOTE(review): no emptiness check is performed; the histogram is drawn for whatever rows were selected
    sns.histplot(valid_df['logprob'], kde=True, label=col, stat="count", common_norm=False,
            alpha=0.2, linewidth=.15)

plt.xlabel('Log Probability')
plt.ylabel('Count')
plt.title('Log Probability Distribution by Validity for "{}" (num_shots = {})'.format(response_type, num_shots))
plt.legend()

plt.tight_layout()
plt.show()


## KL Divergence

In [None]:
from scipy.special import rel_entr
from scipy.ndimage import gaussian_filter


def _bin_masses(values, bins, sigma_smoothing):
    """Return the probability mass of `values` in each bin, optionally Gaussian-smoothed."""
    counts, _ = np.histogram(values, bins=bins)
    masses = counts / counts.sum()
    if sigma_smoothing is not None:
        # Spread mass into neighbouring bins to fill empty ones, then renormalize to sum 1
        masses = gaussian_filter(masses, sigma_smoothing)
        masses = masses / masses.sum()
    return masses


def calculate_kl_for_num_shots(main_df, p_label="correct_and_pred", q_label="correct_and_not_pred", num_shots = None, nbins=40, sigma_smoothing=1):
    """Histogram-based estimate of KL(P || Q), in nats, between two class labels.

    Only rows with invalid_fn_type == "random", response_type == "completion"
    and model == "text-davinci-003" are used. P and Q are the "logprob" values
    of the rows whose "class_label" equals `p_label` / `q_label`.

    Parameters
    ----------
    main_df : DataFrame with the columns named above plus "num_shots".
    p_label, q_label : class labels selecting P and Q.
    num_shots : if given, restrict to rows with this num_shots value.
    nbins : number of bin edges (so nbins - 1 bins) spanning the joint range of P and Q.
    sigma_smoothing : sigma of the Gaussian filter applied to both histograms,
        or None for no smoothing.

    Returns
    -------
    float : sum over bins with Q > 0 of P * ln(P / Q), rounded to 3 decimals.

    Raises
    ------
    ValueError : if P or Q has no logprob values after filtering.
    """
    conditions = (
        (main_df["invalid_fn_type"] == "random")
        & (main_df["response_type"] == "completion")
        & (main_df["model"] == "text-davinci-003")
    )
    if num_shots is not None:
        conditions = conditions & (main_df["num_shots"] == num_shots)

    p_values = main_df.loc[conditions & (main_df["class_label"] == p_label), "logprob"].dropna()  # P
    q_values = main_df.loc[conditions & (main_df["class_label"] == q_label), "logprob"].dropna()  # Q
    if p_values.empty or q_values.empty:
        raise ValueError(
            "no logprob values for p_label={!r} / q_label={!r} (num_shots={!r})".format(p_label, q_label, num_shots)
        )

    # Common bin edges covering both samples, ascending
    lowest = min(p_values.min(), q_values.min())
    highest = max(p_values.max(), q_values.max())
    bins = np.linspace(lowest, highest, num=nbins)

    # Work with per-bin probability masses: summing rel_entr over densities
    # would scale the result by 1 / bin-width.
    p_mass = _bin_masses(p_values, bins, sigma_smoothing)
    q_mass = _bin_masses(q_values, bins, sigma_smoothing)

    # Bins where Q is zero are left out (their term would be infinite). Index with
    # the mask rather than passing where= to the ufunc, which leaves the masked
    # entries uninitialized when no out= array is supplied.
    usable = q_mass > 0
    return round(float(np.sum(rel_entr(p_mass[usable], q_mass[usable]))), 3)


def print_kl_divergences(p_label, q_label, df=None):
    """Print KL(P || Q) for two class labels under several settings.

    Six values are printed: pooled over all num_shots and for num_shots=8,
    each without smoothing, with the default smoothing, and with sigma=2.

    Parameters
    ----------
    p_label, q_label : class labels passed to calculate_kl_for_num_shots.
    df : frame to analyse; defaults to the notebook-level `main_df`.
    """
    if df is None:
        df = main_df

    # scipy's rel_entr uses the natural logarithm, so the values are in nats
    print("KL(P||Q) in nats between P := {} and Q := {}".format(p_label, q_label))

    smoothing_variants = (
        ("w/o smoothing", {"sigma_smoothing": None}),
        ("w/ smoothing", {}),                        # function default
        ("w/ 2-sigma smoothing", {"sigma_smoothing": 2}),
    )
    scopes = (
        ("Across num_shots", {}),
        ("num_shots=8", {"num_shots": 8}),
    )

    for scope_idx, (scope_text, scope_kwargs) in enumerate(scopes):
        if scope_idx > 0:
            print()  # blank line between the two groups
        for variant_text, variant_kwargs in smoothing_variants:
            value = calculate_kl_for_num_shots(df, p_label=p_label, q_label=q_label, **scope_kwargs, **variant_kwargs)
            print("{} & {}: ".format(scope_text, variant_text), value)

In [None]:
# P and Q are the same class label here, i.e. a distribution is compared with itself.
p_label = "correct_and_pred"
q_label = "correct_and_pred"

print_kl_divergences(p_label, q_label)

In [None]:
# P = correct_and_pred, Q = correct_and_not_pred
p_label = "correct_and_pred"
q_label = "correct_and_not_pred"

print_kl_divergences(p_label, q_label)

In [None]:
# P = correct_and_pred, Q = incorrect_and_not_pred
p_label = "correct_and_pred"
q_label = "incorrect_and_not_pred"

print_kl_divergences(p_label, q_label)

In [None]:
# P = correct_and_pred, Q = incorrect_and_pred
p_label = "correct_and_pred"
q_label = "incorrect_and_pred"

print_kl_divergences(p_label, q_label)

In [None]:
# P = correct_and_not_pred, Q = correct_and_pred (the reverse direction of an earlier pair)
p_label = "correct_and_not_pred"
q_label = "correct_and_pred"
print_kl_divergences(p_label, q_label)

In [None]:
# P = correct_and_not_pred, Q = incorrect_and_not_pred
p_label = "correct_and_not_pred"
q_label = "incorrect_and_not_pred"
print_kl_divergences(p_label, q_label)