In [None]:
import numpy as np
from sklearn import metrics
import pandas as pd

# Models whose consistency evaluation CSVs are loaded in the next cell.
models = ["gpt-3.5-turbo", "gpt-4-0314", "text-davinci-003"]
# Run names; each one is a sub-directory of ./consistency/ in the CSV path.
runs = ["final_run_2", "final_run_3", "final_run_4"]



In [None]:

total_sequences = 225


def _score_consistency_rows(df, answer_validity):
    """Convert one evaluation frame into the 0/1 score lists summarised below.

    Args:
        df: frame with the columns ``generated_completion_matches``,
            ``model_completion_matches`` and
            ``model_self_consistency_evaluation``.
        answer_validity: when ``'valid'``, the verdict-based lists only use
            rows whose stripped verdict is exactly "Y" or "N"; any other
            value keeps every row.

    Returns:
        dict of lists of ints (0 or 1) with the keys ``match_accs``,
        ``match_accs_self_consistent``, ``model_match_accs``,
        ``model_consistency_accs``, ``consistent_and_matched_positive`` and
        ``consistent_and_matched_negative``.
    """
    # Cast before stripping so a missing / non-string verdict is treated as
    # "neither Y nor N" instead of raising on `.strip()`.
    verdict = df["model_self_consistency_evaluation"].astype(str).str.strip()
    says_yes = verdict == "Y"
    says_no = verdict == "N"

    # `== True` (not truthiness) so a missing cell never counts as a match;
    # the same mask drives both the accuracy and the matched/unmatched split.
    gt_match = df["generated_completion_matches"] == True  # noqa: E712
    model_match = df["model_completion_matches"] == True  # noqa: E712

    if answer_validity == 'valid':
        keep = says_yes | says_no
    else:
        keep = pd.Series(True, index=df.index)

    def as_ints(flags):
        return flags.astype(int).tolist()

    return {
        "match_accs": as_ints(gt_match),
        "match_accs_self_consistent": as_ints(says_yes),
        "model_match_accs": as_ints(model_match),
        "model_consistency_accs": as_ints(says_yes[keep]),
        # Among ground-truth matches: did the model also answer "Y"?
        "consistent_and_matched_positive": as_ints(says_yes[keep & gt_match]),
        # Among ground-truth mismatches: did the model also answer "N"?
        "consistent_and_matched_negative": as_ints(says_no[keep & ~gt_match]),
    }


consistency_results = []
for answer_validity in ['valid', 'all']:
    for model in models:
        for run in runs:
            df = pd.read_csv(
                    f'./consistency/{run}/compute_dependence_with_base_changes=False,sequence_completion_capability=False,sequence_completion_equality.model={model},string_transformation_completion_equality=False/evaluate_sequence_completion_equality/sequence_completion_equality_evaluation_{model}.csv'
            )

            scores = _score_consistency_rows(df, answer_validity)
            match_accs = scores["match_accs"]
            match_accs_self_consistent = scores["match_accs_self_consistent"]
            model_match_accs = scores["model_match_accs"]
            model_consistency_accs = scores["model_consistency_accs"]
            consistent_and_matched_positive = scores["consistent_and_matched_positive"]
            consistent_and_matched_negative = scores["consistent_and_matched_negative"]

            # All of these are fractions in [0, 1] (NaN when a list is empty).
            ground_truth_consistent = np.mean(match_accs)
            self_rule_following_consistency = np.mean(model_match_accs)
            self_comparison_consistency = np.mean(model_consistency_accs)
            consistent_and_matched_positive_acc = np.mean(consistent_and_matched_positive)
            consistent_and_matched_negative_acc = np.mean(consistent_and_matched_negative)
            # Ground-truth match is the reference label, the model's "Y" the prediction.
            self_consistency_precision = metrics.precision_score(match_accs, match_accs_self_consistent)
            self_consistency_recall = metrics.recall_score(match_accs, match_accs_self_consistent)
            self_consistency_f1 = metrics.f1_score(match_accs, match_accs_self_consistent)
            # `:.2%` scales the fraction by 100, so the printed number really is a percentage.
            print(
                f"""
                For {run} run {model} including {answer_validity} answers
                Evaluated {len(df)} ambiguous sequences of {total_sequences} total.
                Resulting in:
                - {ground_truth_consistent:.2%} ground-truth-consistent (using {len(match_accs)})
                - {self_rule_following_consistency:.2%} self-rule-following-consistency (using {len(model_match_accs)})
                - {self_comparison_consistency:.2%} self-comparison-consistency (using {len(model_consistency_accs)})
                - {consistent_and_matched_positive_acc:.2%} self-comparison-consistency==Y and ground-truth-consistent. (using {len(consistent_and_matched_positive)})
                - {consistent_and_matched_negative_acc:.2%} self-comparison-consistency==N and not ground-truth-consistent. (using {len(consistent_and_matched_negative)})
                - {self_consistency_precision} precision
                - {self_consistency_recall} recall
                - {self_consistency_f1} f1
                """
            )

            consistency_results.append({
                "model": model,
                "run": run,
                "answer_validity": answer_validity,
                "ground_truth_consistent": round(ground_truth_consistent * 100, 2),
                "ground_truth_consistent_num": len(match_accs),
                "self_rule_following_consistency":  round(self_rule_following_consistency * 100, 2),
                "self_rule_following_consistency_len": len(model_match_accs),
                "self_comparison_consistency": round(self_comparison_consistency * 100, 2),
                "self_comparison_consistency_len": len(model_consistency_accs),
                "consistent_and_matched_positive": round(consistent_and_matched_positive_acc * 100, 2),
                "consistent_and_matched_positive_len": len(consistent_and_matched_positive),
                "consistent_and_matched_negative": round(consistent_and_matched_negative_acc * 100, 2),
                "consistent_and_matched_negative_len": len(consistent_and_matched_negative),
                "self_consistency_precision": self_consistency_precision,
                "self_consistency_recall": self_consistency_recall,
                "self_consistency_f1": self_consistency_f1,
            })

# One row per (answer_validity, model, run); percentages are rounded to 2 dp.
consistency_df = pd.DataFrame(consistency_results)
consistency_df.to_csv('./q0_consistency_results_final.csv')

In [None]:
def _score_slice(column, label):
    """Select model/run plus one metric column, rename it to 'score' and tag it with `label`."""
    return (
        consistency_df[['model', 'run', column]]
        .rename(columns={column: 'score'})
        .assign(score_type=label)
    )


new_df_1 = _score_slice('ground_truth_consistent', 'Ground truth consistent')
new_df_2 = _score_slice('self_rule_following_consistency', 'Self-rule following consistency')
new_df_3 = _score_slice('self_comparison_consistency', 'Self comparison consistency')

# Stack the three slices into long format, order by run then score, and
# relabel the columns with the names used on the plot.
# NOTE(review): consistency_df holds both 'valid' and 'all' rows, so every
# model/run appears twice per measure here — confirm that is intended.
new_df = (
    pd.concat([new_df_1, new_df_2, new_df_3])
    .sort_values(by=['run', 'score'])
    .rename(columns={'run': 'sequence length', 'score_type': 'Consistency measure'})
)
# Keep only the part of the run name after its last underscore.
new_df['sequence length'] = new_df['sequence length'].map(lambda run_name: run_name.split('_')[-1])

In [None]:
import seaborn as sns

In [None]:
# Exclude the self-rule-following rows; only the other measures are kept.
new_df = new_df.loc[new_df['Consistency measure'].ne('Self-rule following consistency')]

In [None]:
# Bar chart of score per model, one column of panels per sequence length,
# bars coloured by consistency measure.
ax = sns.catplot(
    data=new_df,
    kind="bar",
    x="model",
    y="score",
    hue="Consistency measure",
    col="sequence length",
)
ax.set(xlabel='', ylabel='Consistency')



In [None]:
import numpy as np
from sklearn import metrics
import pandas as pd
# NOTE: this cell rebinds `models`, `runs` and `total_sequences`, which the
# consistency cell above also sets. Re-running that cell after this one would
# pick up these values instead of its own.
models = [
    "davinci", "gpt-3.5-turbo", "gpt-4-0314", "text-davinci-003"
]
runs = [
    'run_1', 'run_2', 'run_3'
]
total_sequences = 139
capability_results = []
for model in models:
    for run in runs:
        df = pd.read_csv(
                f'./capability/{run}/compute_dependence_with_base_changes=False,sequence_completion_capability.model={model},sequence_completion_equality=False,string_transformation_completion_equality=False/evaluate_sequence_completion_capability/sequence_completion_capability_evaluation_{model}.csv'
        )

        # Column-wise comparison instead of a per-row iterrows() loop:
        # a cell scores 1 only when it equals True, otherwise 0.
        rule_accs = (df["generated_rule_matches"] == True).astype(int)  # noqa: E712
        completion_accs = (df["generated_completion_matches"] == True).astype(int)  # noqa: E712

        # Accuracies are stored as percentages rounded to two decimals.
        result = {
            "model": model,
            "run": run,
            "rule_matches_sequence": round(np.mean(rule_accs) * 100, 2),
            "completion_is_correct": round(np.mean(completion_accs) * 100, 2),
            "len_completed": len(df)
        }
        print(result)
        capability_results.append(result)

# One row per (model, run).
capability_df = pd.DataFrame(capability_results)
capability_df.to_csv('./q0_capability_results.csv')

In [None]:
# Per-model averages of the 'valid'-answer consistency metrics joined with the
# per-model capability averages. `numeric_only=True` is required because both
# frames carry string columns ('run', 'answer_validity') that pandas >= 2.0
# refuses to average.
to_corr_df = (
    consistency_df[consistency_df.answer_validity == 'valid']
    .groupby('model')
    .mean(numeric_only=True)
    .merge(
        capability_df.groupby('model').mean(numeric_only=True),
        on=['model'],
    )
)
# Pairwise correlation across models. Each metric is listed once; repeating a
# label would only duplicate rows/columns in the resulting matrix.
corr_df = to_corr_df[[
    'rule_matches_sequence', 'completion_is_correct',
    'ground_truth_consistent', 'ground_truth_consistent_num',
    'self_rule_following_consistency',
    'self_comparison_consistency',
    'self_consistency_precision',
    'self_consistency_recall', 'self_consistency_f1',
]].corr()
corr_df.to_csv('./q0_capability_consistency_corr.csv')

In [None]:
# Show the capability/consistency correlation matrix.
corr_df

In [None]:
pip install seaborn

In [None]:
import os
import sys

# Add '../..' (resolved against the working directory) to the import search
# path; presumably this is what makes the `src` package importable below.
_src_parent = os.path.join('../..')
sys.path.append(_src_parent)
from src.pipelines import sequence_completions

In [None]:
# Ambiguous integer sequences with num_steps_to_check=3. Later cells use the
# result as a dict (merged with `|`, iterated via .items()) whose values are
# sequences of candidate functions; the helper itself is not visible here.
b = sequence_completions.find_ambiguous_integer_sequences(num_steps_to_check=3)

In [None]:
# Same lookup with num_steps_to_check=2.
a = sequence_completions.find_ambiguous_integer_sequences(num_steps_to_check=2)

In [None]:
# Same lookup with num_steps_to_check=4.
c = sequence_completions.find_ambiguous_integer_sequences(num_steps_to_check=4)

In [None]:
# Total number of entries across all value lists of `c`.
sum(len(candidates) for candidates in c.values())

In [None]:
# compute capability v consistency
import random

# Private, seeded generator so the simulated points are identical on every
# Restart & Run All (the seed value itself is arbitrary).
_rng = random.Random(0)

consistency_scores = []
points = []
# Dict union: on duplicate keys the right-most dict's value wins.
ambigs = a | b | c
# Loop-invariant: number of candidate functions summed over all sequences.
total_fns = sum(len(fns) for fns in ambigs.values())
for seq, fns in ambigs.items():
    # 100 random pairings for every candidate function of this sequence;
    # a pairing scores 1 when both draws picked the same function.
    for _ in range(len(fns) * 100):
        f1 = _rng.choice(fns)
        f2 = _rng.choice(fns)
        consistency_scores.append(1 if f1 == f2 else 0)
    if not consistency_scores:
        # Nothing sampled yet (sequences so far had no candidates): there is
        # no running average to record, and dividing would raise.
        continue
    # x: running agreement rate; y: share of all planned samples drawn so far.
    x = sum(consistency_scores) / len(consistency_scores)
    y = len(consistency_scores) / (total_fns * 100)
    points.append(
        [x * 100, y * 100]
    )


In [None]:
# Overall fraction of sampled pairs in which both draws were the same function.
sum(consistency_scores) / len(consistency_scores)

In [None]:
# Each entry is [x * 100, y * 100] as appended once per sequence in the loop above.
points

In [None]:
# 'accuracy' is the rule-match rate here; rows are tagged style='explanation'.
df_1 = (
    to_corr_df[['ground_truth_consistent', 'rule_matches_sequence', 'self_comparison_consistency']]
    .rename(columns={'rule_matches_sequence': 'accuracy'})
    .assign(style='explanation')
)

In [None]:
# 'accuracy' is the completion-correct rate here; rows are tagged style='completion'.
df_2 = (
    to_corr_df[['ground_truth_consistent', 'completion_is_correct', 'self_comparison_consistency']]
    .rename(columns={'completion_is_correct': 'accuracy'})
    .assign(style='completion')
)

In [None]:
# Stack both accuracy variants; the 'style' column tells them apart.
df_new = pd.concat([df_1, df_2])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(10, 5))

# Left panel: ground-truth consistency against accuracy (legends suppressed).
ax = sns.scatterplot(
    data=df_new,
    x='accuracy',
    y='ground_truth_consistent',
    hue='model',
    legend=False,
    ax=axes[0],
)
sns.lineplot(
    data=df_new,
    x='accuracy',
    y='ground_truth_consistent',
    style='style',
    legend=False,
    ax=axes[0],
)
# NOTE(review): the r value in the title is hard-coded, not computed here.
ax.set(
    xlabel='Accuracy',
    ylabel='Consistency',
    title='Ground truth consistency v Accuracy (r=0.96)',
)

# Right panel: self-comparison consistency against accuracy (legends kept).
ax = sns.scatterplot(
    data=df_new,
    x='accuracy',
    y='self_comparison_consistency',
    hue='model',
    ax=axes[1],
)
sns.lineplot(
    data=df_new,
    x='accuracy',
    y='self_comparison_consistency',
    style='style',
    ax=axes[1],
)
# NOTE(review): hard-coded r value, as in the left panel.
ax.set(
    xlabel='Accuracy',
    ylabel='Consistency (Self comparison)',
    title='Consistency (Self comparison) v Accuracy (r=-0.685)',
)

# Place the right panel's legend outside the axes area.
ax.legend(loc=(1.1, 0.5))
# Disabled overlay of the simulated points:
#plt.plot([point[1] for point in points[19:-21]], [point[0] for point in points[19:-21]])

In [None]:
# Per-model self-comparison consistency against sequence-completion accuracy.
ax = sns.scatterplot(
    data=to_corr_df,
    x='completion_is_correct',
    y='self_comparison_consistency',
    hue='model',
)
# NOTE(review): the r value in the title is hard-coded, not computed here.
ax.set(
    xlabel='Sequence completion accuracy',
    ylabel='Consistency (Self comparison)',
    title='Consistency (Self comparison) v Sequence completion accuracy (r=-0.49)',
)
# Disabled overlay of the simulated points:
#plt.plot([point[1] for point in points[53:-10]], [point[0] for point in points[53:-10]])

In [None]:
# Average of two hard-coded values. -0.49 matches the r quoted in the plot
# title above; the origin of -0.88 is not shown in this notebook — TODO confirm.
((-0.88 + -0.49) / 2)