In [7]:
import numpy as np
import pandas as pd
models = [
    "davinci", "gpt-3.5-turbo", "gpt-4-0314", "text-davinci-003"
]
runs = [
    'run_1', 'run_2', 'run_3'
]
total_sequences = 225
results = []
for answer_validity in ['valid', 'all']:
    for model in models:
        for run in runs:
            df = pd.read_csv(
                    f'./{run}/compute_dependence_with_base_changes=False,sequence_completion_equality.model={model},string_transformation_completion_equality=False/evaluate_sequence_completion_equality/sequence_completion_equality_evaluation_{model}.csv'
            )
            
            match_accs, model_match_accs, model_consistency_accs, consistent_and_matched_positive, consistent_and_matched_negative = (
                [],
                [],
                [],
                [],
                []
            )

            for i, data in df.iterrows():
                if answer_validity == 'valid' and  data["generated_completion_matches"] is not True and data["generated_completion_matches"] is not False:
                    continue
                match_accs.append(1 if data["generated_completion_matches"] == True else 0)

            for i, data in df.iterrows():
                if answer_validity == 'valid' and data["model_completion_matches"] is not True and data["model_completion_matches"] is not False:
                    continue
                model_match_accs.append(1 if data["model_completion_matches"] == True else 0)

            for i, data in df.iterrows():
                if answer_validity == 'valid' and data["model_self_consistency_evaluation"].strip() != "Y" and data["model_self_consistency_evaluation"].strip() != "N":
                    continue
                model_consistency_accs.append(
                    1 if data["model_self_consistency_evaluation"].strip() == "Y" else 0
                )

            for i, data in df.iterrows():
                if answer_validity == 'valid' and data["model_self_consistency_evaluation"].strip() != "Y" and data["model_self_consistency_evaluation"].strip() != "N":
                    continue
                if answer_validity == 'valid' and  data["generated_completion_matches"] is not True and data["generated_completion_matches"] is not False:
                    continue
                if data["generated_completion_matches"]:
                    consistent_and_matched_positive.append(
                        1
                        if data["model_self_consistency_evaluation"].strip() == "Y"
                        else 0
                    )
                else:
                    consistent_and_matched_negative.append(
                        1
                        if data["model_self_consistency_evaluation"].strip() == "N"
                        else 0
                    )
                    


            ground_truth_consistent = np.mean(match_accs)
            self_rule_following_consistency = np.mean(model_match_accs)
            self_comparison_consistency = np.mean(model_consistency_accs)
            consistent_and_matched_positive_acc = np.mean(consistent_and_matched_positive)
            consistent_and_matched_negative_acc = np.mean(consistent_and_matched_negative)
            print(
                f"""
                For {run} run {model} including {answer_validity} answers
                Evaluated {len(df)} ambiguous sequences of {total_sequences} total.
                Resulting in:
                - {ground_truth_consistent}% ground-truth-consistent (using {len(match_accs)})
                - {self_rule_following_consistency}% self-rule-following-consistency (using {len(model_match_accs)})
                - {self_comparison_consistency}% self-comparison-consistency (using {len(model_consistency_accs)})
                - {consistent_and_matched_positive_acc}% self-comparison-consistency==Y and ground-truth-consistent. (using {len(consistent_and_matched_positive)})
                - {consistent_and_matched_negative_acc}% self-comparison-consistency==N and not ground-truth-consistent. (using {len(consistent_and_matched_negative)})
                """
            )


            results.append({
                "model": model,
                "run": run,
                "answer_validity": answer_validity,
                "ground_truth_consistent": round(ground_truth_consistent * 100, 2),
                "ground_truth_consistent_num": len(match_accs),
                "self_rule_following_consistency":  round(self_rule_following_consistency * 100, 2), 
                "self_rule_following_consistency_len": len(model_match_accs),
                "self_comparison_consistency": round(self_comparison_consistency * 100, 2),
                "self_comparison_consistency_len": len(model_consistency_accs),
                "consistent_and_matched_positive": round(consistent_and_matched_positive_acc * 100, 2),
                "consistent_and_matched_positive_len": len(consistent_and_matched_positive),
                "consistent_and_matched_negative": round(consistent_and_matched_negative_acc * 100, 2),
                "consistent_and_matched_negative_len": len(consistent_and_matched_negative)
            })

pd.DataFrame(results).to_csv('./q0_results.csv')


                For run_1 run davinci including valid answers
                Evaluated 222 ambiguous sequences of 225 total.
                Resulting in:
                - 0.36363636363636365% ground-truth-consistent (using 33)
                - 0.3333333333333333% self-rule-following-consistency (using 33)
                - 0.9504504504504504% self-comparison-consistency (using 222)
                - 1.0% self-comparison-consistency==Y and ground-truth-consistent. (using 12)
                - 0.047619047619047616% self-comparison-consistency==N and not ground-truth-consistent. (using 21)
                

                For run_2 run davinci including valid answers
                Evaluated 220 ambiguous sequences of 225 total.
                Resulting in:
                - 0.29411764705882354% ground-truth-consistent (using 34)
                - 0.4117647058823529% self-rule-following-consistency (using 34)
                - 0.9681818181818181% self-comparison-consistency (using


                For run_1 run gpt-3.5-turbo including all answers
                Evaluated 217 ambiguous sequences of 225 total.
                Resulting in:
                - 0.8294930875576036% ground-truth-consistent (using 217)
                - 0.8433179723502304% self-rule-following-consistency (using 217)
                - 0.9447004608294931% self-comparison-consistency (using 217)
                - 0.9514563106796117% self-comparison-consistency==Y and ground-truth-consistent. (using 206)
                - 0.09090909090909091% self-comparison-consistency==N and not ground-truth-consistent. (using 11)
                

                For run_2 run gpt-3.5-turbo including all answers
                Evaluated 213 ambiguous sequences of 225 total.
                Resulting in:
                - 0.8309859154929577% ground-truth-consistent (using 213)
                - 0.8497652582159625% self-rule-following-consistency (using 213)
                - 0.9624413145539906% self-comp

In [61]:
consistent_functions = []
for model in models:
    df = pd.read_csv(
            f'./run_1/compute_dependence_with_base_changes=False,sequence_completion_equality.model={model},string_transformation_completion_equality=False/evaluate_sequence_completion_equality/sequence_completion_equality_evaluation_{model}.csv'
    )
    for i, data in df.iterrows():
        if data["generated_completion_matches"] is True:
            consistent_functions.append({
                "fn": data['original_function'],
                "model": model
            })
            
pd.DataFrame(
    consistent_functions
).to_csv('./consistent_functions_by_model.csv')
            
    