# Explore Classifier Results

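This notebook explores the evaluation results of each of our sameness-classification methods: it computes summary metrics per model, plots confidence intervals, and inspects the examples that individual models misclassified.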

In [None]:
import pandas as pd
from vis import describe_results, plot_confidence_intervals
import glob

In [None]:
# Collect all CSV file paths in data/test-results/.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to keep RESULT_DFS / MODELS_USED in the same order on every run and machine.
csv_files = sorted(glob.glob('data/test-results/*.csv'))
assert csv_files, "No result CSVs found in data/test-results/ - check the working directory."

# Read each CSV file into a DataFrame and store in a list (same order as csv_files)
RESULT_DFS = [pd.read_csv(file) for file in csv_files]

# Derive model names from the filenames: drop the directory part and the
# '_eval.csv' marker. Files that do not contain '_eval.csv' keep their
# '.csv' extension in the resulting name.
MODELS_USED = [file.split('/')[-1].replace('_eval.csv', '') for file in csv_files]

# Calculate metrics (RESULT_DFS and MODELS_USED are index-aligned)
metrics_df = describe_results(
    dfs=RESULT_DFS,
    models_used=MODELS_USED, 
    y_true_col='classification', 
    y_pred_col='pred',
    even_classes=True
)

# Save the results to a CSV
metrics_df.to_csv('data/outputs/results.csv')

# Last expression of the cell: display the metrics ordered by model name (descending)
metrics_df.sort_values(by='model_used', ascending=False)

In [None]:
# File names of the evaluation-result CSVs (bare names, no directory prefix).
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook - confirm whether it is still needed.
# NOTE(review): 'gpt_4o_mini_overfit_with_original.csv' is the only entry
# without the '_eval' suffix - confirm the file name.
all_result_paths = [
    'gpt_4o_no_examples_eval.csv',
    'tf_idf_similarity_eval.csv',
    'gpt_4o_mini_generated_examples_eval.csv',
    'gpt_4o_mini_overfit_with_original.csv',
    'gpt_35_turbo_generated_examples_eval.csv',
    'gpt_4o_mini_with_pseudo_examples_eval.csv',
    'gpt_4o_mini_with_examples_eval.csv',
    'second_half_similarity_eval.csv',
    'gpt_35_turbo_no_examples_eval.csv',
    'gpt_4o_mini_no_examples_eval.csv',
    'gpt_4o_with_pseudo_examples_eval.csv',
    'gpt_4o_with_examples_eval.csv',
    'semantic_similarity_eval.csv',
    'gpt_35_turbo_with_pseudo_examples_eval.csv',
    'gpt_4o_generated_examples_eval.csv',
    'gpt_4o_overfit_pure_eval.csv',
    'gpt_4o_mini_overfit_pure_eval.csv',
    'gpt_35_turbo_with_examples_eval.csv'
]

# Subset plotted under the title 'Base': the three similarity methods plus the
# no-examples run of each GPT model.
base_result_paths = [
    'gpt_4o_no_examples_eval.csv',
    'tf_idf_similarity_eval.csv',
    'second_half_similarity_eval.csv',
    'gpt_35_turbo_no_examples_eval.csv',
    'gpt_4o_mini_no_examples_eval.csv',
    'semantic_similarity_eval.csv',
]

# Subset plotted under the title 'gpt-4o': every gpt_4o run listed above
# (the gpt_4o_mini runs are not included).
gpt_4o_paths = [
    'gpt_4o_no_examples_eval.csv',
    'gpt_4o_with_pseudo_examples_eval.csv',
    'gpt_4o_with_examples_eval.csv',
    'gpt_4o_generated_examples_eval.csv',
    'gpt_4o_overfit_pure_eval.csv',
]

In [None]:
# plot_confidence_intervals is already imported in the notebook's import cell,
# so it is not imported a second time here.

# NOTE(review): data_dir is assigned but never passed to
# plot_confidence_intervals below - confirm whether the bare CSV names are
# meant to be resolved against this directory.
data_dir = 'data/test-results copy'

# Confidence level shared by both plots, so the two figures stay comparable.
CONFIDENCE_LEVEL = 0.95

# Baseline comparison: similarity methods and no-examples GPT runs.
plot_confidence_intervals(
    csv_paths=base_result_paths, 
    output_path='imgs/base-results', 
    title='Base',
    confidence=CONFIDENCE_LEVEL)

# All gpt-4o prompt variants.
plot_confidence_intervals(
    csv_paths=gpt_4o_paths, 
    output_path='imgs/gpt-4o-results', 
    title='gpt-4o',
    confidence=CONFIDENCE_LEVEL)


## What did the models get wrong?
Use the following cells to inspect the examples each model misclassified, i.e. the rows where `pred` differs from `classification`.

In [None]:
# Index the result DataFrames by model name (MODELS_USED and RESULT_DFS are
# built from the same file list, so they pair up element by element).
results = dict(zip(MODELS_USED, RESULT_DFS))
# Display the available model names, i.e. the valid keys of `results`.
MODELS_USED

In [None]:
# Results of the gpt-4o run with generated examples.
df = results['gpt_4o_generated_examples']
# Boolean mask of the rows where the prediction disagrees with the label.
is_mismatch = df['classification'].ne(df['pred'])
wrong = df.loc[is_mismatch]
print(wrong.shape)
wrong

In [None]:
def format_dataframe_rows(df):
    """
    Render every row of ``df`` as a few-shot prompt example.

    Each rendered string contains an ``example_user:`` section with the two
    producer names and the two abbreviations, followed by an
    ``example_assistant:`` section with the row's ``classification`` value
    converted to a Python bool.

    Parameters:
        df (pd.DataFrame): Frame providing the columns
                           - Producer Name_x
                           - Producer Name_y
                           - Abbreviation Name_x
                           - Abbreviation Name_y
                           - classification

    Returns:
        list of str: One formatted string per row, in row order
                     (an empty list for an empty frame).
    """
    def _render(record):
        # Pieces are concatenated as-is; the final piece has no trailing newline.
        pieces = (
            "example_user:\n",
            "Are these two names representing the same entity?\n",
            "Respond with True or False.\n",
            f"Name 1: {record['Producer Name_x']}\n",
            f"Name 2: {record['Producer Name_y']}\n",
            f"Abbreviation 1: {record['Abbreviation Name_x']}\n",
            f"Abbreviation 2: {record['Abbreviation Name_y']}\n",
            "example_assistant:\n",
            f'{{"is_same_entity":{bool(record["classification"])}}}',
        )
        return "".join(pieces)

    return [_render(record) for _, record in df.iterrows()]

# Format every misclassified row as a prompt example.
examples = format_dataframe_rows(wrong)
# Preview the first example; `wrong` is empty when the model made no mistakes,
# in which case indexing examples[0] would raise IndexError.
print(examples[0] if examples else 'No misclassified rows to format.')

In [None]:
# Print the values at positions 1 through 6 of every misclassified row,
# separated by ' | ' (position 0 is skipped).
for _, record in wrong.iterrows():
    values = record.to_list()
    fields = [values[position] for position in range(1, 7)]
    print(*fields, sep=' | ')

In [None]:
# Look the model up by name: RESULT_DFS follows the order in which glob listed
# the files, so a positional index such as RESULT_DFS[2] is not guaranteed to
# be the gpt_4o_mini_no_examples results.
gpt_4o_mini_no_examples_result = results['gpt_4o_mini_no_examples']
# Keep only the rows where the prediction disagrees with the label.
wrong = gpt_4o_mini_no_examples_result[
    gpt_4o_mini_no_examples_result['classification'] != gpt_4o_mini_no_examples_result['pred']
]
print(wrong.shape)
wrong

In [None]:
# Look the model up by name: RESULT_DFS follows the order in which glob listed
# the files, so a positional index such as RESULT_DFS[1] is not guaranteed to
# be the gpt_35_turbo_no_examples results.
gpt_35_turbo_no_examples_result = results['gpt_35_turbo_no_examples']
# Keep only the rows where the prediction disagrees with the label.
wrong = gpt_35_turbo_no_examples_result[
    gpt_35_turbo_no_examples_result['classification'] != gpt_35_turbo_no_examples_result['pred']
]
print(wrong.shape)
wrong