# Import Dependencies

In [2]:
import pandas as pd
import os
import glob

# Define Evaluation Function

In [68]:
# Run the evaluation with its default arguments and unpack the three results:
# the per-file summary, the per-(model, chunking strategy) aggregate, and the
# deep-dive dictionary of misclassified chunks.
full_summary_df, aggregate_summary_df, deep_dive_data = evaluate_model_performance()
# Show the false-negative deep-dive rows as this cell's output.
deep_dive_data['false_negatives']

Unnamed: 0,chunk_id,chunk_text,manual_Information Technology_tag,gpt-4.1-mini_Information Technology_tag,manual_relevant_text,Transcript,Model,Chunking Strategy
0,46,"to the NDI on page 18 for $355,000 fund switch...",True,False,"so let's turn to page 84, this is an NDI for C...",ded,gpt-4.1-mini,recursive-character
1,260,"is. Is it number of students who have done X, ...",True,False,I have a team that does programmatic monitorin...,dss-non-medicaid-pt2,gpt-4.1-mini,recursive-character
2,58,no longer include the cost for security servic...,True,False,"So historically, a vendor communication servic...",sa-dolir,gpt-4.1-mini,recursive-character


In [63]:
# Column order of the per-file summary; passing it to the DataFrame constructor
# keeps the frame well-formed (and sortable) even when no rows were collected.
_SUMMARY_COLUMNS = [
    "Transcript",
    "Chunking Strategy",
    "Model",
    "Total Chunks",
    "Correct Classifications",
    "Accuracy Rate",
    "True Positives",
    "True Positive Rate",
    "True Negatives",
    "True Negative Rate",
    "False Positives",
    "False Positive Rate",
    "False Negatives",
    "False Negative Rate",
    "F1 Score",
]

# Count columns that are summed across transcripts for the aggregate summary.
_AGGREGATE_COUNT_COLUMNS = [
    'Total Chunks',
    'Correct Classifications',
    'True Positives',
    'True Negatives',
    'False Positives',
    'False Negatives',
]

# Rate columns derived from the summed counts in the aggregate summary.
_AGGREGATE_RATE_COLUMNS = [
    'Accuracy',
    'True Positive Rate',
    'True Negative Rate',
    'False Positive Rate',
    'False Negative Rate',
    'F1 Score',
]


def _safe_ratio(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float("nan")


def _confusion_rates(tp: int, tn: int, fp: int, fn: int) -> dict:
    """Derive the rate metrics and F1 score from confusion-matrix counts."""
    actual_positives = tp + fn
    actual_negatives = tn + fp
    return {
        "True Positive Rate": _safe_ratio(tp, actual_positives),
        "True Negative Rate": _safe_ratio(tn, actual_negatives),
        "False Positive Rate": _safe_ratio(fp, actual_negatives),
        "False Negative Rate": _safe_ratio(fn, actual_positives),
        "F1 Score": _safe_ratio(2 * tp, 2 * tp + fp + fn),
    }


def _misclassified_rows(df, mask, columns, transcript, model, strategy):
    """Copy the rows selected by ``mask`` and label them with their origin."""
    rows = df.loc[mask, columns].copy()
    rows['Transcript'] = transcript
    rows['Model'] = model
    rows['Chunking Strategy'] = strategy
    return rows


def evaluate_model_performance(
    data_dir: str = '../data',
    topic: str = 'Information Technology',
    model_names: list = ['gpt-4.1-nano', 'gpt-4.1-mini'],
    chunking_strategies: list = ['recursive-character', 'semantic-percentile'],
    transcripts: list = ['ded', 'dss-non-medicaid-pt2', 'sa-dolir'],
    deep_dive_models: list = ['gpt-4.1-mini'],
    deep_dive_chunking_strategies: list = ['recursive-character']
):
    """Compare model topic tags against manual tags across tagged chunk CSVs.

    For every transcript / chunking-strategy pair the function reads
    ``{data_dir}/{transcript}-chunked-{strategy}-tagged.csv`` (if it exists)
    and, for each model, compares ``{model}_{topic}_tag`` with
    ``manual_{topic}_tag``. Files or models whose tag columns are missing are
    skipped.

    Args:
        data_dir: Directory that holds the tagged CSV files.
        topic: Topic name embedded in the tag column names.
        model_names: Models whose tag columns are evaluated.
        chunking_strategies: Chunking strategies embedded in the file names.
        transcripts: Transcript identifiers embedded in the file names.
        deep_dive_models: Models for which misclassified rows are collected.
        deep_dive_chunking_strategies: Chunking strategies for which
            misclassified rows are collected.

    The list defaults are only read, never mutated, so sharing them between
    calls is safe.

    Returns:
        A tuple ``(full_summary_df, aggregate_summary_df, deep_dive_data)``:

        * ``full_summary_df`` - one row per (transcript, strategy, model) with
          confusion-matrix counts and rates, sorted by model, strategy and
          transcript.
        * ``aggregate_summary_df`` - counts summed over transcripts per
          (model, strategy) with rates recomputed from the summed counts.
        * ``deep_dive_data`` - dict with ``'false_positives'`` and
          ``'false_negatives'`` DataFrames (empty DataFrames when nothing was
          collected).

        Rates whose denominator is zero are NaN. When no file/model pair could
        be evaluated, both summaries are empty DataFrames that still carry
        their columns.
    """
    manual_tag_col = f"manual_{topic}_tag"
    full_summary_data = []
    deep_dive_frames = {
        'false_positives': [],
        'false_negatives': []
    }

    for transcript in transcripts:
        for strategy in chunking_strategies:
            # Construct search path for matching CSVs
            search_path = f'{data_dir}/{transcript}-chunked-{strategy}-tagged.csv'

            # Sorted so the row order of the deep-dive frames is deterministic.
            for file_path in sorted(glob.glob(search_path)):
                df = pd.read_csv(file_path)

                if manual_tag_col not in df.columns:
                    continue  # No manual labels to compare against

                # Explicit comparisons with True/False so that missing or
                # non-boolean values fall into neither class.
                manual_pos = df[manual_tag_col].eq(True)
                manual_neg = df[manual_tag_col].eq(False)
                total = len(df)

                for model in model_names:
                    model_tag_col = f"{model}_{topic}_tag"
                    model_rel_text_col = f"{model}_{topic}_tag_relevant_section"

                    if model_tag_col not in df.columns:
                        continue  # Skip if necessary columns are missing

                    model_pos = df[model_tag_col].eq(True)
                    model_neg = df[model_tag_col].eq(False)

                    # Build each confusion-matrix mask once; the error masks
                    # are reused below for the deep-dive views.
                    fp_mask = manual_neg & model_pos
                    fn_mask = manual_pos & model_neg

                    correct = int((df[manual_tag_col] == df[model_tag_col]).sum())
                    true_positives = int((manual_pos & model_pos).sum())
                    true_negatives = int((manual_neg & model_neg).sum())
                    false_positives = int(fp_mask.sum())
                    false_negatives = int(fn_mask.sum())

                    rates = _confusion_rates(
                        true_positives, true_negatives, false_positives, false_negatives
                    )

                    full_summary_data.append({
                        "Transcript": transcript,
                        "Chunking Strategy": strategy,
                        "Model": model,
                        "Total Chunks": total,
                        "Correct Classifications": correct,
                        "Accuracy Rate": correct / total if total else None,
                        "True Positives": true_positives,
                        "True Positive Rate": rates["True Positive Rate"],
                        "True Negatives": true_negatives,
                        "True Negative Rate": rates["True Negative Rate"],
                        "False Positives": false_positives,
                        "False Positive Rate": rates["False Positive Rate"],
                        "False Negatives": false_negatives,
                        "False Negative Rate": rates["False Negative Rate"],
                        "F1 Score": rates["F1 Score"],
                    })

                    # Only build deep dive views for selected models and strategies
                    if model in deep_dive_models and strategy in deep_dive_chunking_strategies:
                        deep_dive_frames['false_positives'].append(_misclassified_rows(
                            df,
                            fp_mask,
                            ['chunk_id', 'chunk_text', manual_tag_col, model_tag_col, model_rel_text_col],
                            transcript, model, strategy,
                        ))
                        deep_dive_frames['false_negatives'].append(_misclassified_rows(
                            df,
                            fn_mask,
                            ['chunk_id', 'chunk_text', manual_tag_col, model_tag_col, 'manual_relevant_text'],
                            transcript, model, strategy,
                        ))

    # Explicit columns keep sort_values from raising KeyError when no rows
    # were collected (e.g. wrong data_dir or no matching files).
    full_summary_df = pd.DataFrame(full_summary_data, columns=_SUMMARY_COLUMNS)
    full_summary_df = full_summary_df.sort_values(by=["Model", "Chunking Strategy", "Transcript"])

    # Create Model Level Summary DF across all the transcripts
    if full_summary_df.empty:
        # Nothing to aggregate: return an empty frame with the usual layout.
        aggregate_summary_df = pd.DataFrame(
            index=pd.MultiIndex.from_arrays([[], []], names=['Model', 'Chunking Strategy']),
            columns=_AGGREGATE_COUNT_COLUMNS + _AGGREGATE_RATE_COLUMNS,
        )
    else:
        ## Group by the Model and Chunking Strategy columns and aggregate the counting numbers
        aggregate_summary_df = (
            full_summary_df[['Model', 'Chunking Strategy'] + _AGGREGATE_COUNT_COLUMNS]
            .groupby(['Model', 'Chunking Strategy'])
            .sum()
        )
        ## Calculate the statistics from the summed counts
        tp = aggregate_summary_df['True Positives']
        tn = aggregate_summary_df['True Negatives']
        fp = aggregate_summary_df['False Positives']
        fn = aggregate_summary_df['False Negatives']
        aggregate_summary_df['Accuracy'] = (
            aggregate_summary_df['Correct Classifications'] / aggregate_summary_df['Total Chunks']
        )
        aggregate_summary_df['True Positive Rate'] = tp / (tp + fn)
        aggregate_summary_df['True Negative Rate'] = tn / (tn + fp)
        aggregate_summary_df['False Positive Rate'] = fp / (fp + tn)
        aggregate_summary_df['False Negative Rate'] = fn / (tp + fn)
        aggregate_summary_df['F1 Score'] = (2 * tp) / ((2 * tp) + fp + fn)

    # Collapse each list of per-file frames into one frame (empty if none).
    deep_dive_data = {
        key: pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        for key, frames in deep_dive_frames.items()
    }

    return full_summary_df, aggregate_summary_df, deep_dive_data

# Execute Evaluation Function