# Import Dependencies

In [2]:
import pandas as pd
import os
import glob

# Define Evaluation Function

In [85]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def evaluate_model_performance(
    data_dir:str = '../data',
    topic:str = 'Information Technology',
    model_names:list = ['gpt-4.1-nano', 'gpt-4.1-mini'],
    chunking_strategies:list = ['recursive-character', 'semantic-percentile'],
    transcripts:list = ['ded', 'dss-non-medicaid-pt2', 'sa-dolir'],
    deep_dive_models:list= ['gpt-4.1-mini'],
    deep_dive_chunking_strategies=['recursive-character']
) -> tuple:
    """Compare model topic tags against manual tags for chunked transcripts.

    For every (transcript, chunking strategy) pair the file
    ``{data_dir}/{transcript}-chunked-{strategy}-tagged.csv`` is read if it
    exists. For every model whose ``{model}_{topic}_tag`` column is present
    next to ``manual_{topic}_tag``, confusion-matrix counts and the derived
    rates are computed.

    Args:
        data_dir: Directory containing the tagged CSV files.
        topic: Topic name embedded in the tag column names.
        model_names: Models to evaluate (used as column-name prefixes).
        chunking_strategies: Chunking strategies embedded in the file names.
        transcripts: Transcript identifiers embedded in the file names.
        deep_dive_models: Models for which row-level detail is collected.
        deep_dive_chunking_strategies: Strategies for which row-level detail
            is collected.

    Returns:
        A 3-tuple ``(full_summary_df, aggregate_summary_df, deep_dive_df)``:

        * ``full_summary_df`` - one row per (transcript, strategy, model),
          sorted by model, strategy, transcript.
        * ``aggregate_summary_df`` - counts summed over transcripts per
          (model, strategy), with rates recomputed from the summed counts.
        * ``deep_dive_df`` - the false-positive, false-negative and
          true-positive rows (in that order per file) for the selected
          models/strategies, with an added ``Transcript`` column.

        All three frames are empty when no file / column combination matched.

    Raises:
        KeyError: If a deep-dive file lacks one of the detail columns
            (``chunk_id``, ``chunk_text``, ``manual_relevant_text`` or the
            model's ``..._tag_relevant_section`` column).
    """
    # NOTE: the list defaults above are only read, never mutated, so sharing
    # them between calls is harmless.
    summary_columns = [
        "Transcript", "Chunking Strategy", "Model",
        "Total Chunks", "Correct Classifications", "Accuracy Rate",
        "True Positives", "True Positive Rate",
        "True Negatives", "True Negative Rate",
        "False Positives", "False Positive Rate",
        "False Negatives", "False Negative Rate",
        "F1 Score",
    ]
    group_keys = ['Model', 'Chunking Strategy']
    count_columns = [
        'Total Chunks', 'Correct Classifications', 'True Positives',
        'True Negatives', 'False Positives', 'False Negatives',
    ]
    aggregate_stat_columns = [
        'Accuracy', 'True Positive Rate', 'True Negative Rate',
        'False Positive Rate', 'False Negative Rate', 'F1 Score',
    ]

    # The manual tag column does not depend on the model, so build it once.
    manual_tag_col = f"manual_{topic}_tag"

    full_summary_data = []
    deep_dive_frames = []

    for transcript in transcripts:
        for strategy in chunking_strategies:
            # Construct search path for matching CSVs
            search_path = f'{data_dir}/{transcript}-chunked-{strategy}-tagged.csv'

            for file_path in glob.glob(search_path):
                df = pd.read_csv(file_path)

                for model in model_names:
                    model_tag_col = f"{model}_{topic}_tag"
                    model_rel_text_col = f"{model}_{topic}_tag_relevant_section"

                    if manual_tag_col not in df.columns or model_tag_col not in df.columns:
                        continue  # Skip if necessary columns are missing

                    # Explicit comparison with True/False (rather than
                    # truthiness) so that missing values count as neither.
                    manual_pos = df[manual_tag_col].eq(True)
                    manual_neg = df[manual_tag_col].eq(False)
                    model_pos = df[model_tag_col].eq(True)
                    model_neg = df[model_tag_col].eq(False)

                    tp_mask = manual_pos & model_pos
                    tn_mask = manual_neg & model_neg
                    fp_mask = manual_neg & model_pos
                    fn_mask = manual_pos & model_neg

                    total = len(df)
                    correct = (df[manual_tag_col] == df[model_tag_col]).sum()
                    true_positives = tp_mask.sum()
                    true_negatives = tn_mask.sum()
                    false_positives = fp_mask.sum()
                    false_negatives = fn_mask.sum()

                    actual_positives = true_positives + false_negatives
                    actual_negatives = true_negatives + false_positives

                    full_summary_data.append({
                        "Transcript": transcript,
                        "Chunking Strategy": strategy,
                        "Model": model,
                        "Total Chunks": total,
                        "Correct Classifications": correct,
                        "Accuracy Rate": correct / total if total else None,
                        "True Positives": true_positives,
                        # A file may contain no positives (or no negatives);
                        # the affected rates are then undefined -> NaN.
                        "True Positive Rate": _safe_ratio(true_positives, actual_positives),
                        "True Negatives": true_negatives,
                        "True Negative Rate": _safe_ratio(true_negatives, actual_negatives),
                        "False Positives": false_positives,
                        "False Positive Rate": _safe_ratio(false_positives, actual_negatives),
                        "False Negatives": false_negatives,
                        "False Negative Rate": _safe_ratio(false_negatives, actual_positives),
                        "F1 Score": _safe_ratio(
                            2 * true_positives,
                            (2 * true_positives) + false_positives + false_negatives,
                        ),
                    })

                    # Only build deep dive views for selected models and strategies
                    if model in deep_dive_models and strategy in deep_dive_chunking_strategies:
                        tgt_cols = ['chunk_id', 'chunk_text', manual_tag_col, 'manual_relevant_text', model_tag_col,  model_rel_text_col]
                        # Order matters for the output: false positives,
                        # then false negatives, then true positives.
                        for mask in (fp_mask, fn_mask, tp_mask):
                            detail_rows = df.loc[mask, tgt_cols].copy()
                            detail_rows['Transcript'] = transcript
                            deep_dive_frames.append(detail_rows)

    # Concatenate once instead of growing a DataFrame inside the loop.
    if deep_dive_frames:
        deep_dive_df = pd.concat(deep_dive_frames, axis=0)
    else:
        deep_dive_df = pd.DataFrame()

    # Passing the column list keeps the frame well-formed (and sortable)
    # even when nothing was evaluated.
    full_summary_df = pd.DataFrame(full_summary_data, columns=summary_columns)
    full_summary_df = full_summary_df.sort_values(by=["Model", "Chunking Strategy", "Transcript"])

    if full_summary_df.empty:
        # Nothing matched: return an empty aggregate with the usual layout.
        aggregate_summary_df = pd.DataFrame(
            columns=count_columns + aggregate_stat_columns,
            index=pd.MultiIndex.from_arrays([[], []], names=group_keys),
        )
        return full_summary_df, aggregate_summary_df, deep_dive_df

    # Create Model Level Summary DF across all the transcripts
    ## Sum the raw counts per (Model, Chunking Strategy)
    aggregate_summary_df = full_summary_df[group_keys + count_columns].groupby(group_keys).sum()

    ## Recompute the statistics from the summed counts (not averaged rates)
    agg_tp = aggregate_summary_df['True Positives']
    agg_tn = aggregate_summary_df['True Negatives']
    agg_fp = aggregate_summary_df['False Positives']
    agg_fn = aggregate_summary_df['False Negatives']

    aggregate_summary_df['Accuracy'] = aggregate_summary_df['Correct Classifications'] / aggregate_summary_df['Total Chunks']
    aggregate_summary_df['True Positive Rate'] = agg_tp / (agg_tp + agg_fn)
    aggregate_summary_df['True Negative Rate'] = agg_tn / (agg_tn + agg_fp)
    aggregate_summary_df['False Positive Rate'] = agg_fp / (agg_fp + agg_tn)
    aggregate_summary_df['False Negative Rate'] = agg_fn / (agg_tp + agg_fn)
    aggregate_summary_df['F1 Score'] = (2 * agg_tp) / ((2 * agg_tp) + agg_fp + agg_fn)

    return full_summary_df, aggregate_summary_df, deep_dive_df

# Execute Evaluation Function

In [86]:
# Run the evaluation with its default arguments and unpack the three result
# frames: per-transcript summary, per-(model, strategy) aggregate, and the
# row-level deep dive.
full_summary_df, aggregate_summary_df, deep_dive_df = evaluate_model_performance()

In [70]:
# Show the per-transcript / strategy / model summary (cell output follows).
full_summary_df

Unnamed: 0,Transcript,Chunking Strategy,Model,Total Chunks,Correct Classifications,Accuracy Rate,True Positives,True Positive Rate,True Negatives,True Negative Rate,False Positives,False Positive Rate,False Negatives,False Negative Rate,F1 Score
1,ded,recursive-character,gpt-4.1-mini,52,50,0.961538,0,0.0,50,0.980392,1,0.019608,1,1.0,0.0
5,dss-non-medicaid-pt2,recursive-character,gpt-4.1-mini,496,475,0.957661,10,0.909091,465,0.958763,20,0.041237,1,0.090909,0.487805
9,sa-dolir,recursive-character,gpt-4.1-mini,67,62,0.925373,3,0.75,59,0.936508,4,0.063492,1,0.25,0.545455
3,ded,semantic-percentile,gpt-4.1-mini,18,17,0.944444,1,1.0,16,0.941176,1,0.058824,0,0.0,0.666667
7,dss-non-medicaid-pt2,semantic-percentile,gpt-4.1-mini,89,77,0.865169,4,0.5,73,0.901235,8,0.098765,4,0.5,0.4
11,sa-dolir,semantic-percentile,gpt-4.1-mini,23,22,0.956522,3,1.0,19,0.95,1,0.05,0,0.0,0.857143
0,ded,recursive-character,gpt-4.1-nano,52,44,0.846154,0,0.0,44,0.862745,7,0.137255,1,1.0,0.0
4,dss-non-medicaid-pt2,recursive-character,gpt-4.1-nano,496,373,0.752016,9,0.818182,364,0.750515,121,0.249485,2,0.181818,0.12766
8,sa-dolir,recursive-character,gpt-4.1-nano,67,47,0.701493,4,1.0,43,0.68254,20,0.31746,0,0.0,0.285714
2,ded,semantic-percentile,gpt-4.1-nano,18,14,0.777778,0,0.0,14,0.823529,3,0.176471,1,1.0,0.0


In [71]:
# Show the summary aggregated across transcripts (cell output follows).
aggregate_summary_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Chunks,Correct Classifications,True Positives,True Negatives,False Positives,False Negatives,Accuracy,True Positive Rate,True Negative Rate,False Positive Rate,False Negative Rate,F1 Score
Model,Chunking Strategy,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4.1-mini,recursive-character,615,587,13,574,25,3,0.954472,0.8125,0.958264,0.041736,0.1875,0.481481
gpt-4.1-mini,semantic-percentile,130,116,8,108,10,4,0.892308,0.666667,0.915254,0.084746,0.333333,0.533333
gpt-4.1-nano,recursive-character,615,464,13,451,148,3,0.754472,0.8125,0.752922,0.247078,0.1875,0.146893
gpt-4.1-nano,semantic-percentile,130,97,10,87,31,2,0.746154,0.833333,0.737288,0.262712,0.166667,0.377358


In [87]:
# Write the row-level deep-dive rows to CSV; index=False leaves the
# DataFrame index out of the file.
deep_dive_df.to_csv('../data/deep_dive_evals.csv', index=False)