## Helper Functions (for Step 4)

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

import os
import requests
import json
import glob

import warnings
warnings.filterwarnings("ignore")


### Define Global Vars

In [None]:
# Rubric items in the "reasoning" group.
reasoning_cols = [
    "Engagement with Evidence",
    "Goal Orientation",
    "Collaborative Decision-Making",
    "Strategic Planning",
]

# Rubric items in the "hire factor" group.
hire_cols = [
    "Cultural Responsiveness",
    "Parent/Community Engagement",
    "Academic Achievement",
    "Candidate Experience/Expertise",
    "Evaluation",
    "School Culture Fit",
]

# Every rubric item: reasoning items first, then hire factors.
cols = [*reasoning_cols, *hire_cols]

### Functions: Concatenate LLM Scores into single dataframe

In [None]:
def concat_raw_scores(model, path):
    """Load every raw score CSV for one model into a single dataframe.

    Files are looked up with the pattern ``{path}/{model}/{model}_*.csv``.

    Parameters
    ----------
    model : str
        Model short name; used both as the sub-directory and the file prefix.
    path : str
        Directory that contains one sub-directory per model.

    Returns
    -------
    pandas.DataFrame
        All rows from the matching files (fresh 0..n-1 index) plus an
        ``iteration`` column numbering, from 1, the rows that share the same
        ``record_id`` and ``Scorer Name``. The ``call_number`` column is
        removed when present.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    pattern = f'{path}/{model}/{model}_*.csv'

    # glob returns files in arbitrary order; sort so that the row order (and
    # therefore the derived iteration numbers) is reproducible across runs.
    csv_files = sorted(glob.glob(pattern))
    if not csv_files:
        raise ValueError(f"No score files found matching '{pattern}'")

    # Read and concatenate into one DataFrame
    df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

    # Number the repeated scorings of each record by each scorer: 1, 2, 3, ...
    df['iteration'] = df.groupby(['record_id', "Scorer Name"]).cumcount() + 1

    # DataFrame.drop returns a new frame, so the result has to be kept.
    df = df.drop(columns="call_number", errors="ignore")

    return df

In [None]:
def convert_vals_numeric(training_df, value_cols=None):
    """Coerce the rubric columns and ``record_id`` to numeric dtypes.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    training_df : pandas.DataFrame
        Frame holding ``record_id`` and the rubric columns.
    value_cols : list of str, optional
        Rubric columns to convert. Defaults to the module-level ``cols``.

    Returns
    -------
    pandas.DataFrame
        ``training_df`` with the converted columns.

    Raises
    ------
    ValueError
        If ``record_id`` holds a value that cannot be parsed as a number
        (rubric columns are coerced to NaN instead).
    """
    if value_cols is None:
        value_cols = cols
    value_cols = list(value_cols)

    # Assign whole columns rather than ``.loc[:, ...] = ...``: the latter
    # writes into the existing arrays and keeps their (object) dtype, so the
    # columns would hold numbers but never become numeric.
    training_df[value_cols] = training_df[value_cols].apply(pd.to_numeric, errors='coerce')

    # Convert "record_id" column to numeric (unparseable ids raise on purpose)
    training_df["record_id"] = pd.to_numeric(training_df["record_id"])

    return training_df

### Functions: Handle Invalid Values

In [None]:
def handle_invalid_values(training_df, hire_columns=None, reasoning_columns=None):
    """Replace out-of-range rubric scores with NaN.

    Hire-factor columns keep only the values 0 and 1; reasoning columns keep
    only the whole numbers 1 through 4. Everything else (including strings,
    fractional values and +/-inf) becomes NaN. The dataframe is modified in
    place and also returned.

    Parameters
    ----------
    training_df : pandas.DataFrame
        Frame holding the rubric columns.
    hire_columns : list of str, optional
        Hire-factor columns. Defaults to the module-level ``hire_cols``.
    reasoning_columns : list of str, optional
        Reasoning columns. Defaults to the module-level ``reasoning_cols``.

    Returns
    -------
    pandas.DataFrame
        ``training_df`` with invalid scores set to NaN.
    """
    hire_columns = hire_cols if hire_columns is None else list(hire_columns)
    reasoning_columns = (
        reasoning_cols if reasoning_columns is None else list(reasoning_columns)
    )

    # Membership tests are done column-wise with isin(): this avoids the
    # per-cell int(x) call, which raises OverflowError on infinite values,
    # and does not depend on DataFrame.map.
    hire_values = training_df[hire_columns]
    training_df[hire_columns] = hire_values.where(hire_values.isin([0, 1]))

    reasoning_values = training_df[reasoning_columns]
    training_df[reasoning_columns] = reasoning_values.where(
        reasoning_values.isin([1, 2, 3, 4])
    )

    return training_df

### Functions: Concatenate Trained Annotators Dataframe with LLM Scoring Dataframe

In [None]:
# Load Human Coders dataframe
def load_human_coders(paths=(
    '../../data/Interviews/MF Scoring.xlsx',
    '../../data/Interviews/JR Scoring.xlsx',
)):
    """Read the human scorers' Excel workbooks and stack them into one frame.

    Parameters
    ----------
    paths : sequence of str, optional
        Workbooks to read, one per human scorer. Defaults to the two scorer
        files under ``../../data/Interviews``.

    Returns
    -------
    pandas.DataFrame
        Rows of every workbook, in the order given. The original row labels
        of each workbook are kept, so index values repeat across scorers.
    """
    # Missing cells are replaced by 0 in every column.
    # NOTE(review): 0 is outside the 1-4 range used for reasoning items
    # elsewhere in this file -- confirm that blanks should count as 0 there.
    scorer_frames = [pd.read_excel(workbook).fillna(0) for workbook in paths]

    human_df = pd.concat(scorer_frames)

    return human_df

# Create avg. human coder scoring df
def create_avg_human_df(human_df):
    """Average the human scorers per record and label the result "human avg".

    Takes the stacked human-scorer frame, computes the mean of every rubric
    column in ``cols`` for each ``record_id``, rounds those means to whole
    numbers and returns one row per record with a ``Scorer Name`` column set
    to ``"human avg"``.
    """
    wanted = ["record_id", *cols]
    averaged = human_df.loc[:, wanted].groupby(["record_id"], as_index=False).mean()

    # floor(x + 0.5) sends exact halves upward (1.5 -> 2, 2.5 -> 3).
    shifted = averaged[cols] + 0.5
    averaged[cols] = np.floor(shifted)

    averaged["Scorer Name"] = "human avg"
    return averaged

# Concat LLM + Human Coder scoring dataframe
def concat_llm_human_df(training_df, rounded=True, is_training_data=False):
    """Combine averaged LLM scores with the human scorers' scores.

    The LLM rows in ``training_df`` are averaged per record and scorer
    (rounded to whole numbers when ``rounded`` is true), then stacked with the
    individual human rows and the per-record human average. The result is
    restricted to the records listed in the module-level ``training_ids``
    when ``is_training_data`` is true, and to all other records otherwise.

    Returns a frame with ``record_id``, ``Scorer Name`` and the rubric
    columns, sorted by record and scorer.

    Note: ``training_ids`` is read from module scope; it must be defined
    before this function is called.
    """
    key_cols = ["record_id", "Scorer Name"]
    wanted = key_cols + cols

    # Human scorers: individual rows plus their per-record average
    human_df = load_human_coders()
    human_avg_df = create_avg_human_df(human_df)

    # LLM scorers: mean over repeated scorings of each record
    llm_avg_df = training_df[wanted].groupby(key_cols, as_index=False).mean()
    if rounded:
        # floor(x + 0.5) sends exact halves upward
        llm_avg_df[cols] = np.floor(llm_avg_df[cols] + 0.5)

    # Stack everything, then keep either the training or the held-out records
    stacked = pd.concat([llm_avg_df, human_df, human_avg_df])
    in_training_set = stacked['record_id'].isin(training_ids)
    if is_training_data:
        stacked = stacked[in_training_set]
    else:
        stacked = stacked[~in_training_set]

    # Only keep columns we're interested in
    return stacked[wanted].sort_values(by=['record_id', 'Scorer Name'])

### Functions: Evaluate Compliance

In [None]:
# Compliance Checks
#     - Scorer Names: Are the only scorer names the expected model identifiers?
#     - Failed calls: What share of the expected rows is present per model?
#     - ID hallucinations: Are any scored record_ids outside the valid set?
#     - Reasoning scores: Bounded by 1-4? Missingness?
#     - Hire Factors: Bounded by 0-1? Missingness?
#     (Preferences are not checked by this function.)

def _row_level_shares(frame, allowed_values):
    """Return the [valid, missing, invalid] share of rows in ``frame``.

    A row is "valid" when every cell is one of ``allowed_values``, "missing"
    when at least one cell is NaN, and "invalid" when it is neither.
    """
    missing = frame.isna().any(axis=1)
    valid = frame.isin(allowed_values).all(axis=1)
    invalid = ~(valid | missing)
    total = len(frame)
    return [valid.sum() / total, missing.sum() / total, invalid.sum() / total]


def run_compliance_checks(df, valid_ids, codebook_type="zero-shot", *,
                          n_iterations=10, output_dir="output/compliance",
                          reasoning_columns=None, hire_columns=None):
    """Summarise, per model, how well the raw LLM scores follow the rubric.

    The summary is written to
    ``{output_dir}/{codebook_type}_compliance_df.csv`` and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw scores with ``Scorer Name``, ``record_id`` and the rubric columns.
    valid_ids : collection
        The record ids that were actually submitted for scoring.
    codebook_type : str
        Used only in the output file name.
    n_iterations : int
        Number of scoring calls expected per respondent (default 10).
    output_dir : str
        Directory for the CSV; created if it does not exist.
    reasoning_columns, hire_columns : list of str, optional
        Column overrides; default to the module-level ``reasoning_cols`` and
        ``hire_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per known model (``gpt-5``, ``sonnet-4``, ``gpt-41-mini``),
        rounded to 3 decimals. "Row Count Proportion" is a percentage; the
        Reasoning/Hire Factors columns are fractions of that model's rows.
        Models with no rows in ``df`` get an all-missing row.
    """
    reasoning_columns = (
        reasoning_cols if reasoning_columns is None else list(reasoning_columns)
    )
    hire_columns = hire_cols if hire_columns is None else list(hire_columns)

    # Define expected scorer names (full identifier -> short name)
    valid_scorer_names = {
        "us.anthropic.claude-sonnet-4-20250514-v1:0": "sonnet-4",
        "gpt-4.1-mini": "gpt-41-mini",
        "gpt-5": "gpt-5",
    }

    column_names = [
        "Valid Scorer Names",
        "Row Count Proportion",
        "ID Hallucinations",
        # Check 4
        "Reasoning: Valid",
        "Reasoning: Missing",
        "Reasoning: Invalid",
        # Check 5
        "Hire Factors: Valid",
        "Hire Factors: Missing",
        "Hire Factors: Invalid",
    ]

    # Start every model with an all-missing row so the summary can always be
    # built, even when a model (or every model) is absent from df.
    results = {
        short_name: [None] * len(column_names)
        for short_name in ("gpt-5", "sonnet-4", "gpt-41-mini")
    }

    # Check 1: Only valid scorer names present (same answer for every model)
    all_names_valid = set(df["Scorer Name"]).issubset(valid_scorer_names.keys())

    expected_rows = len(valid_ids) * n_iterations

    for scorer_fullname, group in df.groupby("Scorer Name"):
        scorer_shortname = valid_scorer_names.get(scorer_fullname)
        if scorer_shortname is None:
            continue  # Skip any unknown scorer names

        # Check 2: Rows present as a percentage of the rows expected
        if expected_rows:
            row_count_proportion = (len(group) / expected_rows) * 100
        else:
            row_count_proportion = np.nan

        # Check 3: Number of rows whose record_id was never submitted
        invalid_record_ids = int((~group["record_id"].isin(valid_ids)).sum())

        results[scorer_shortname] = [
            all_names_valid,
            row_count_proportion,
            invalid_record_ids,
            # Check 4: Reasoning (1 to 4, missingness)
            *_row_level_shares(group[reasoning_columns], [1, 2, 3, 4]),
            # Check 5: Hiring columns (0 or 1, missingness)
            *_row_level_shares(group[hire_columns], [0, 1]),
        ]

    compliance_df = pd.DataFrame.from_dict(
        results, orient="index", columns=column_names
    )
    compliance_df = compliance_df.round(3)

    os.makedirs(output_dir, exist_ok=True)
    compliance_df.to_csv(f"{output_dir}/{codebook_type}_compliance_df.csv")

    return compliance_df

### Functions: Evaluate Variation

In [None]:
def calc_variation(df):
    """Summarise the spread of each rubric item in ``cols``.

    For every rubric item the summary holds the overall mean, the overall
    standard deviation across all rows, and the average of the per-record
    standard deviations (records with an undefined SD, e.g. a single row,
    count as 0). Values are rounded to 3 decimals.
    """
    per_record = df.groupby("record_id")

    summary_rows = []
    for item in cols:
        # SD of the repeated scores within each record; undefined -> 0
        within_record_sd = per_record[item].std().fillna(0)

        summary_rows.append({
            "Rubric Item": item,
            "Overall Mean": df[item].mean(),
            "Overall SD (across resp.)": df[item].std(),
            "Avg. SD (within resp.)": within_record_sd.mean(),
        })

    # TODO: careful of rounding here - round() may not round exact halves up
    return pd.DataFrame(summary_rows).round(3)

def save_variation_df(training_df, codebook_type="zero-shot"):
    """Compute the variation summary per model, save it as CSV and return it.

    Each model's rows are selected by comparing ``Scorer Name`` with the
    entry for that model in the module-level ``engines`` mapping (defined
    outside this section). The per-model summaries are placed side by side
    under their display names and written to
    ``output/variation/{codebook_type}_variation_df.csv``.
    """
    # Model short name -> display name used as the top-level column label
    display_names = {
        "gpt-5": "GPT-5",
        "sonnet-4": "Sonnet 4",
        "gpt-41-mini": "GPT 41 Mini"
    }

    # Compute variation for each model
    per_model = {}
    for short_name, label in display_names.items():
        model_rows = training_df[training_df["Scorer Name"] == engines[short_name]]
        per_model[label] = calc_variation(model_rows)

    # Combine all into one dataframe
    variation_df = pd.concat(per_model, axis=1)
    variation_df.to_csv(f"output/variation/{codebook_type}_variation_df.csv")

    return variation_df

### Functions: Evaluate Uncertainty 

In [None]:
def build_testset_df(codebook_types, training_ids):
    """Assemble the held-out (non-training) LLM scores for every codebook type.

    For each codebook type the raw scores of the three models are read from
    ``scores/{codebook_type}``, rows whose ``record_id`` is in
    ``training_ids`` are removed, the scores are made numeric and cleaned,
    and a ``codebook_type`` column is added. All codebook types are then
    stacked into one frame with a fresh index.
    """
    model_names = ("gpt-5", "sonnet-4", "gpt-41-mini")
    per_codebook = []

    for codebook_type in codebook_types:
        score_dir = f"scores/{codebook_type}"
        raw_scores = pd.concat(
            [concat_raw_scores(name, path=score_dir) for name in model_names]
        )

        # Keep only the records that were not used for training
        held_out = raw_scores[~raw_scores["record_id"].isin(training_ids)]

        held_out = convert_vals_numeric(held_out)
        held_out = handle_invalid_values(held_out)

        held_out["codebook_type"] = codebook_type
        per_codebook.append(held_out)

    return pd.concat(per_codebook, ignore_index=True)



In [None]:
def compute_entropy_pivot(
    testset_df,
    rubric_items,
    reasoning_cols,
    hire_cols,
    scorer_name_map,
    scorer_order,
    output_path="output/uncertainty/entropy_scores.csv",
):
    """Average normalised Shannon entropy of repeated scores, per model.

    For every (record, scorer, rubric item, codebook type) the entropy of the
    repeated scores is computed (natural log) and divided by its maximum:
    ``log(4)`` for reasoning items and ``log(2)`` for hire items. These values
    are averaged per rubric item, then per category ("Reasoning"/"Hiring"),
    and pivoted to one row per scorer. The table is written to
    ``output_path`` and returned. ``testset_df`` itself is not modified.

    Parameters
    ----------
    testset_df : pandas.DataFrame
        Scores with ``record_id``, ``Scorer Name``, ``codebook_type`` and the
        rubric columns.
    rubric_items : list of str
        Rubric columns to analyse; each must be in ``reasoning_cols`` or
        ``hire_cols``.
    reasoning_cols, hire_cols : list of str
        Rubric columns belonging to each category.
    scorer_name_map : dict
        Replacement applied to ``Scorer Name`` before pivoting.
    scorer_order : list
        Row order of the returned table (applied with ``reindex``).
    output_path : str
        CSV destination; its directory is created if needed.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by scorer, columns by (Category, codebook_type), rounded
        to 2 decimals.

    Raises
    ------
    KeyError
        If a rubric item is in neither ``reasoning_cols`` nor ``hire_cols``.
    """
    rubric_items = list(rubric_items)
    id_cols = ["record_id", "Scorer Name", "codebook_type"]

    # Max entropy per item
    max_entropy_map = {item: np.log(4) for item in reasoning_cols}
    max_entropy_map.update({item: np.log(2) for item in hire_cols})

    unknown_items = [item for item in rubric_items if item not in max_entropy_map]
    if unknown_items:
        raise KeyError(
            f"Rubric items missing from reasoning_cols/hire_cols: {unknown_items}"
        )

    # Item -> category; reasoning is applied last so it wins for an item
    # listed in both groups.
    category_map = {item: "Hiring" for item in hire_cols}
    category_map.update({item: "Reasoning" for item in reasoning_cols})

    # Ensure numeric, working on a copy so the caller's frame is untouched
    scores_df = testset_df[id_cols + rubric_items].copy()
    scores_df[rubric_items] = scores_df[rubric_items].apply(
        pd.to_numeric, errors="coerce"
    )

    def compute_entropy(scores):
        scores = scores.dropna()
        if len(scores) == 0:
            return np.nan
        probs = scores.value_counts(normalize=True).values
        # "+ 0.0" turns the -0.0 produced by a single-valued group into 0.0
        return -np.sum(probs * np.log(probs)) + 0.0

    long_df = scores_df.melt(
        id_vars=id_cols,
        value_vars=rubric_items,
        var_name="Rubric Item",
        value_name="Score",
    )

    entropy_df = (
        long_df
        .groupby(
            ["record_id", "Scorer Name", "Rubric Item", "codebook_type"]
        )["Score"]
        .apply(compute_entropy)
        .reset_index(name="Entropy")
    )

    entropy_df["Normalized_Entropy"] = (
        entropy_df["Entropy"] / entropy_df["Rubric Item"].map(max_entropy_map)
    )

    avg_entropy_df = (
        entropy_df
        .groupby(["Scorer Name", "Rubric Item", "codebook_type"])["Normalized_Entropy"]
        .mean()
        .reset_index()
        .rename(columns={"Normalized_Entropy": "Entropy"})
    )

    # A plain mapping is used instead of np.select: mixing string choices
    # with a NaN default has no common dtype and fails on recent NumPy.
    avg_entropy_df["Category"] = avg_entropy_df["Rubric Item"].map(category_map)

    avg_entropy_df_by_category = (
        avg_entropy_df
        .groupby(["Scorer Name", "Category", "codebook_type"])["Entropy"]
        .mean()
        .reset_index()
    )

    avg_entropy_df_by_category["Scorer Name"] = (
        avg_entropy_df_by_category["Scorer Name"]
        .replace(scorer_name_map)
    )

    pivot_df = (
        avg_entropy_df_by_category
        .pivot_table(
            index="Scorer Name",
            columns=["Category", "codebook_type"],
            values="Entropy",
        )
        .reindex(scorer_order)
        .round(2)
    )

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    pivot_df.to_csv(output_path)
    return pivot_df
