# Calculate Intraclass Correlation Coefficients (ICCs)

## Imports

In [None]:
import pandas as pd
import pingouin as pg
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt

## Define Global Variables

In [None]:
# --- Rubric items ---------------------------------------------------------
# Items are split into two categories; `rubric_items` lists the reasoning
# items first, followed by the hiring-priority items.
reasoning_cols = [
    "Engagement with Evidence",
    "Goal Orientation",
    "Collaborative Decision-Making",
    "Strategic Planning",
]
hire_cols = [
    "Cultural Responsiveness",
    "Parent/Community Engagement",
    "Academic Achievement",
    "Candidate Experience/Expertise",
    "Evaluation",
    "School Culture Fit",
]
rubric_items = [*reasoning_cols, *hire_cols]

# --- Scorers --------------------------------------------------------------
# Values that appear in the "Scorer Name" column, grouped by who produced them.
human_scorers = ["JR", "MF"]
human_avg_score = ["human avg"]
model_scorers = [
    "gpt-4.1-mini",
    "gpt-5",
    "us.anthropic.claude-sonnet-4-20250514-v1:0",
]

# Raw scorer identifier -> display name.
valid_scorer_names = {
    "us.anthropic.claude-sonnet-4-20250514-v1:0": "Claude Sonnet 4",
    "gpt-4.1-mini": "GPT-4.1 mini",
    "gpt-5": "GPT-5",
    "human": "Trained Annotators",
}

# --- Codebook (prompt) variants -------------------------------------------
# Raw codebook identifier -> display name; the identifier list is kept as an
# explicit literal so its order does not depend on the mapping.
codebook_type_names = {
    "zero-shot": "Zero-Shot",
    "examples": "Few-Shot",
    "cot_examples": "CoT",
}
codebook_types = ["zero-shot", "examples", "cot_examples"]

## Load Data

In [None]:
# Read one score file per codebook type and tag every row with its source.
dfs = []
for ctype in codebook_types:
    df = pd.read_csv(
        f"./scores/filtered/{ctype}_df.csv",
        header=0, index_col=0
    )
    df["codebook type"] = ctype
    # Human rows are tagged "human" instead of the file's codebook type, so
    # human rows that repeat across files become identical and are collapsed
    # by drop_duplicates() below.
    df.loc[df["Scorer Name"].isin(human_scorers), "codebook type"] = "human"
    dfs.append(df)

# Concatenate codebook files
scores_df = pd.concat(dfs, ignore_index=True).drop_duplicates()

# Drop the pre-computed human-average rows. The explicit .copy() makes
# scores_df an independent frame, so the .loc assignments below do not write
# into a slice of the concatenated frame (SettingWithCopyWarning).
scores_df = scores_df[~scores_df["Scorer Name"].isin(human_avg_score)].copy()

# Add "Scoring Group" column (left as NaN for any scorer in neither list)
scores_df.loc[scores_df["Scorer Name"].isin(model_scorers), "Scoring Group"] = "model"
scores_df.loc[scores_df["Scorer Name"].isin(human_scorers), "Scoring Group"] = "human"

# Notebook display: list the scorers that remain after filtering.
scores_df['Scorer Name'].unique()

## Calculate Human-Human ICC

In [None]:
def _global_rubric_config():
    """Return the notebook-level (rubric_items, reasoning_cols, hire_cols) lists."""
    return rubric_items, reasoning_cols, hire_cols


def compute_human_icc(scores_df, rubric_items=None, reasoning_cols=None, hire_cols=None):
    """
    Compute ICC(2,1) for human scorers per rubric item,
    and average ICCs for Reasoning and Hiring Priorities categories.

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame containing scores with columns 'record_id', 'Scorer Name',
        'Scoring Group', and rubric items
    rubric_items : list, optional
        List of all rubric items to compute ICC for. Defaults to the
        notebook-level ``rubric_items``.
    reasoning_cols : list, optional
        List of rubric items corresponding to Reasoning. Defaults to the
        notebook-level ``reasoning_cols``.
    hire_cols : list, optional
        List of rubric items corresponding to Hiring Priorities. Defaults to
        the notebook-level ``hire_cols``.

    Returns
    -------
    icc_human_df : pd.DataFrame
        DataFrame with columns 'Rubric Item', 'ICC(2,1)' and 'CI95%'; one row
        per rubric item for which an ICC could be computed (possibly empty).
    human_iccs : dict
        Dictionary with averaged ICCs for 'Reasoning' and 'Hiring Priorities'
        (NaN when no item of that category produced an ICC).
    """
    # Fall back to the notebook-level lists only for arguments left unset.
    if any(arg is None for arg in (rubric_items, reasoning_cols, hire_cols)):
        default_items, default_reasoning, default_hire = _global_rubric_config()
        rubric_items = default_items if rubric_items is None else rubric_items
        reasoning_cols = default_reasoning if reasoning_cols is None else reasoning_cols
        hire_cols = default_hire if hire_cols is None else hire_cols

    result_columns = ["Rubric Item", "ICC(2,1)", "CI95%"]
    icc_human_results = []

    # The human-row filter does not depend on the item, so apply it once.
    human_scores = scores_df[scores_df["Scoring Group"] == "human"]

    for item in rubric_items:
        # Long format expected by pingouin: one row per (record, rater) score.
        group_human_long = (
            human_scores[["record_id", "Scorer Name", item]]
            .dropna()
            .rename(columns={item: "score", "Scorer Name": "rater"})
        )

        # Only compute ICC if enough raters and records
        if group_human_long["rater"].nunique() < 2 or group_human_long["record_id"].nunique() < 2:
            continue

        icc_human = pg.intraclass_corr(
            data=group_human_long, targets='record_id', raters='rater', ratings='score'
        )
        icc2 = icc_human[icc_human["Type"] == "ICC2"]
        if icc2.empty:
            continue

        icc_human_results.append({
            "Rubric Item": item,
            "ICC(2,1)": icc2["ICC"].values[0],
            "CI95%": icc2["CI95%"].values[0]
        })

    # Fixed columns keep the lookups below valid even when no item qualified
    # (a column-less empty frame would raise KeyError on "Rubric Item").
    icc_human_df = pd.DataFrame(icc_human_results, columns=result_columns)
    # An empty frame gets object columns; force a numeric dtype so .mean()
    # is well defined and yields NaN in that case.
    icc_human_df["ICC(2,1)"] = icc_human_df["ICC(2,1)"].astype(float)

    # Compute averages by category
    icc_human_reason_avg = icc_human_df.loc[
        icc_human_df["Rubric Item"].isin(reasoning_cols), "ICC(2,1)"
    ].mean()

    icc_human_hire_avg = icc_human_df.loc[
        icc_human_df["Rubric Item"].isin(hire_cols), "ICC(2,1)"
    ].mean()

    human_iccs = {
        "Reasoning": icc_human_reason_avg,
        "Hiring Priorities": icc_human_hire_avg
    }

    return icc_human_df, human_iccs

# Run the human-human analysis on the loaded scores: icc_human_df holds the
# per-item ICC(2,1) rows, human_iccs the "Reasoning" / "Hiring Priorities" averages.
icc_human_df, human_iccs = compute_human_icc(scores_df)

## Calculate Model-Human ICC

In [None]:
def _global_model_icc_config():
    """Return the notebook-level defaults used by ``compute_model_icc``."""
    return rubric_items, codebook_types, valid_scorer_names, codebook_type_names


def compute_model_icc(scores_df, rubric_items=None, codebook_types=None,
                      valid_scorer_names=None, codebook_type_names=None):
    """
    Compute ICC(2,1) for model raters against human averages, grouped by codebook type and rubric item.

    For every codebook type, rubric item and model rater, the model's scores
    and the per-record mean of the human scores are treated as two raters of
    the same records and passed to ``pg.intraclass_corr``.

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame containing scores with columns 'record_id', 'Scorer Name',
        'Scoring Group', 'codebook type', and rubric items
    rubric_items : list, optional
        List of all rubric items to compute ICC for. Defaults to the
        notebook-level ``rubric_items``.
    codebook_types : list, optional
        List of codebook types to compute ICC for. Defaults to the
        notebook-level ``codebook_types``.
    valid_scorer_names : dict, optional
        Mapping to rename raters. Defaults to the notebook-level
        ``valid_scorer_names``; pass an empty dict to keep raw names.
    codebook_type_names : dict, optional
        Mapping to rename codebook types. Defaults to the notebook-level
        ``codebook_type_names``; pass an empty dict to keep raw names.

    Returns
    -------
    icc_summary_df : pd.DataFrame
        DataFrame with columns 'Codebook Type', 'Rubric Item', 'Rater',
        'ICC(2,1)' and 'CI95%'; one row per rater, rubric item, and codebook
        type for which an ICC could be computed (possibly empty).
    """
    # Fall back to the notebook-level configuration only for unset arguments.
    if any(arg is None for arg in
           (rubric_items, codebook_types, valid_scorer_names, codebook_type_names)):
        (default_items, default_ctypes,
         default_scorer_names, default_ctype_names) = _global_model_icc_config()
        rubric_items = default_items if rubric_items is None else rubric_items
        codebook_types = default_ctypes if codebook_types is None else codebook_types
        valid_scorer_names = (
            default_scorer_names if valid_scorer_names is None else valid_scorer_names
        )
        codebook_type_names = (
            default_ctype_names if codebook_type_names is None else codebook_type_names
        )

    result_columns = ["Codebook Type", "Rubric Item", "Rater", "ICC(2,1)", "CI95%"]
    all_icc_results = []

    is_human = scores_df["Scoring Group"] == "human"
    is_model = scores_df["Scoring Group"] == "model"

    # Loop through each codebook type
    for ctype in codebook_types:
        # Rows scored under this codebook plus the rows tagged "human";
        # split by scoring group once instead of once per rubric item.
        in_scope = scores_df["codebook type"].isin([ctype, "human"])
        human_rows = scores_df[in_scope & is_human]
        model_rows = scores_df[in_scope & is_model]

        # Loop through each rubric item
        for item in rubric_items:

            # Group Human averages per subject
            group_human_means = (
                human_rows
                .groupby("record_id")[item]
                .mean()
                .reset_index()
                .rename(columns={item: "group_human_avg"})
            )

            # Group Model individual rater scores
            group_model = model_rows[["record_id", "Scorer Name", item]].dropna()

            # Keep only records scored by both the model and the humans
            merged = pd.merge(group_model, group_human_means, on="record_id", how="inner").dropna()

            # sort=False keeps raters in order of first appearance.
            for rater, rater_rows in merged.groupby("Scorer Name", sort=False):
                rater_data = rater_rows[["record_id", item, "group_human_avg"]]

                # Same minimum-records guard as compute_human_icc: an ICC
                # needs more than one record to be defined.
                if rater_data["record_id"].nunique() < 2:
                    continue

                # Melt to long format for pingouin: the model's score column
                # and the human average act as the two "raters".
                icc_data = rater_data.melt(
                    id_vars="record_id",
                    value_vars=[item, "group_human_avg"],
                    var_name="rater",
                    value_name="score"
                )

                icc = pg.intraclass_corr(
                    data=icc_data, targets="record_id", raters="rater", ratings="score"
                )
                icc2 = icc[icc["Type"] == "ICC2"]
                if icc2.empty:
                    continue

                all_icc_results.append({
                    "Codebook Type": ctype,
                    "Rubric Item": item,
                    "Rater": rater,
                    "ICC(2,1)": icc2["ICC"].values[0],
                    "CI95%": icc2["CI95%"].values[0]
                })

    # Fixed columns keep the renaming below valid even when nothing qualified
    # (a column-less empty frame would raise KeyError on "Rater").
    icc_summary_df = pd.DataFrame(all_icc_results, columns=result_columns)

    # Rename raters and codebook types
    if valid_scorer_names:
        icc_summary_df["Rater"] = icc_summary_df["Rater"].replace(valid_scorer_names)
    if codebook_type_names:
        icc_summary_df["Codebook Type"] = icc_summary_df["Codebook Type"].replace(codebook_type_names)

    return icc_summary_df

# Compute model-vs-human ICCs for every codebook type, rubric item and model.
icc_summary_df = compute_model_icc(scores_df)

# Save combined ICC results. The output folder is created first because
# DataFrame.to_csv does not create missing parent directories.
from pathlib import Path  # cell-local import: only this cell needs it

icc_output_path = Path("output/consistency/icc/all_ICC_results.csv")
icc_output_path.parent.mkdir(parents=True, exist_ok=True)
icc_summary_df.to_csv(icc_output_path, index=False)