# Intraclass Correlation

## Imports

In [None]:
import pandas as pd
import pingouin as pg
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt

## Define Global Variables

In [None]:
# Rubric items, split into the two categories that are reported separately.
reasoning_cols = [
    "Engagement with Evidence",
    "Goal Orientation",
    "Collaborative Decision-Making",
    "Strategic Planning",
]
hire_cols = [
    "Cultural Responsiveness",
    "Parent/Community Engagement",
    "Academic Achievement",
    "Candidate Experience/Expertise",
    "Evaluation",
    "School Culture Fit",
]

# Every rubric item: Reasoning items first, then Hiring Priorities items.
rubric_items = [*reasoning_cols, *hire_cols]

# Values of the "Scorer Name" column that identify model scorers, human
# scorers, and the pre-computed human-average rows.
model_scorers = [
    "gpt-4.1-mini",
    "gpt-5",
    "us.anthropic.claude-sonnet-4-20250514-v1:0",
]
human_scorers = ["JR", "MF"]
human_avg_score = ["human avg"]

# Display labels for scorers (raw scorer name -> label used in tables/plots).
valid_scorer_names = {
    "us.anthropic.claude-sonnet-4-20250514-v1:0": "Claude Sonnet 4",
    "gpt-4.1-mini": "GPT-4.1 mini",
    "gpt-5": "GPT-5",
    "human": "Trained Annotators",
}

# Display labels for codebook types (raw codebook type -> label).
codebook_type_names = {
    "zero-shot": "Zero-Shot",
    "examples": "Few-Shot",
    "cot_examples": "CoT",
}

# Raw codebook types, in the same order as the label mapping above.
codebook_types = list(codebook_type_names)

## Load Data

In [None]:
# Read one score file per codebook type and tag every row with the codebook
# it was produced under.
dfs = []
for ctype in codebook_types:
    df = pd.read_csv(
        f"./scores/filtered/{ctype}_df.csv",
        header=0, index_col=0
    )
    df["codebook type"] = ctype
    # Rows from human scorers are labelled "human" regardless of which file
    # they were read from.
    df.loc[df["Scorer Name"].isin(human_scorers), "codebook type"] = "human"
    dfs.append(df)

# Concatenate codebook files; rows that are exact duplicates across files
# (presumably the human rows repeated in each file -- verify) collapse to one.
scores_df = pd.concat(dfs, ignore_index=True).drop_duplicates()

# Drop the pre-computed "human avg" rows. The explicit .copy() makes the
# filtered frame independent, so the .loc assignments below do not raise
# SettingWithCopyWarning.
scores_df = scores_df[scores_df["Scorer Name"] != "human avg"].copy()

# Add "Scoring Group" column ("model" / "human"; any other scorer stays NaN)
scores_df.loc[scores_df["Scorer Name"].isin(model_scorers), "Scoring Group"] = "model"
scores_df.loc[scores_df["Scorer Name"].isin(human_scorers), "Scoring Group"] = "human"

scores_df['Scorer Name'].unique()
# scores_df.to_csv('./scores/filtered dataframes/test_df.csv', index=False)

## Calculate Human-Human ICC

In [None]:
def compute_human_icc(scores_df, rubric_items=None, reasoning_cols=None, hire_cols=None):
    """
    Compute ICC(2,1) between human scorers per rubric item, and average the
    ICCs over the Reasoning and Hiring Priorities categories.

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame containing scores with columns 'record_id', 'Scorer Name',
        'Scoring Group', and one column per rubric item
    rubric_items : list, optional
        Rubric items to compute ICC for. Defaults to the notebook-level
        ``rubric_items``.
    reasoning_cols : list, optional
        Rubric items corresponding to Reasoning. Defaults to the
        notebook-level ``reasoning_cols``.
    hire_cols : list, optional
        Rubric items corresponding to Hiring Priorities. Defaults to the
        notebook-level ``hire_cols``.

    Returns
    -------
    icc_human_df : pd.DataFrame
        One row per rubric item for which an ICC could be computed, with
        columns 'Rubric Item', 'ICC(2,1)' and 'CI95%'. Empty (but with those
        columns) when no item had at least two raters and two records.
    human_iccs : dict
        Mean ICC(2,1) for 'Reasoning' and 'Hiring Priorities' (NaN when no
        item of that category was computed)
    """
    # The parameters share their names with the notebook globals they default
    # to, so the globals have to be looked up explicitly.
    if rubric_items is None:
        rubric_items = globals()["rubric_items"]
    if reasoning_cols is None:
        reasoning_cols = globals()["reasoning_cols"]
    if hire_cols is None:
        hire_cols = globals()["hire_cols"]

    human_rows = scores_df[scores_df["Scoring Group"] == "human"]
    icc_human_results = []

    for item in rubric_items:
        # Long-format ratings for this item, without missing scores
        ratings = (
            human_rows[["record_id", "Scorer Name", item]]
            .dropna()
            .rename(columns={item: "score", "Scorer Name": "rater"})
        )

        # Only compute ICC if there is more than one rater and one record
        if ratings["rater"].nunique() <= 1 or ratings["record_id"].nunique() <= 1:
            continue

        icc_table = pg.intraclass_corr(
            data=ratings, targets="record_id", raters="rater", ratings="score"
        )
        icc2 = icc_table[icc_table["Type"] == "ICC2"]
        if icc2.empty:
            continue

        icc_human_results.append({
            "Rubric Item": item,
            "ICC(2,1)": icc2["ICC"].values[0],
            "CI95%": icc2["CI95%"].values[0],
        })

    # Explicit columns keep the frame usable when no ICC could be computed.
    icc_human_df = pd.DataFrame(
        icc_human_results, columns=["Rubric Item", "ICC(2,1)", "CI95%"]
    )

    def _category_mean(items):
        in_category = icc_human_df["Rubric Item"].isin(items)
        # astype(float) so an empty (object-dtype) column still averages to NaN
        return icc_human_df.loc[in_category, "ICC(2,1)"].astype(float).mean()

    human_iccs = {
        "Reasoning": _category_mean(reasoning_cols),
        "Hiring Priorities": _category_mean(hire_cols),
    }

    return icc_human_df, human_iccs

# Human-human agreement on the full score table.
icc_human_df, human_iccs = compute_human_icc(scores_df)

# Only the last expression of a notebook cell is displayed, so this cell
# shows `human_iccs`; the bare `icc_human_df` line has no visible effect.
icc_human_df
human_iccs

## Calculate Model-Human ICC

In [None]:

def compute_model_icc(scores_df, rubric_items=None, codebook_types=None,
                      valid_scorer_names=None, codebook_type_names=None):
    """
    Compute ICC(2,1) between each model rater and the per-record human
    average, for every codebook type and rubric item.

    For each (codebook type, rubric item, model rater) the model's scores and
    the mean of the human scores on the same records are treated as two
    raters and passed to ``pingouin.intraclass_corr``.

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame containing scores with columns 'record_id', 'Scorer Name',
        'Scoring Group', 'codebook type', and one column per rubric item
    rubric_items : list, optional
        Rubric items to compute ICC for. Defaults to the notebook-level
        ``rubric_items``.
    codebook_types : list, optional
        Codebook types to compute ICC for. Defaults to the notebook-level
        ``codebook_types``.
    valid_scorer_names : dict, optional
        Mapping used to rename raters in the result. Defaults to the
        notebook-level ``valid_scorer_names``; pass ``{}`` to skip renaming.
    codebook_type_names : dict, optional
        Mapping used to rename codebook types in the result. Defaults to the
        notebook-level ``codebook_type_names``; pass ``{}`` to skip renaming.

    Returns
    -------
    icc_summary_df : pd.DataFrame
        Columns 'Codebook Type', 'Rubric Item', 'Rater', 'ICC(2,1)', 'CI95%';
        one row per combination for which an ICC could be computed. Empty
        (but with those columns) when nothing could be computed.
    """
    # The parameters share their names with the notebook globals they default
    # to, so the globals have to be looked up explicitly.
    if rubric_items is None:
        rubric_items = globals()["rubric_items"]
    if codebook_types is None:
        codebook_types = globals()["codebook_types"]
    if valid_scorer_names is None:
        valid_scorer_names = globals()["valid_scorer_names"]
    if codebook_type_names is None:
        codebook_type_names = globals()["codebook_type_names"]

    result_columns = ["Codebook Type", "Rubric Item", "Rater", "ICC(2,1)", "CI95%"]
    all_icc_results = []

    for ctype in codebook_types:
        # Model rows scored with this codebook, plus the human rows
        df_sub = scores_df[scores_df["codebook type"].isin([ctype, "human"])]
        human_rows = df_sub[df_sub["Scoring Group"] == "human"]
        model_rows = df_sub[df_sub["Scoring Group"] == "model"]

        for item in rubric_items:
            # Mean human score per record
            group_human_means = (
                human_rows
                .groupby("record_id")[item]
                .mean()
                .reset_index()
                .rename(columns={item: "group_human_avg"})
            )

            # Individual model scores, without missing values
            group_model = model_rows[["record_id", "Scorer Name", item]].dropna()

            # Keep only records that have both a model score and a human mean
            merged = pd.merge(
                group_model, group_human_means, on="record_id", how="inner"
            ).dropna()

            for rater in merged["Scorer Name"].unique():
                rater_data = merged.loc[
                    merged["Scorer Name"] == rater,
                    ["record_id", item, "group_human_avg"],
                ]

                # Same guard as the human-human computation: an ICC needs
                # more than one record.
                if rater_data["record_id"].nunique() <= 1:
                    continue

                # Long format for pingouin: the model's score and the human
                # average act as the two "raters" of each record.
                icc_data = rater_data.melt(
                    id_vars="record_id",
                    value_vars=[item, "group_human_avg"],
                    var_name="rater",
                    value_name="score",
                )

                icc_table = pg.intraclass_corr(
                    data=icc_data, targets="record_id", raters="rater", ratings="score"
                )
                icc2 = icc_table[icc_table["Type"] == "ICC2"]
                if icc2.empty:
                    continue

                all_icc_results.append({
                    "Codebook Type": ctype,
                    "Rubric Item": item,
                    "Rater": rater,
                    "ICC(2,1)": icc2["ICC"].values[0],
                    "CI95%": icc2["CI95%"].values[0],
                })

    # Explicit columns keep the renaming below (and callers) working when no
    # ICC could be computed.
    icc_summary_df = pd.DataFrame(all_icc_results, columns=result_columns)

    # Rename raters and codebook types to their display labels
    if valid_scorer_names:
        icc_summary_df["Rater"] = icc_summary_df["Rater"].replace(valid_scorer_names)
    if codebook_type_names:
        icc_summary_df["Codebook Type"] = icc_summary_df["Codebook Type"].replace(codebook_type_names)

    return icc_summary_df

# Model-vs-human-average ICCs on the full score table.
icc_summary_df = compute_model_icc(scores_df)

# Save combined ICC results
icc_summary_df.to_csv("output/consistency/icc/all_ICC_results.csv", index=False)

icc_summary_df.head()


### Create ICC Model-Human Table

In [None]:


def create_icc_comparison_df(icc_summary_df, human_iccs, top_level_order=None, second_level_order=None,
                             round_decimals=2, reasoning_cols=None, hire_cols=None):
    """
    Create a comparison DataFrame of mean ICCs per rater, with two-level
    columns (category, codebook type), plus a "Trained Annotators" row filled
    from the provided human ICCs.

    Parameters
    ----------
    icc_summary_df : pd.DataFrame
        DataFrame with columns ['Rater', 'Codebook Type', 'Rubric Item', 'ICC(2,1)']
    human_iccs : dict
        Averaged human ICCs keyed by 'Reasoning' and 'Hiring Priorities'
    top_level_order : list, optional
        Desired order for top-level columns (default ['Reasoning', 'Hiring Priorities'])
    second_level_order : list, optional
        Desired order for second-level columns (default: the codebook types
        in order of first appearance in ``icc_summary_df``)
    round_decimals : int, optional
        Number of decimals to round the final DataFrame
    reasoning_cols : list, optional
        Rubric items corresponding to Reasoning. Defaults to the
        notebook-level ``reasoning_cols``.
    hire_cols : list, optional
        Rubric items corresponding to Hiring Priorities. Defaults to the
        notebook-level ``hire_cols``.

    Returns
    -------
    df_final : pd.DataFrame
        One row per rater plus "Trained Annotators"; MultiIndex columns
        (category, codebook type). Requested columns with no data are NaN.
    """
    # The parameters share their names with the notebook globals they default
    # to, so the globals have to be looked up explicitly.
    if reasoning_cols is None:
        reasoning_cols = globals()["reasoning_cols"]
    if hire_cols is None:
        hire_cols = globals()["hire_cols"]
    categories = {"Reasoning": reasoning_cols, "Hiring Priorities": hire_cols}

    # Raters and codebook types in order of first appearance. Converted to
    # lists: truth-testing the NumPy array returned by .unique() raises
    # ValueError as soon as it holds more than one element.
    raters = list(icc_summary_df["Rater"].unique())
    observed_ctypes = list(icc_summary_df["Codebook Type"].unique())

    top_level_order = list(categories) if top_level_order is None else list(top_level_order)
    second_level_order = observed_ctypes if second_level_order is None else list(second_level_order)

    columns = pd.MultiIndex.from_product([list(categories), observed_ctypes])

    # One row per rater: mean ICC over the rubric items of each category,
    # separately for every codebook type.
    rows = []
    for rater in raters:
        rater_rows = icc_summary_df[icc_summary_df["Rater"] == rater]
        rows.append([
            rater_rows.loc[
                (rater_rows["Codebook Type"] == ctype)
                & rater_rows["Rubric Item"].isin(categories[category]),
                "ICC(2,1)",
            ].mean()
            for category, ctype in columns
        ])
    df_final = pd.DataFrame(rows, index=raters, columns=columns)

    # Add the Trained Annotators row: the human ICC of a category is repeated
    # under every codebook type of that category.
    df_final.loc["Trained Annotators"] = [human_iccs[category] for category, _ in columns]

    # Round values
    df_final = df_final.round(round_decimals)

    # Reorder (and possibly subset/extend) the columns if an order is given
    if top_level_order and second_level_order:
        ordered = pd.MultiIndex.from_product([top_level_order, second_level_order])
        df_final = df_final.reindex(columns=ordered)

    return df_final

# Build the rater x (category, codebook type) table from the model-human
# ICCs and the human-human category averages computed above.
df_final = create_icc_comparison_df(
    icc_summary_df,
    top_level_order=["Reasoning", "Hiring Priorities"],
    second_level_order=["Zero-Shot", "Few-Shot", "CoT"],
    human_iccs=human_iccs
)

# Save to CSV
df_final.to_csv("output/consistency/icc/icc_comparison_df.csv")

df_final


## Calculate Model-Human ICC with Training Data

### Load data

In [None]:
# Load the iterations_check scores
iter_scores_df = pd.read_csv("./scores/filtered/training_data_df.csv")

# Every row is tagged as zero-shot, then rows from human scorers are
# re-labelled "human".
iter_scores_df["codebook type"] = "zero-shot"
iter_scores_df.loc[iter_scores_df["Scorer Name"].isin(human_scorers), "codebook type"] = "human"

# Drop the pre-computed "human avg" rows. The explicit .copy() makes the
# filtered frame independent, so the .loc assignments below do not raise
# SettingWithCopyWarning.
iter_scores_df = iter_scores_df[iter_scores_df["Scorer Name"] != "human avg"].copy()

# Add "Scoring Group" column ("model" / "human"; any other scorer stays NaN)
iter_scores_df.loc[iter_scores_df["Scorer Name"].isin(model_scorers), "Scoring Group"] = "model"
iter_scores_df.loc[iter_scores_df["Scorer Name"].isin(human_scorers), "Scoring Group"] = "human"

iter_scores_df

In [None]:
# Repeat the human-human and model-human ICC computation on the training
# data. This overwrites the `icc_human_df`, `human_iccs` and `icc_summary_df`
# globals produced from the full score table.
icc_human_df, human_iccs = compute_human_icc(iter_scores_df)

icc_summary_df = compute_model_icc(iter_scores_df)

# The training data is tagged as zero-shot only, so only that column is kept.
df_iterations = create_icc_comparison_df(
    icc_summary_df,
    top_level_order=["Reasoning", "Hiring Priorities"],
    second_level_order=["Zero-Shot"],
    human_iccs=human_iccs
)


df_iterations

In [None]:
# Recompute the ICCs on the full score table (`scores_df`) and show only the
# Zero-Shot columns. This resets `icc_human_df`, `human_iccs` and
# `icc_summary_df` to their full-sample values and overwrites `df_iterations`.
icc_human_df, human_iccs = compute_human_icc(scores_df)

icc_summary_df = compute_model_icc(scores_df)

df_iterations = create_icc_comparison_df(
    icc_summary_df,
    top_level_order=["Reasoning", "Hiring Priorities"],
    second_level_order=["Zero-Shot"],
    human_iccs=human_iccs
)


df_iterations

## Calculate Model-Human ICC by Subsample Size

### Load Data

In [None]:
# Reload the per-codebook score files and rebuild `scores_df` for the
# subsampling analysis.
# NOTE(review): this cell reads from "./scores/filtered dataframes/" whereas
# the first load cell reads from "./scores/filtered/" -- confirm which
# directory is intended.
dfs = []
for ctype in codebook_types:
    df = pd.read_csv(
        f"./scores/filtered dataframes/{ctype}_df.csv",
        header=0, index_col=0
    )
    df["codebook type"] = ctype
    # Rows from human scorers are labelled "human" regardless of which file
    # they were read from.
    df.loc[df["Scorer Name"].isin(human_scorers), "codebook type"] = "human"
    dfs.append(df)

# Concatenate codebook files and drop rows that are exact duplicates
scores_df = pd.concat(dfs, ignore_index=True).drop_duplicates()

# Drop the pre-computed "human avg" rows. The explicit .copy() makes the
# filtered frame independent, so the .loc assignments below do not raise
# SettingWithCopyWarning.
scores_df = scores_df[scores_df["Scorer Name"] != "human avg"].copy()

# Add "Scoring Group" column ("model" / "human"; any other scorer stays NaN)
scores_df.loc[scores_df["Scorer Name"].isin(model_scorers), "Scoring Group"] = "model"
scores_df.loc[scores_df["Scorer Name"].isin(human_scorers), "Scoring Group"] = "human"


print(len(scores_df["record_id"].unique()))
scores_df.shape
# Expected rows = unique IDs * number of models * number of codebooks
#                 + unique IDs * number of human scorers.
# NOTE(review): the earlier note here assumed 94 IDs and 6 models
# (94*6*3 + 94*2 = 1880 rows); `model_scorers` currently lists 3 models --
# confirm the expected count against the data.

scores_df.head()

### Calculate ICCs

In [None]:
import warnings
# Silences every warning for the rest of the session, not only in this cell.
warnings.filterwarnings("ignore")

# Parameters
subsample_percents = range(4, 50, 2)  # 4%, 6%, ..., 48% of the record IDs
num_repeats = 50  # number of subsampling repetitions per percentage

# Function to compute ICCs for a given subsample
def compute_iccs_for_subsample(df):
    """Return the rater x (category, codebook) ICC table computed on `df`.

    Runs the human-human and model-human ICC computations on `df`, prints
    the human category averages, and combines both into the comparison
    table with Zero-Shot / Few-Shot / CoT columns.
    """
    # These functions already handle codebook_type and rubric_item internally
    icc_human_df, human_iccs = compute_human_icc(df)
    print(human_iccs)
    icc_summary_df = compute_model_icc(df)
    df_iterations = create_icc_comparison_df(
        icc_summary_df,
        top_level_order=["Reasoning", "Hiring Priorities"],
        second_level_order=["Zero-Shot", "Few-Shot", "CoT"],
        human_iccs=human_iccs
    )
    return df_iterations

# Collect results
results = []

for pct in subsample_percents:
    # Number of record IDs to draw for this percentage (rounded down)
    n_ids = int(scores_df["record_id"].nunique() * pct / 100)

    for rep in range(num_repeats):
        # Seeding with the repeat number makes each draw reproducible
        sampled_ids = (
            scores_df["record_id"]
            .drop_duplicates()
            .sample(n=n_ids, replace=False, random_state=rep)
        )
        # Keep every row (all scorers, all codebooks) of the sampled records
        subsample_df = scores_df[scores_df["record_id"].isin(sampled_ids)]
        print("PCT of subsamples: ", pct)
        print("number of IDs: ", len(subsample_df["record_id"].unique()))
        
        icc_df = compute_iccs_for_subsample(subsample_df)
        # NOTE(review): with an unnamed index, reset_index appears to name the
        # rater column 'level_0', so the 'Annotator' rename below does not
        # apply; the later aggregation and plotting cells read 'level_0'.
        icc_long = (
            icc_df
            .stack(level=[0, 1])  # stack both levels of columns
            .reset_index()        # bring index and stacked levels into columns
            .rename(columns={icc_df.index.name or 'index': 'Annotator', 0: 'ICC', 'level_1': 'Rubric', 'level_2': 'Codebook'})
        )

        icc_long["Subsample"] = pct
        icc_long["Repeat"] = rep
        results.append(icc_long)

# Combine all results
results_df = pd.concat(results, ignore_index=True)

In [None]:
def mean_ci(x):
    """Return the mean of `x` with a normal-approximation 95% interval.

    The half-width is 1.96 times the standard error, using the sample
    standard deviation (ddof=1). The result is a Series indexed by
    'mean', 'lower' and 'upper'.
    """
    center = np.mean(x)
    standard_error = np.std(x, ddof=1) / np.sqrt(len(x))
    half_width = 1.96 * standard_error
    return pd.Series(
        [center, center - half_width, center + half_width],
        index=['mean', 'lower', 'upper'],
    )

# Aggregate across repeats using apply: one mean/lower/upper triple per
# (subsample %, rater, category, codebook). 'level_0' is the rater column.
agg_results = (
    results_df
    .groupby(['Subsample', 'level_0', 'Rubric', 'Codebook'])['ICC']
    .apply(mean_ci)
    .reset_index()
)

# apply() stacks mean/lower/upper as an extra unnamed index level, which
# reset_index exposes as 'level_4'; pivot it back into three columns.
agg_results_wide = agg_results.pivot_table(
    index=['Subsample', 'level_0', 'Rubric', 'Codebook'],
    columns='level_4',
    values='ICC'
).reset_index()

# agg_results
agg_results_wide
agg_results_wide.to_csv("output/consistency/icc/iteration_check_iccs_4_50_2.csv")

### Plot Results

In [None]:
# Reload the aggregated subsample ICCs from disk, so the plotting cells can
# run without repeating the subsampling.
agg_results_wide = pd.read_csv(
    "./output/consistency/icc/iteration_check_iccs_4_50_2.csv",
    index_col=0,
    header=0,
)
agg_results_wide

In [None]:
# Order in which raters are drawn (and therefore listed in the legend).
model_order = ["GPT-5", "GPT-4.1 mini", "Claude Sonnet 4", "Trained Annotators"]

# Line colour per rater label. "Trained Annotators" and "Human Scorers" share
# a colour because the plotting function relabels the former as the latter.
color_map = {
    "GPT-5": "#426737",
    "GPT-4.1 mini": "#bab97d",
    "Claude Sonnet 4": "#f9b40d",
    "Trained Annotators": "#000",
    "Human Scorers": "#000",
}

In [None]:

# Full-sample ICC table: rows are rater labels, columns are
# (category, codebook type).
full_sample_iccs = pd.read_csv(
    "./output/consistency/icc/icc_comparison_df.csv", header=[0, 1], index_col=0
)

# --- Compute "within 10%" per model-codebook-rubric combination ---
# For each combination, find the smallest subsample percentage whose mean ICC
# lies within 10% (and within 5%) of the full-sample ICC.
summary_rows = []

for (rubric, codebook, model), sub in agg_results_wide.groupby(
    ['Rubric', 'Codebook', 'level_0'], sort=False
):
    # Look the rater up by label: a hard-coded row-position map breaks as
    # soon as the set or order of raters in the CSV changes.
    full_icc = full_sample_iccs.loc[model, (rubric, codebook)]

    # Relative distance from the full-sample ICC. Dividing by the magnitude
    # keeps the ratio non-negative when the full-sample ICC is negative.
    diff_pct = (sub['mean'] - full_icc).abs() / abs(full_icc)

    summary_rows.append({
        'Rubric': rubric,
        'Codebook': codebook,
        'Model': model,
        'Full ICC': full_icc,
        # NaN when no subsample size gets close enough
        'Min Subsample (10%)': sub.loc[diff_pct <= 0.10, 'Subsample'].min(),
        'Min Subsample (5%)': sub.loc[diff_pct <= 0.05, 'Subsample'].min(),
    })

summary_df = pd.DataFrame(summary_rows)

# Average over models only; the human row is not part of the summary.
summary_df = summary_df[summary_df['Model'] != 'Trained Annotators']
grouped_summary_df = (
    summary_df
    .groupby(['Rubric', 'Codebook'])[['Full ICC', 'Min Subsample (10%)', 'Min Subsample (5%)']]
    .mean()
)
grouped_summary_df


In [None]:
# IndexSlice gives readable label-based selection on the (Rubric, Codebook)
# MultiIndex.
idx = pd.IndexSlice

# For every rubric, average the three per-codebook rows into one 'Combined'
# row. Each entry is ((rubric, 'Combined'), {column: mean across codebooks}).
combined_rows = []
for rubric in grouped_summary_df.index.get_level_values('Rubric').unique():
    per_codebook = grouped_summary_df.loc[idx[rubric, ['CoT', 'Few-Shot', 'Zero-Shot']], :]
    averages = {
        column: per_codebook[column].mean()
        for column in ('Full ICC', 'Min Subsample (10%)', 'Min Subsample (5%)')
    }
    combined_rows.append(((rubric, 'Combined'), averages))

# Turn the collected rows into a frame indexed like grouped_summary_df
combined_index = pd.MultiIndex.from_tuples(
    [key for key, _ in combined_rows],
    names=grouped_summary_df.index.names,
)
combined_df = pd.DataFrame(
    [values for _, values in combined_rows],
    index=combined_index,
)

# Append the 'Combined' rows and restore sorted index order
grouped_summary_df = pd.concat([grouped_summary_df, combined_df]).sort_index()

grouped_summary_df


In [None]:
def _mark_threshold(ax, x, color, label):
    """Draw a dashed vertical line at `x` with an "<x>%" caption; skip NaN."""
    if pd.isna(x):
        # No subsample size reached this threshold: nothing to mark.
        return
    ax.axvline(x=x, color=color, linestyle='--', linewidth=1.5, label=label)
    ax.text(x, 0.85,
            f"{x:.0f}%", rotation=0,
            va='top', ha='right', color=color, fontsize=10)


def plot_icc_subplots(data, codebook, color_map, model_order, show_CI=False, grouped_summary_df=None):
    """
    Plot mean ICC against subsample percentage, one subplot per rubric
    category, and save the figure as a PNG.

    Parameters
    ----------
    data : pd.DataFrame
        Rows with columns 'Rubric', 'level_0' (rater label), 'Subsample',
        'mean', and, when `show_CI` is True, 'lower' and 'upper'
    codebook : str
        Codebook label; used for the threshold lookup and the output file name
    color_map : dict
        Line colour per rater label (must include "Human Scorers", the label
        used for "Trained Annotators")
    model_order : list
        Rater labels in drawing order; raters absent from `data` are skipped
    show_CI : bool, optional
        Shade the band between 'lower' and 'upper' around each line
    grouped_summary_df : pd.DataFrame, optional
        Indexed by (rubric, codebook) with columns 'Min Subsample (10%)' and
        'Min Subsample (5%)'; when given, dashed vertical lines mark those
        values. Missing entries and NaN values are skipped.
    """
    rubric_order = ["Reasoning", "Hiring Priorities"]
    present = set(data['Rubric'].unique())
    rubrics = [r for r in rubric_order if r in present]
    n_rubrics = len(rubrics)

    fig, axes = plt.subplots(1, n_rubrics, figsize=(6*n_rubrics, 6), sharey=True)

    if n_rubrics == 1:  # make sure axes is iterable
        axes = [axes]

    for ax, rubric in zip(axes, rubrics):
        subset = data[data['Rubric'] == rubric]
        raters_present = set(subset['level_0'].values)

        for annotator in model_order:
            if annotator not in raters_present:
                continue
            sub = subset[subset['level_0'] == annotator]

            # Human raters are shown under a different label than in the data
            label = "Human Scorers" if annotator == "Trained Annotators" else annotator

            ax.plot(sub['Subsample'], sub['mean'],
                    marker='o', label=label,
                    color=color_map[label])
            if show_CI:
                ax.fill_between(sub['Subsample'],
                                sub['lower'], sub['upper'],
                                alpha=0.2, color=color_map[label])

        # --- Add vertical line for mean "Min Subsample (%)" ---
        if grouped_summary_df is not None:
            print('rubric', rubric, 'codebook', codebook)
            try:
                min_sub_10_mean = grouped_summary_df.loc[(rubric, codebook), 'Min Subsample (10%)']
                min_sub_5_mean = grouped_summary_df.loc[(rubric, codebook), 'Min Subsample (5%)']
            except KeyError:
                # No entry for this (rubric, codebook)
                pass
            else:
                print('min_sub 10: ', min_sub_10_mean)
                print('min_sub 5: ', min_sub_5_mean)
                _mark_threshold(ax, min_sub_10_mean, 'grey', '10% threshold')
                _mark_threshold(ax, min_sub_5_mean, 'black', '5% threshold')

        ax.set_title(f"{rubric}")
        ax.set_xlabel('% of Responses')
        ax.set_ylabel('ICC')
        ax.grid(True)

    # Legend & title (legend entries are taken from the first subplot only)
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, title='Model',
               loc='center left', bbox_to_anchor=(0.9, 0.5))

    # fig.suptitle(f'ICC vs Subsample Size ({codebook})', fontsize=16)
    fig.tight_layout(rect=[0, 0, 0.85, 0.95])  # leave space for legend and title

    plt.savefig(f"./output/consistency/icc/subsample_{codebook}_interviews-4-50-2.png",
                dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Individual plots: one figure per codebook type, with confidence bands and
# the threshold lines looked up in grouped_summary_df.
for codebook in ['Zero-Shot', 'Few-Shot', 'CoT']:
    subset = agg_results_wide[(agg_results_wide['Codebook'] == codebook)]
    plot_icc_subplots(subset, codebook, color_map, model_order, show_CI=True, grouped_summary_df=grouped_summary_df)



In [None]:
# Collapse the codebook dimension: for every (subsample %, rater, category)
# take the average of the mean ICCs and the widest interval seen across
# codebooks (smallest lower bound, largest upper bound).
per_rater_and_rubric = agg_results_wide.groupby(['Subsample', 'level_0', 'Rubric'])
agg_combined = per_rater_and_rubric.agg(
    mean=('mean', 'mean'),
    lower=('lower', 'min'),
    upper=('upper', 'max'),
).reset_index()

# One figure for the pooled data; "Combined" selects the pooled threshold rows
plot_icc_subplots(
    agg_combined,
    "Combined",
    color_map,
    model_order,
    show_CI=False,
    grouped_summary_df=grouped_summary_df,
)

