In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Folder holding the labelled CSV exports. The default is the folder this
# notebook was originally run against; set the LABEL_DATA_DIR environment
# variable to point the notebook at a copy of the data on another machine.
DATA_DIR = Path(os.environ.get(
    "LABEL_DATA_DIR",
    "/Users/wenyuanchen/Library/CloudStorage/Box-Box/Data/Labled_Data_to_Validate",
))

# Only the first rows of the LLM sheet are used.
# NOTE(review): the reason for the 143-row cut-off is not recorded in this notebook — confirm.
N_LLM_ROWS = 143

LLM_label = pd.read_csv(DATA_DIR / "LLM Labels(Sheet1).csv").head(N_LLM_ROWS)
Physician_label = pd.read_csv(DATA_DIR / "Stephen Labels(Sheet1).csv")

# Keep only the physician rows tagged Source == "Human" ...
Physician_label_human = Physician_label[Physician_label["Source"] == "Human"]
# ... and only for cases that also appear in the truncated LLM sheet.
Physician_label_human = Physician_label_human[Physician_label_human["Index"].isin(LLM_label["Index"])]




In [3]:
# Stack the LLM rows on top of the matching physician rows (fresh 0..n-1 row
# labels), then order the result by case Index.
full_lables = (
    pd.concat([LLM_label, Physician_label_human], ignore_index=True)
    .sort_values(by="Index")
)

In [4]:
# Number of distinct case indices in the combined frame.
full_lables["Index"].nunique()

50

In [5]:
import pandas as pd

def ensure_sources(df, index_col='Index', source_col='Source'):
    """Guarantee one row per (case, source) for Human, Baseline and Enhanced.

    For every distinct value in ``index_col``, any of the three sources that has
    no row for that case gets a placeholder row whose columns are all NaN except
    ``index_col`` and ``source_col``. Rows belonging to any other source are
    dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table containing at least ``index_col`` and ``source_col``.
    index_col : str
        Column identifying the case.
    source_col : str
        Column identifying who produced the label.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. When placeholder rows are added
        the row labels are reset to 0..n-1, otherwise the input's are kept.
    """
    required_sources = ("Human", "Baseline", "Enhanced")

    placeholder_rows = []
    for idx in df[index_col].unique():
        present = set(df.loc[df[index_col] == idx, source_col])
        for src in required_sources:
            if src in present:
                continue
            # All-NaN row carrying only the case id and the missing source.
            row = dict.fromkeys(df.columns, np.nan)
            row[index_col] = idx
            row[source_col] = src
            placeholder_rows.append(row)

    if placeholder_rows:
        df = pd.concat([df, pd.DataFrame(placeholder_rows)], ignore_index=True)

    # Filter on the caller-supplied column name, not the literal "Source", so a
    # non-default ``source_col`` works.
    return df[df[source_col].isin(required_sources)]




In [6]:
# Pad the combined frame so every case has Human, Baseline and Enhanced rows.
full_lables_ensured = ensure_sources(full_lables)

In [7]:
# Pairwise views: the Human rows together with one model's rows each.
human_baseline = full_lables_ensured[full_lables_ensured["Source"].isin(["Human", "Baseline"])]
human_enhanced = full_lables_ensured[full_lables_ensured["Source"].isin(["Human", "Enhanced"])]

In [8]:
# Display the Human + Enhanced subset for a visual check.
human_enhanced

Unnamed: 0,Index,Source,Domain,Subdomain,Error Code,Rationale,Free Text Comments
1,2432,Human,Clinical Reasoning,Workflow Recommendations,Violation of Standard Workflows,Clinic appears to schedule directly via messag...,
3,5598,Enhanced,Clinical Reasoning,Comprehension of Patient Query,Off-topic or Irrelevant Reply to Patient Query,The response gives unrelated information about...,
5,11928,Enhanced,Communication Quality & Readability,Clarity,Ambiguous or Conflicting Instructions,The instructions are contradictory since they ...,
6,11928,Enhanced,Clinical Reasoning,Clinical Recommendations,Inappropriate or Omitted Diagnostic Test Recom...,The reply incorrectly advises a new blood draw...,
7,11928,Human,Clinical Reasoning,Comprehension of Patient Context,Incorrect Clinical Information,LLM did not realize that the CBC had already b...,
...,...,...,...,...,...,...,...
253,119042,Human,,,,,
254,120351,Human,,,,,
255,120351,Enhanced,,,,,
256,124222,Human,,,,,


# Analysis begins

# Basic Stats

In [9]:
def basic_stats(df):
    """Print and return per-source error statistics.

    A case counts as "flagged" for a source when at least one of that source's
    rows for the case has a non-null ``Domain``. Rates are taken over all
    distinct ``Index`` values in ``df``. The Domain / Subdomain / Error Code
    breakdowns count rows, not cases.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Index``, ``Source``, ``Domain``,
        ``Subdomain`` and ``Error Code``.

    Returns
    -------
    dict
        ``{source: {"n_error_cases", "error_rate", "domain_counts",
        "subdomain_counts", "errorcode_counts"}}`` for Human, Baseline and
        Enhanced. ``error_rate`` is 0.0 when ``df`` has no cases.
    """
    sources = ["Human", "Baseline", "Enhanced"]
    # (column, heading printed above the breakdown, key in the returned dict)
    breakdowns = (
        ("Domain", "Error Domain", "domain_counts"),
        ("Subdomain", "Error Subdomain", "subdomain_counts"),
        ("Error Code", "Error Code", "errorcode_counts"),
    )

    stats = {}
    total_cases = len(df["Index"].unique())
    print("===== Basic Statistics =====\n")
    print(f"Total cases (unique indices): {total_cases}\n")

    for source in sources:
        source_rows = df[df["Source"] == source]
        # Distinct cases having at least one row with a non-null Domain.
        n_error_cases = source_rows.loc[source_rows["Domain"].notnull(), "Index"].nunique()
        # Guard the empty-frame case instead of dividing by zero.
        error_rate = n_error_cases / total_cases if total_cases else 0.0

        print(f"{source}:")
        print(f"  Cases flagged as error: {n_error_cases} ({error_rate:.1%})")

        entry = {"n_error_cases": n_error_cases, "error_rate": error_rate}
        for column, heading, key in breakdowns:
            counts = source_rows[column].value_counts(dropna=True)
            print(f"  {heading} breakdown:")
            for label, n_rows in counts.items():
                print(f"    {label}: {n_rows}")
            entry[key] = counts.to_dict()
        print("")

        stats[source] = entry
    return stats

# Concordance analysis

In [10]:
def concordance_analysis(
    df, source, breakdown=False, index_level_breakdown=False, error_level_concordance=False
):
    """Measure agreement between the Human labels and one model's labels.

    For every case (distinct ``Index``) each Human row is compared with each
    row of ``source``. A case is concordant at a level (Domain, Subdomain,
    Error Code) when at least one such pair shares the same non-null value at
    that level, and concordant on "NaN Domain" when at least one pair has a
    null Domain on both sides.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Index``, ``Source``, ``Domain``, ``Subdomain`` and
        ``Error Code``.
    source : str
        Value of ``Source`` to compare against "Human".
    breakdown : bool
        Also report, per label value, how many row pairs matched.
    index_level_breakdown : bool
        Also report, per label value, how many distinct cases matched.
    error_level_concordance : bool
        Also report how many row pairs match on all three levels at once.

    Returns
    -------
    dict
        Always ``total``, ``nan_domain``, ``domain``, ``subdomain`` and
        ``error_code``; the optional ``*_index_breakdown``, ``*_breakdown`` and
        ``error_level_count`` entries follow the flags above.
    """
    from collections import Counter, defaultdict

    # Levels are tested in this order for every row pair.
    check_order = ("Error Code", "Subdomain", "Domain")
    # Levels are reported in this order.
    report_order = ("Domain", "Subdomain", "Error Code")
    result_key = {"Domain": "domain", "Subdomain": "subdomain", "Error Code": "error_code"}

    labels = df[["Index", "Source", "Domain", "Subdomain", "Error Code"]]
    case_ids = labels["Index"].unique()

    # Number of cases concordant at each level.
    cases_agreeing = {"nan_domain": 0, "domain": 0, "subdomain": 0, "error_code": 0}
    # Per level: label value -> number of matching row pairs.
    pair_matches = {level: Counter() for level in check_order}
    # Per level: label value -> set of cases with at least one matching pair.
    cases_per_value = {level: defaultdict(set) for level in check_order}
    # Row pairs that agree on all three levels simultaneously.
    full_matches = 0

    for case_id in case_ids:
        case_rows = labels[labels["Index"] == case_id]
        human_rows = case_rows[case_rows["Source"] == "Human"]
        model_rows = case_rows[case_rows["Source"] == source]

        agreed_here = set()
        for _, human in human_rows.iterrows():
            for _, model in model_rows.iterrows():
                if pd.isna(human["Domain"]) and pd.isna(model["Domain"]):
                    agreed_here.add("nan_domain")

                levels_matched = 0
                for level in check_order:
                    value = human[level]
                    if pd.notna(value) and value == model[level]:
                        levels_matched += 1
                        agreed_here.add(result_key[level])
                        if breakdown:
                            pair_matches[level][value] += 1
                        if index_level_breakdown:
                            cases_per_value[level][value].add(case_id)

                # A non-null human value equal to the model value implies the
                # model value is non-null too, so three matches == full match.
                if levels_matched == len(check_order):
                    full_matches += 1

        for key in agreed_here:
            cases_agreeing[key] += 1

    print(f"Total indices: {len(case_ids)}")
    print(f"Concordant (NaN Domain): {cases_agreeing['nan_domain']}")
    print(f"Concordant (Domain): {cases_agreeing['domain']}")
    print(f"Concordant (Subdomain): {cases_agreeing['subdomain']}")
    print(f"Concordant (Error Code): {cases_agreeing['error_code']}")

    results = {"total": len(case_ids), **cases_agreeing}

    if index_level_breakdown:
        case_counts = {
            level: {value: len(cases) for value, cases in cases_per_value[level].items()}
            for level in report_order
        }
        for level in report_order:
            print(f"\nBreakdown by {level} (unique indices):")
            for value, n_cases in case_counts[level].items():
                print(f"  {value}: {n_cases}")
        for level in report_order:
            results[f"{result_key[level]}_index_breakdown"] = case_counts[level]

    if breakdown:
        for level in report_order:
            print(f"\nBreakdown by {level} (pairwise matches):")
            for value, n_pairs in pair_matches[level].items():
                print(f"  {value}: {n_pairs}")
        for level in report_order:
            results[f"{result_key[level]}_breakdown"] = dict(pair_matches[level])

    if error_level_concordance:
        print(f"\nConcordant at full error level (all three match): {full_matches}")
        results["error_level_count"] = full_matches

    return results

# Example usage:
# results = concordance_analysis(df, source="Baseline", breakdown=True, index_level_breakdown=True, error_level_concordance=True)


In [11]:
# One frame per rater, then the number of rows with no Domain
# (i.e. no error recorded) for Human, Baseline and Enhanced respectively.
only_human = full_lables_ensured.query('Source == "Human"')
only_baseline = full_lables_ensured.query('Source == "Baseline"')
only_enhanced = full_lables_ensured.query('Source == "Enhanced"')
tuple(frame["Domain"].isnull().sum() for frame in (only_human, only_baseline, only_enhanced))


(np.int64(30), np.int64(30), np.int64(33))

In [12]:
# NOTE(review): this cell repeats the last expression of the previous cell verbatim.
# Rows with a null Domain for Human, Baseline and Enhanced respectively.
only_human["Domain"].isnull().sum(), only_baseline["Domain"].isnull().sum(), only_enhanced["Domain"].isnull().sum()


(np.int64(30), np.int64(30), np.int64(33))

# 1. Human-labeled as No Error, Baseline/Enhanced labeled as Error

In [13]:
def human_noerror_model_error(df, model_source="Baseline"):
    """List the cases the model flagged but the Human did not.

    A source "flags" a case when at least one of its rows for that case has a
    non-null ``Domain``. Cases are returned in order of first appearance in
    ``df["Index"]``.
    """
    def flagged_by(case_id, who):
        # True when `who` has any row with a Domain for this case.
        domains = df.loc[(df["Index"] == case_id) & (df["Source"] == who), "Domain"]
        return domains.notnull().any()

    return [
        case_id
        for case_id in df["Index"].unique()
        if not flagged_by(case_id, "Human") and flagged_by(case_id, model_source)
    ]


# 2. Model-labeled as No Error, Human labeled as Error

In [14]:
def model_noerror_human_error(df, model_source="Baseline"):
    """List the cases the Human flagged but the model did not.

    The model "did not flag" a case when none of its rows for that case has a
    ``Domain`` (this includes having no rows at all); the Human flagged it when
    at least one Human row has one. Cases are returned in order of first
    appearance in ``df["Index"]``.
    """
    is_human = df["Source"] == "Human"
    is_model = df["Source"] == model_source

    missed = []
    for case_id in df["Index"].unique():
        in_case = df["Index"] == case_id
        model_domains = df.loc[in_case & is_model, "Domain"]
        if model_domains.notnull().any():
            continue  # the model reported something for this case
        human_domains = df.loc[in_case & is_human, "Domain"]
        if human_domains.notnull().any():
            missed.append(case_id)
    return missed


# 3. When the Human labels an error and the model also labels an error, what is the average number of errors the model flags?
## (For cases where the Human labeled at least one error AND there is at least one concordant error, count the number of model error rows)

In [15]:
def avg_model_errors_when_human_error(df, model_source="Baseline"):
    """Average number of model error rows over cases with a fully concordant error.

    A case qualifies when some Human row and some ``model_source`` row agree on
    non-null ``Domain``, ``Subdomain`` and ``Error Code``. For qualifying cases
    the model's rows with a non-null ``Domain`` are counted, and the mean of
    those counts is returned. Returns 0 when no case qualifies.
    """
    levels = ("Domain", "Subdomain", "Error Code")

    def same_error(human, model):
        # Both sides non-null and equal at every level.
        return all(
            pd.notna(human[col]) and pd.notna(model[col]) and human[col] == model[col]
            for col in levels
        )

    model_error_totals = []
    for case_id in df["Index"].unique():
        in_case = df["Index"] == case_id
        human_rows = df[in_case & (df["Source"] == "Human")]
        model_rows = df[in_case & (df["Source"] == model_source)]

        if not human_rows["Domain"].notnull().any():
            continue  # Human recorded no error for this case

        concordant = any(
            same_error(human, model)
            for _, human in human_rows.iterrows()
            for _, model in model_rows.iterrows()
        )
        if concordant:
            model_error_totals.append(model_rows["Domain"].notnull().sum())

    if not model_error_totals:
        return 0
    return sum(model_error_totals) / len(model_error_totals)


# 4. Count how many cases have errors flagged by both, neither, only one source, etc.

In [16]:
# Count how many cases have errors flagged by both, neither, only one source, etc.
def case_level_confusion_matrix(df, model_source="Baseline"):
    """Count cases by whether the Human and a model flagged any error.

    A source flags a case when at least one of its rows for that case has a
    non-null ``Domain``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Index``, ``Source`` and ``Domain``.
    model_source : str
        Value of ``Source`` to compare with "Human". Defaults to "Baseline",
        which is what a single-argument call has always compared against.

    Returns
    -------
    dict
        Counts under the keys ``both_error``, ``only_human``, ``only_model``
        and ``neither``; they sum to the number of distinct cases.
    """
    results = {"both_error": 0, "only_human": 0, "only_model": 0, "neither": 0}
    is_human = df["Source"] == "Human"
    is_model = df["Source"] == model_source

    for idx in df["Index"].unique():
        in_case = df["Index"] == idx
        human_error = df.loc[in_case & is_human, "Domain"].notnull().any()
        model_error = df.loc[in_case & is_model, "Domain"].notnull().any()
        if human_error and model_error:
            results["both_error"] += 1
        elif human_error:
            results["only_human"] += 1
        elif model_error:
            results["only_model"] += 1
        else:
            results["neither"] += 1
    return results

# Error count statistics by source
def error_count_stats(df, source):
    """Summarise how many error rows ``source`` recorded per case.

    For every distinct ``Index`` the rows of ``source`` with a non-null
    ``Domain`` are counted (0 when the source has none); the mean, median, max
    and min of those per-case counts are returned in a dict.
    """
    from_source = df["Source"] == source
    per_case = [
        df.loc[(df["Index"] == case_id) & from_source, "Domain"].notnull().sum()
        for case_id in df["Index"].unique()
    ]
    summary = {}
    summary["mean"] = np.mean(per_case)
    summary["median"] = np.median(per_case)
    summary["max"] = np.max(per_case)
    summary["min"] = np.min(per_case)
    return summary


In [17]:
def _case_confusion_counts(df, model_source):
    """Tally cases by whether Human and ``model_source`` flagged any error (non-null Domain)."""
    tally = {"both_error": 0, "only_human": 0, "only_model": 0, "neither": 0}
    for idx in df["Index"].unique():
        case_rows = df[df["Index"] == idx]
        human_error = case_rows.loc[case_rows["Source"] == "Human", "Domain"].notnull().any()
        model_error = case_rows.loc[case_rows["Source"] == model_source, "Domain"].notnull().any()
        if human_error and model_error:
            tally["both_error"] += 1
        elif human_error:
            tally["only_human"] += 1
        elif model_error:
            tally["only_model"] += 1
        else:
            tally["neither"] += 1
    return tally


def _print_discrepancy_section(df, model_source):
    """Print the Human-vs-``model_source`` discrepancy/agreement section of the report."""
    print(f"===== Discrepancy/Agreement Analysis: {model_source} =====\n")

    model_only = human_noerror_model_error(df, model_source=model_source)
    print(f"Cases where Human labeled as no error but {model_source} labeled as error: {len(model_only)}")
    if model_only:
        print(f"  Indices: {model_only}")

    human_only = model_noerror_human_error(df, model_source=model_source)
    print(f"Cases where {model_source} labeled as no error but Human labeled as error: {len(human_only)}")
    if human_only:
        print(f"  Indices: {human_only}")

    avg_errors = avg_model_errors_when_human_error(df, model_source=model_source)
    print(
        f"Average # {model_source} errors encountered in cases where Human labeled an error "
        f"and there was concordance: {avg_errors:.2f}"
    )

    # Computed for the source named in the heading; the previous code reused
    # the Baseline comparison under the "Human vs Enhanced" heading.
    print(f"\nCase-level Confusion Matrix (Human vs {model_source}):")
    for k, v in _case_confusion_counts(df, model_source).items():
        print(f"  {k}: {v}")

    print(f"\n{model_source} Error Count Stats (per case):")
    for k, v in error_count_stats(df, source=model_source).items():
        print(f"  {k}: {v:.2f}")


def summary_report(df):
    """Print the full Human-vs-model comparison report for ``df``.

    Runs, in order: ``basic_stats``; ``concordance_analysis`` for Baseline and
    for Enhanced (all optional breakdowns enabled); then, for each of the two
    models, the discrepancy lists, the average model error count on concordant
    cases, the case-level confusion matrix and the per-case error count stats.
    Everything is printed; nothing is returned.
    """
    # Basic stats
    basic_stats(df)

    # Concordance analyses
    for model_source in ("Baseline", "Enhanced"):
        print(f"===== Concordance Analysis: {model_source} vs Human =====\n")
        concordance_analysis(
            df,
            source=model_source,
            breakdown=True,
            index_level_breakdown=True,
            error_level_concordance=True,
        )

    # Per-model discrepancy / agreement sections
    _print_discrepancy_section(df, "Baseline")
    print("\n-----\n")
    _print_discrepancy_section(df, "Enhanced")
    print("\n=========================================\n")


In [18]:

# Print the full report for the padded label table.
summary_report(full_lables_ensured)

===== Basic Statistics =====

Total cases (unique indices): 50

Human:
  Cases flagged as error: 20 (40.0%)
  Error Domain breakdown:
    Clinical Reasoning: 23
  Error Subdomain breakdown:
    Workflow Recommendations: 11
    Comprehension of Patient Query: 3
    Comprehension of Medical Guidelines and Standard of Care: 3
    Comprehension of Patient Context: 2
    Triage: 2
    Assessment: 1
    Traige: 1
  Error Code breakdown:
    Violation of Standard Workflows: 9
    Incorrect Clinical Guideline or Standard of Care: 3
    Incorrect Clinical Information: 2
    Incomplete Response to Patient Query: 2
    Missed Escalation of Care: 2
    Unverified Workflow Assumption: 2
    Omitted Verification of Incomplete Patient Information: 1
    Role-Based Scope Violation: 1
    Misinterpretation of Clinical Query: 1

Baseline:
  Cases flagged as error: 20 (40.0%)
  Error Domain breakdown:
    Clinical Reasoning: 35
    Communication Quality & Readability: 19
    Bias & Stigmatization: 4
    