In [2]:
import pandas as pd
from scipy.stats import kendalltau, spearmanr, pearsonr
from IPython.display import display

# Where the expert annotation CSVs live, and where the metric result CSVs live.
DATA_DIR = "/home/mob999/CRIMSON/data/radiology-report-expert-evaluation-rexval-dataset-1.0.0"
RESULTS_DIR = "/home/mob999/CRIMSON/data/rexval_results/crimsonv0_c13fc28/"

# --- 1. Load and aggregate expert error counts ---
errors = pd.read_csv(DATA_DIR + "/6_valid_raters_per_rater_error_categories.csv")

# Stage 1: add up `num_errors` over every row that shares the same
# (study_number, candidate_type, rater_index), giving one total per rater.
errors_per_rater = errors.groupby(
    ["study_number", "candidate_type", "rater_index"], as_index=False
)["num_errors"].sum()

# Stage 2: take the mean of those per-rater totals, leaving a single
# `avg_errors` value for each (study_number, candidate_type) pair.
avg_errors = (
    errors_per_rater.groupby(["study_number", "candidate_type"])["num_errors"]
    .mean()
    .rename("avg_errors")
    .reset_index()
)

# --- 2. Map candidate_type → results file ---
# Only the candidate types listed here are processed by the loops below; the
# commented-out entries are currently disabled.
file_map = dict(
    radgraph="results_radgraph_gt.csv",
    # bertscore="results_radgraph_gt_bertscore.csv",
    # bleu="results_radgraph_gt_bleu.csv",
    # s_emb="results_radgraph_gt_s_emb.csv",
)

# Metric column names to look for in each results file. Columns that are
# missing from a given file are skipped; the order here is the row order of
# the correlation tables.
metrics = [
    "radgraph_complete",
    "bleu",
    "bertscore",
    "green",
    "rougeL",
    "chexbert-5_micro avg_f1-score",
    "ratescore",
    "radcliq-v1",
    "crimson_score",
]

# --- 3. Compute correlations per candidate type ---
# For each candidate type: load its metric results, join them to the
# rater-averaged error counts on study number, and correlate every available
# metric with `avg_errors` (Kendall, Spearman, Pearson).
for ctype, fname in file_map.items():
    results = pd.read_csv(f"{RESULTS_DIR}/{fname}")

    # Keep only rows whose id parses as a number (presumably this drops
    # summary/aggregate rows — verify against the results files). The parsed
    # values are reused for the integer cast so that ids stored as text such
    # as "12.0" convert instead of raising in `astype(int)`.
    numeric_id = pd.to_numeric(results["id"], errors="coerce")
    results = results[numeric_id.notna()].copy()
    results["id"] = numeric_id.dropna().astype(int)

    cols = ["id"] + [m for m in metrics if m in results.columns]
    results = results[cols].rename(columns={"id": "study_number"})

    # Inner join: studies missing from either side are dropped.
    merged = avg_errors[avg_errors["candidate_type"] == ctype].merge(results, on="study_number")

    rows = []
    for metric in metrics:
        if metric not in merged.columns:
            continue
        # Drop rows where either value is NaN
        valid = merged[["avg_errors", metric]].dropna()
        x, y = valid["avg_errors"], valid[metric]
        if len(valid) >= 2:
            kt, kp = kendalltau(x, y)
            sr, sp = spearmanr(x, y)
            pr, pp = pearsonr(x, y)
        else:
            # Correlation is undefined for fewer than two pairs (pearsonr
            # raises in that case); report NaN and keep the row so `n` shows why.
            kt = kp = sr = sp = pr = pp = float("nan")
        rows.append({
            "metric": metric,
            "n": len(valid),
            "kendall_tau": round(kt, 4),
            "kendall_p": round(kp, 4),
            "spearman_r": round(sr, 4),
            "spearman_p": round(sp, 4),
            "pearson_r": round(pr, 4),
            "pearson_p": round(pp, 4),
        })

    print(f"\n=== {ctype} ===")
    # Explicit columns keep `set_index("metric")` valid even when no metric
    # column was found and `rows` is empty.
    display(
        pd.DataFrame(
            rows,
            columns=[
                "metric", "n",
                "kendall_tau", "kendall_p",
                "spearman_r", "spearman_p",
                "pearson_r", "pearson_p",
            ],
        ).set_index("metric")
    )


=== radgraph ===


Unnamed: 0_level_0,n,kendall_tau,kendall_p,spearman_r,spearman_p,pearson_r,pearson_p
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
radgraph_complete,50,-0.5704,0.0,-0.7477,0.0,-0.6283,0.0
bleu,50,-0.0917,0.4068,-0.1039,0.4726,-0.1902,0.1858
bertscore,50,-0.417,0.0,-0.5824,0.0,-0.5267,0.0001
green,50,-0.5647,0.0,-0.7441,0.0,-0.6599,0.0
rougeL,50,-0.5079,0.0,-0.672,0.0,-0.6146,0.0
chexbert-5_micro avg_f1-score,50,0.2962,0.0098,0.4025,0.0038,0.3888,0.0053
ratescore,50,-0.5596,0.0,-0.7389,0.0,-0.6604,0.0
radcliq-v1,50,0.1121,0.2653,0.2255,0.1154,0.0313,0.8289
crimson_score,50,-0.5715,0.0,-0.7238,0.0,-0.6612,0.0


In [9]:
# --- Clinically significant errors only ---
# Same two-stage aggregation as for all errors, restricted to rows whose
# `clinically_significant` value equals True.
errors_sig = errors[errors["clinically_significant"].eq(True)]

# Total flagged errors for each (study_number, candidate_type, rater_index).
errors_sig_per_rater = errors_sig.groupby(
    ["study_number", "candidate_type", "rater_index"], as_index=False
)["num_errors"].sum()

# Mean of the per-rater totals for each (study_number, candidate_type),
# stored under the column name `avg_errors`.
avg_errors_sig = (
    errors_sig_per_rater.groupby(["study_number", "candidate_type"])["num_errors"]
    .mean()
    .rename("avg_errors")
    .reset_index()
)

# For each candidate type: load its metric results, join them to the
# rater-averaged clinically significant error counts on study number, and
# correlate every available metric with `avg_errors` (Kendall, Spearman, Pearson).
for ctype, fname in file_map.items():
    results = pd.read_csv(f"{RESULTS_DIR}/{fname}")

    # Keep only rows whose id parses as a number (presumably this drops
    # summary/aggregate rows — verify against the results files). The parsed
    # values are reused for the integer cast so that ids stored as text such
    # as "12.0" convert instead of raising in `astype(int)`.
    numeric_id = pd.to_numeric(results["id"], errors="coerce")
    results = results[numeric_id.notna()].copy()
    results["id"] = numeric_id.dropna().astype(int)

    cols = ["id"] + [m for m in metrics if m in results.columns]
    results = results[cols].rename(columns={"id": "study_number"})

    # Inner join: studies missing from either side are dropped.
    merged = avg_errors_sig[avg_errors_sig["candidate_type"] == ctype].merge(results, on="study_number")

    rows = []
    for metric in metrics:
        if metric not in merged.columns:
            continue
        # Drop rows where either value is NaN
        valid = merged[["avg_errors", metric]].dropna()
        x, y = valid["avg_errors"], valid[metric]
        if len(valid) >= 2:
            kt, kp = kendalltau(x, y)
            sr, sp = spearmanr(x, y)
            pr, pp = pearsonr(x, y)
        else:
            # Correlation is undefined for fewer than two pairs (pearsonr
            # raises in that case); report NaN and keep the row so `n` shows why.
            kt = kp = sr = sp = pr = pp = float("nan")
        rows.append({
            "metric": metric,
            "n": len(valid),
            "kendall_tau": round(kt, 4),
            "kendall_p": round(kp, 4),
            "spearman_r": round(sr, 4),
            "spearman_p": round(sp, 4),
            "pearson_r": round(pr, 4),
            "pearson_p": round(pp, 4),
        })

    print(f"\n=== {ctype} (clinically significant only) ===")
    # Explicit columns keep `set_index("metric")` valid even when no metric
    # column was found and `rows` is empty.
    display(
        pd.DataFrame(
            rows,
            columns=[
                "metric", "n",
                "kendall_tau", "kendall_p",
                "spearman_r", "spearman_p",
                "pearson_r", "pearson_p",
            ],
        ).set_index("metric")
    )


=== radgraph (clinically significant only) ===


Unnamed: 0_level_0,n,kendall_tau,kendall_p,spearman_r,spearman_p,pearson_r,pearson_p
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
radgraph_complete,50,-0.5934,0.0,-0.7636,0.0,-0.6017,0.0
bleu,50,-0.1287,0.2481,-0.1553,0.2816,-0.2269,0.1131
bertscore,50,-0.4627,0.0,-0.6411,0.0,-0.5406,0.0001
green,50,-0.6167,0.0,-0.7787,0.0,-0.646,0.0
rougeL,50,-0.5402,0.0,-0.7138,0.0,-0.5975,0.0
chexbert-5_micro avg_f1-score,50,0.2877,0.0128,0.377,0.007,0.331,0.0189
ratescore,50,-0.5708,0.0,-0.7352,0.0,-0.6227,0.0
radcliq-v1,50,0.123,0.2255,0.2514,0.0783,0.0568,0.6953
crimson_score,50,-0.5501,0.0,-0.6977,0.0,-0.6049,0.0
