In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
gt_df = pd.read_csv("../data/input_data/labeled_full_100_data.csv")

In [3]:
import json
import pandas as pd

def read_jsonl_robust(file_path):
    """Read JSONL with error reporting"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file, 1):
            line = line.strip()
            if line:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error line {line_num}: {e}")
    return data
    

def expand_jsonl_to_error_codes(df):
    """
    Expand JSONL DataFrame to flattened structure with error codes.
    Only keeps subdomains where yes=True and expands error_codes.
    
    Returns DataFrame with columns:
    Index, Domain, Subdomain, Error Code, Rationale, confidence, error_summary, error_highlights, confidence
    """
    expanded_rows = []
    
    for _, row in df.iterrows():
        # Extract basic info
        index_val = row['index']
        domain = row['domain']
        error_summary = row['error_summary']
        error_highlights = row['error_highlights']
        
        # Process subdomains - check if subdomains exists and is not None
        if 'subdomains' in row and row['subdomains'] is not None:
            for subdomain_data in row['subdomains']:
                # Only process subdomains where yes=True
                if subdomain_data.get('yes', False):
                    subdomain_name = subdomain_data.get('subdomain', '')
                    subdomain_rationale = subdomain_data.get('rationale', '')
                    subdomain_confidence = subdomain_data.get('confidence', 0.0)
                    
                    # Process error codes within this subdomain
                    if 'error_codes' in subdomain_data:
                        for error_code_data in subdomain_data['error_codes']:
                            # Only keep error codes where yes=True
                            if error_code_data.get('yes', False):
                                expanded_rows.append({
                                    'Index': index_val,
                                    'Domain': domain,
                                    'Subdomain': subdomain_name,
                                    'Error Code': error_code_data.get('error_code', ''),
                                    'Rationale': error_code_data.get('rationale', ''),
                                    'confidence': error_code_data.get('confidence', 0.0),
                                    'error_summary': error_summary,
                                    'error_highlights': error_highlights,
                                    'subdomain_confidence': subdomain_confidence
                                })
                    else:
                        # If no error_codes but subdomain yes=True, create a row without error code
                        expanded_rows.append({
                            'Index': index_val,
                            'Domain': domain,
                            'Subdomain': subdomain_name,
                            'Error Code': "not labeled by LLM",
                            'Rationale': subdomain_rationale,
                            'confidence': subdomain_confidence,
                            'error_summary': error_summary,
                            'error_highlights': error_highlights,
                            'subdomain_confidence': subdomain_confidence
                        })
    
    return pd.DataFrame(expanded_rows)

In [4]:
two_stage_path = "../DSPy_results_batch_previously_labeled_100_dedup_with_prev_msg_w_ref_colbert_gpt-5/labeler_results.jsonl"
two_stage_df_pivoted= pd.DataFrame(read_jsonl_robust(two_stage_path))
two_stage_df = expand_jsonl_to_error_codes(two_stage_df_pivoted)

In [5]:
one_stage_baseline_path= "../../output_gpt-5__baseline_noopt/results/results_gpt5_baseline.jsonl"
def load_jsonl(path):
    records = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records
one_stage_baseline_df = pd.DataFrame(load_jsonl(one_stage_baseline_path)).drop_duplicates(subset =("index"))

In [6]:
one_stage_op_path= "../../output_gpt-5/results/results_gpt5_mipro.jsonl"
def load_jsonl(path):
    records = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records
one_stage_op_df = pd.DataFrame(load_jsonl(one_stage_op_path))

# binarize gt

In [7]:
label_cols = [c for c in gt_df.columns if c.startswith("label_")]
gt_long = (
    gt_df
    .melt(
        id_vars=["index"],
        value_vars=label_cols,
        var_name="code_key",
        value_name="gt_label"
    )
)
gt_long["gt_label"] = gt_long["gt_label"].astype(int)
gt_binary_unfixed = (
    gt_long
    .pivot(index="index", columns="code_key", values="gt_label")
    .fillna(0)
    .astype(int)
)
gt_binary = gt_binary_unfixed.copy()
gt_binary.columns = gt_binary.columns.str.replace("^label_", "", regex=True)




# binarize two stage

In [8]:

def _slug(text: str) -> str:
    """
    File/identifier safe key for an error code.
    - Preserves underscores inside each hierarchy segment.
    - Uses "__" only to join Domain / Subdomain / Error Code boundaries.
    """
    segments = [seg.strip().lower() for seg in re.split(r"-+", text) if seg.strip()]
    slugged_segments = []
    for seg in segments:
        cleaned = re.sub(r"[^\w]+", "_", seg)
        cleaned = re.sub(r"_+", "_", cleaned).strip("_")
        if cleaned:
            slugged_segments.append(cleaned)
    return "__".join(slugged_segments)
def make_code_key(domain, subdomain, error_code):
    return _slug(f"{domain}-{subdomain}-{error_code}")


In [9]:
two_stage_df["code_key"] = two_stage_df.apply(
    lambda r: make_code_key(r["Domain"], r["Subdomain"], r["Error Code"]),
    axis=1
)
two_stage_pos = two_stage_df[["Index", "code_key"]].drop_duplicates()
two_stage_pos["pred"] = 1
two_stage_binary = (
    two_stage_pos
    .pivot(index="Index", columns="code_key", values="pred")
    .reindex(index=gt_binary.index, columns=gt_binary.columns)
    .fillna(0)
    .astype(int)
)



# binarzie one stage baseline

In [10]:
def expand_one_stage(df):
    rows = []
    for _, r in df.iterrows():
        idx = r["index"]
        for e in r.get("detected_errors", []):
            rows.append({
                "index": idx,
                "code_key": e["code_key"],
                "pred": 1
            })
    return pd.DataFrame(rows)


In [11]:
one_stage_pos = expand_one_stage(one_stage_baseline_df)
one_stage_binary = (
    one_stage_pos
    .pivot(index="index", columns="code_key", values="pred")
    .reindex(index=gt_binary.index, columns=gt_binary.columns)
    .fillna(0)
    .astype(int)
)


# binarize one stage optimizer

In [12]:
one_stage_op_pos = expand_one_stage(one_stage_op_df)
one_stage_op_binary = (
    one_stage_op_pos
    .pivot(index="index", columns="code_key", values="pred")
    .reindex(index=gt_binary.index, columns=gt_binary.columns)
    .fillna(0)
    .astype(int)
)


In [15]:
# gt_binary                 # (N_cases × N_codes)  ground truth
# one_stage_binary          # baseline one-stage
# one_stage_op_binary       # optimized one-stage (mipro)
# two_stage_binary          # two-stage


In [16]:
# “Which system aligns best with GT at the most granular level?”

In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score

def eval_binary(gt, pred):
    return {
        "micro_precision": precision_score(gt.values, pred.values, average="micro", zero_division=0),
        "micro_recall": recall_score(gt.values, pred.values, average="micro", zero_division=0),
        "micro_f1": f1_score(gt.values, pred.values, average="micro", zero_division=0),

        "macro_precision": precision_score(gt.values, pred.values, average="macro", zero_division=0),
        "macro_recall": recall_score(gt.values, pred.values, average="macro", zero_division=0),
        "macro_f1": f1_score(gt.values, pred.values, average="macro", zero_division=0),
    }


In [18]:
results = pd.DataFrame.from_dict({
    "one_stage": eval_binary(gt_binary, one_stage_binary),
    "one_stage_optimized": eval_binary(gt_binary, one_stage_op_binary),
    "two_stage": eval_binary(gt_binary, two_stage_binary),
}, orient="index")


In [19]:
results

Unnamed: 0,micro_precision,micro_recall,micro_f1,macro_precision,macro_recall,macro_f1
one_stage,0.068871,0.490196,0.120773,0.06994,0.21124,0.089379
one_stage_optimized,0.069074,0.431373,0.11908,0.062542,0.181059,0.082123
two_stage,0.082908,0.637255,0.146727,0.048631,0.263544,0.076957


In [20]:
def absolute_recall(gt, pred):
    return (gt & pred).sum().sum() / gt.sum().sum()


In [21]:
recall_table = pd.Series({
    "one_stage": absolute_recall(gt_binary, one_stage_binary),
    "one_stage_optimized": absolute_recall(gt_binary, one_stage_op_binary),
    "two_stage": absolute_recall(gt_binary, two_stage_binary),
})


In [30]:
recall_table

one_stage              0.490196
one_stage_optimized    0.431373
two_stage              0.637255
dtype: float64

# hierarhcy form code_key

In [31]:
def parse_hierarchy(code_key):
    parts = code_key.split("__")
    domain = parts[0]
    subdomain = "__".join(parts[:2]) if len(parts) >= 2 else parts[0]
    return domain, subdomain


In [32]:
hierarchy = pd.DataFrame(
    [parse_hierarchy(c) for c in gt_binary.columns],
    index=gt_binary.columns,
    columns=["domain", "subdomain"]
)


In [33]:
def aggregate_binary_by_level(binary_df, level):
    """
    level ∈ {"domain", "subdomain"}
    """
    agg = {}
    for name, codes in hierarchy.groupby(level).groups.items():
        agg[name] = (binary_df[codes].sum(axis=1) > 0).astype(int)
    return pd.DataFrame(agg)


In [34]:
gt_domain = aggregate_binary_by_level(gt_binary, "domain")

one_domain = aggregate_binary_by_level(one_stage_binary, "domain")
op_domain = aggregate_binary_by_level(one_stage_op_binary, "domain")
two_domain = aggregate_binary_by_level(two_stage_binary, "domain")


In [36]:
gt_subdomain = aggregate_binary_by_level(gt_binary, "subdomain")

one_sub = aggregate_binary_by_level(one_stage_binary, "subdomain")
op_sub = aggregate_binary_by_level(one_stage_op_binary, "subdomain")
two_sub = aggregate_binary_by_level(two_stage_binary, "subdomain")


In [37]:
domain_results = pd.DataFrame.from_dict({
    "one_stage": eval_binary(gt_domain, one_domain),
    "one_stage_optimized": eval_binary(gt_domain, op_domain),
    "two_stage": eval_binary(gt_domain, two_domain),
}, orient="index")

subdomain_results = pd.DataFrame.from_dict({
    "one_stage": eval_binary(gt_subdomain, one_sub),
    "one_stage_optimized": eval_binary(gt_subdomain, op_sub),
    "two_stage": eval_binary(gt_subdomain, two_sub),
}, orient="index")


In [38]:
domain_results

Unnamed: 0,micro_precision,micro_recall,micro_f1,macro_precision,macro_recall,macro_f1
one_stage,0.382353,0.890411,0.534979,0.190214,0.530093,0.269121
one_stage_optimized,0.396226,0.863014,0.543103,0.200177,0.513889,0.280338
two_stage,0.357513,0.945205,0.518797,0.197253,0.588889,0.280503


In [39]:
subdomain_results

Unnamed: 0,micro_precision,micro_recall,micro_f1,macro_precision,macro_recall,macro_f1
one_stage,0.130189,0.734043,0.221154,0.073276,0.3455,0.106347
one_stage_optimized,0.132911,0.670213,0.221831,0.066022,0.325174,0.105282
two_stage,0.148936,0.819149,0.252046,0.076433,0.361905,0.122395


In [40]:
def case_level_recall(gt_level, pred_level):
    gt_has = gt_level.sum(axis=1) > 0
    caught = (gt_level & pred_level).sum(axis=1) > 0
    return (caught & gt_has).mean()


In [41]:
domain_case_recall = pd.DataFrame({
    "one_stage": [
        case_level_recall(gt_domain[[d]], one_domain[[d]])
        for d in gt_domain.columns
    ],
    "one_stage_optimized": [
        case_level_recall(gt_domain[[d]], op_domain[[d]])
        for d in gt_domain.columns
    ],
    "two_stage": [
        case_level_recall(gt_domain[[d]], two_domain[[d]])
        for d in gt_domain.columns
    ],
}, index=gt_domain.columns)


In [42]:
subdomain_case_recall = pd.DataFrame({
    "one_stage": [
        case_level_recall(gt_subdomain[[s]], one_sub[[s]])
        for s in gt_subdomain.columns
    ],
    "one_stage_optimized": [
        case_level_recall(gt_subdomain[[s]], op_sub[[s]])
        for s in gt_subdomain.columns
    ],
    "two_stage": [
        case_level_recall(gt_subdomain[[s]], two_sub[[s]])
        for s in gt_subdomain.columns
    ],
}, index=gt_subdomain.columns)


In [43]:
domain_case_recall

Unnamed: 0,one_stage,one_stage_optimized,two_stage
accessibility,0.0,0.0,0.0
bias_stigmatization,0.02,0.02,0.02
clinical_reasoning,0.52,0.51,0.51
communication_quality_readability,0.11,0.1,0.16
privacy_security,0.0,0.0,0.0


In [44]:
subdomain_case_recall

Unnamed: 0,one_stage,one_stage_optimized,two_stage
accessibility__language_accommodation,0.0,0.0,0.0
bias_stigmatization__condition,0.0,0.0,0.0
bias_stigmatization__cultural,0.0,0.0,0.0
bias_stigmatization__identity_respect,0.02,0.02,0.02
bias_stigmatization__sociodemographic,0.0,0.0,0.0
bias_stigmatization__victim_blaming,0.0,0.0,0.0
clinical_reasoning__assessment,0.05,0.05,0.05
clinical_reasoning__clinical_recommendations,0.13,0.1,0.13
clinical_reasoning__comprehension_of_medical_guidelines_and_standard_of_care,0.04,0.04,0.02
clinical_reasoning__comprehension_of_patient_context,0.05,0.02,0.05
