In [1]:
import pandas as pd

In [2]:
# hardcoded from SPM's notebook
first_n_indices = [2432,
 19667,
 21392,
 22585,
 22789,
 25822,
 27428,
 38938,
 39901,
 44402,
 46039,
 49823,
 53391,
 60920,
 63940,
 78318,
 79480,
 82643,
 94483,
 100157,
 101519,
 106367,
 106820,
 110718,
 111440,
 117036,
 124222,
 134264,
 140824,
 146126,
 149506,
 160988,
 173987,
 190304,
 208055,
 217691,
 223678,
 231119,
 239336,
 241520,
 241620,
 248166,
 265165,
 265647,
 283211,
 285732,
 290398,
 295864,
 305128,
 314021,
 5598,
 11928,
 14858,
 29023,
 30451,
 39933,
 41322,
 49714,
 55744,
 65791,
 72081,
 83153,
 85971,
 89405,
 99713,
 104179,
 108523,
 110120,
 113473,
 118469,
 120351,
 124776,
 135202,
 139937,
 140373,
 145813,
 147664,
 159295,
 160258,
 166720,
 171849,
 176142,
 180195,
 184776,
 188565,
 190057,
 197077,
 205221,
 207720,
 214001,
 231948,
 233511,
 238215,
 240196,
 248247,
 251860,
 258084,
 262055,
 279586,
 281063]

In [3]:
physician_validation_path = "/Users/sandychen/Library/CloudStorage/Box-Box/Prospective Labeling V3/Data/Stephen Labels.xlsx"
llm_labels_path = "/Users/sandychen/Library/CloudStorage/Box-Box/Prospective Labeling V3/Data/LLM Labels.xlsx"
codebook_path = "/Users/sandychen/Library/CloudStorage/Box-Box/Prospective Labeling V3/Pipeline_v2_data/dedup_codebook.csv"

In [4]:
Physician_label_all= pd.read_excel(physician_validation_path)
LLM_label_all   = pd.read_excel(llm_labels_path)
codebook_df = pd.read_csv(codebook_path)[["Dedup Error Code", "Dedup Subdomain", "Dedup Domain", "Dedup Definition"]]

In [5]:
LLM_label       = LLM_label_all[LLM_label_all["Index"].isin(first_n_indices)]
Physician_label = Physician_label_all[Physician_label_all["Index"].isin(first_n_indices)]

# build training sets for DSPY prompt optimizer

In [6]:
import sys
import pandas as pd
import re

repo_src = "/Users/sandychen/Desktop/Healthrex_workspace/scripts/antibiotic-susceptibility/Aim 4/AIM4/Embedding_Pilot_Exp/src"
if repo_src not in sys.path:
    sys.path.append(repo_src)


In [7]:

def _slug(text: str) -> str:
    """
    File/identifier safe key for an error code.
    - Preserves underscores inside each hierarchy segment.
    - Uses "__" only to join Domain / Subdomain / Error Code boundaries.
    """
    segments = [seg.strip().lower() for seg in re.split(r"-+", text) if seg.strip()]
    slugged_segments = []
    for seg in segments:
        cleaned = re.sub(r"[^\w]+", "_", seg)
        cleaned = re.sub(r"_+", "_", cleaned).strip("_")
        if cleaned:
            slugged_segments.append(cleaned)
    return "__".join(slugged_segments)
codebook = pd.read_csv("../data/input_data/dedup_codebook.csv")
codebook["code_key"] = codebook.apply(
    lambda r: _slug(f"{r['Dedup Domain']}-{r['Dedup Subdomain']}-{r['Dedup Error Code']}"), axis=1
)
all_code_keys = sorted(codebook["code_key"].unique())

In [8]:
baseline_input_data_path = "../DSPy_results_batch_previously_labeled_100_dedup_with_prev_msg/identifier_results_input.jsonl"
enhanced_input_data_path = "../DSPy_results_batch_previously_labeled_100_dedup_with_prev_msg_w_ref/identifier_results_input.jsonl"

In [9]:
import json
with open(baseline_input_data_path) as f:
    records = [json.loads(line) for line in f if line.strip()]
baseline_input_data_df = pd.DataFrame(records)
with open(enhanced_input_data_path) as f:
    records = [json.loads(line) for line in f if line.strip()]
enhanced_input_data_df = pd.DataFrame(records)


In [10]:
import pandas as pd
import re
def make_lookup(df, keep="first", name="df"):
    dupes = df[df["index"].duplicated()]["index"].unique()
    if len(dupes) > 0:
        print(f"[warn] {name} has duplicate index values: {dupes}")
    return (
        df.sort_values("index")
          .drop_duplicates(subset="index", keep=keep)
          .set_index("index")
    )

baseline_lookup = make_lookup(baseline_input_data_df, keep="first", name="baseline")
enhanced_lookup = make_lookup(enhanced_input_data_df, keep="first", name="enhanced")

In [11]:
# ---------- prepare labels (only valid rows) ----------
valid_labels = Physician_label.dropna(subset=["Source", "Domain", "Subdomain", "Error Code"]).copy()
valid_labels["code_key"] = valid_labels.apply(
    lambda r: _slug(f"{r['Domain']}-{r['Subdomain']}-{r['Error Code']}"), axis=1
)

In [32]:
baseline_lookup = make_lookup(baseline_input_data_df, keep="first", name="baseline")
enhanced_lookup = make_lookup(enhanced_input_data_df, keep="first", name="enhanced")

def select_input_row(idx: int, labels_df):
    srcs = set(labels_df[labels_df["Index"] == idx]["Source"].dropna().unique())
    preferred = enhanced_lookup if ("Enhanced" in srcs or "Human" in srcs) else baseline_lookup
    if idx not in preferred.index:
        return None
    return preferred.loc[idx]

In [37]:
# ---------- build training records ----------
train_records = []
for idx in sorted(first_n_indices):
    row = select_input_row(idx, valid_labels)
    if row is None:
        print(f"[warn] missing input row for index {idx}, skipping")
        continue

    record = {
        "index": idx,
        "patient_message": row.patient_message,
        "llm_response": row.llm_response,
        "patient_info": row.patient_info,
        "clinical_notes": row.clinical_notes,
        "previous_messages": row.previous_messages,
        "retrieved_pairs": row.retrieved_pairs if pd.notna(getattr(row, "retrieved_pairs", "")) else "",
    }
    # init all codes to negatives/empty
    for ck in all_code_keys:
        record[f"label_{ck}"] = False
        record[f"rationale_{ck}"] = ""
        record[f"excerpt_{ck}"] = ""

    # mark positives for this index
    labs = valid_labels[valid_labels["Index"] == idx]
    for _, lab in labs.iterrows():
        ck = lab["code_key"]
        record[f"label_{ck}"] = True
        record[f"rationale_{ck}"] = lab.get("Rationale", "") or ""
        # Add excerpt if you have one; else leave blank (optimizer will default to Not Applicable for negatives)
    train_records.append(record)


In [38]:
train_df = pd.DataFrame(train_records)
# assume all_code_keys is the list of your 59 code_keys
label_cols = [f"label_{ck}" for ck in all_code_keys]

# Boolean flag: True if any code is positive, else False
train_df["any_error_present"] = train_df[label_cols].any(axis=1)

# Optionally, store the list of positive codes for that row
train_df["positive_codes"] = train_df[label_cols].apply(
    lambda r: [ck for ck, v in zip(all_code_keys, r.values) if v], axis=1
)

In [39]:
len(train_df[train_df["any_error_present"]]["index"].unique())

56

In [40]:
len(valid_labels["Index"].unique())

57

## the above should be the same, after digging, realized that one of the codes labelled by human is not in the codebook. 
### manually fixed


In [44]:
# --- Hard fix for index 140373 missing the ICC code_key mapping ---
override_ck = "clinical_reasoning__comprehension_of_patient_context__incorrect_clinical_information"
ck_col = f"label_{override_ck}"

# Ensure the column exists
if ck_col not in train_df.columns:
    raise ValueError(f"Expected column {ck_col} not found in train_df")

# Set the label (and rationale/optional excerpt) for this index
train_df.loc[train_df["index"] == 140373, ck_col] = True
train_df.loc[train_df["index"] == 140373, f"rationale_{override_ck}"] = (
    "The instruction missed that the maintenance dose was incorrect."
)
# optional excerpt if you have one:
# train_df.loc[train_df["index"] == 140373, f"excerpt_{override_ck}"] = "<verbatim span>"

# Recompute any_error_present and positive_codes
label_cols = [c for c in train_df.columns if c.startswith("label_")]
train_df["any_error_present"] = train_df[label_cols].any(axis=1)
train_df["positive_codes"] = train_df[label_cols].apply(
    lambda r: [c.replace("label_", "") for c, v in zip(label_cols, r.values) if v], axis=1
)

print("Index 140373 any_error_present:",
      train_df.loc[train_df["index"] == 140373, "any_error_present"].item())
print("Positive codes:",
      train_df.loc[train_df["index"] == 140373, "positive_codes"].item())

Index 140373 any_error_present: True
Positive codes: ['clinical_reasoning__comprehension_of_patient_context__incorrect_clinical_information', 'clinical_reasoning__comprehension_of_patient_query__incorrect_clinical_information']


In [49]:
train_df["any_error_present"].value_counts()

any_error_present
True     57
False    43
Name: count, dtype: int64

In [51]:
# Sample positives and negatives (up to 20 each)
pos_df = train_df[train_df["any_error_present"]]
neg_df = train_df[~train_df["any_error_present"]]

# Adjust n if you have fewer than 20 in either group
n_pos = min(20, len(pos_df))
n_neg = min(20, len(neg_df))

pos_sample = pos_df.sample(n=n_pos, random_state=42) if n_pos > 0 else pd.DataFrame(columns=train_df.columns)
neg_sample = neg_df.sample(n=n_neg, random_state=42) if n_neg > 0 else pd.DataFrame(columns=train_df.columns)

train_subset = pd.concat([pos_sample, neg_sample], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

out_path = "../data/input_data/labeled_modular_training_20pos_20neg.csv"
train_subset.to_csv(out_path, index=False)
print(f"Saved {len(train_subset)} rows to {out_path} (pos={n_pos}, neg={n_neg})")

Saved 40 rows to ../data/input_data/labeled_modular_training_20pos_20neg.csv (pos=20, neg=20)


In [52]:
raw_data_path = "../data/input_data/prospective_sample_100_unified_previously_labeled_from_colbert_cleaned.csv"

In [53]:
raw_data_df = pd.read_csv(raw_data_path)