In [58]:
import os

import pandas as pd

In [2]:
# "Index" values for this labeling batch, hardcoded from SPM's notebook.
# The list holds 100 values laid out as two ascending runs of 50; later cells
# use it to filter the label sheets and to drive the training-record loop.
first_n_indices = [2432,
 19667,
 21392,
 22585,
 22789,
 25822,
 27428,
 38938,
 39901,
 44402,
 46039,
 49823,
 53391,
 60920,
 63940,
 78318,
 79480,
 82643,
 94483,
 100157,
 101519,
 106367,
 106820,
 110718,
 111440,
 117036,
 124222,
 134264,
 140824,
 146126,
 149506,
 160988,
 173987,
 190304,
 208055,
 217691,
 223678,
 231119,
 239336,
 241520,
 241620,
 248166,
 265165,
 265647,
 283211,
 285732,
 290398,
 295864,
 305128,
 314021,
 5598,
 11928,
 14858,
 29023,
 30451,
 39933,
 41322,
 49714,
 55744,
 65791,
 72081,
 83153,
 85971,
 89405,
 99713,
 104179,
 108523,
 110120,
 113473,
 118469,
 120351,
 124776,
 135202,
 139937,
 140373,
 145813,
 147664,
 159295,
 160258,
 166720,
 171849,
 176142,
 180195,
 184776,
 188565,
 190057,
 197077,
 205221,
 207720,
 214001,
 231948,
 233511,
 238215,
 240196,
 248247,
 251860,
 258084,
 262055,
 279586,
 281063]

In [59]:
# Root of the shared "Prospective Labeling V3" folder. Set the
# PROSPECTIVE_LABELING_ROOT environment variable to run on another machine;
# when it is unset the original author's local Box path is used, so the three
# paths below resolve to exactly the same strings as before.
prospective_labeling_root = os.environ.get(
    "PROSPECTIVE_LABELING_ROOT",
    "/Users/sandychen/Library/CloudStorage/Box-Box/Prospective Labeling V3",
).rstrip("/")

# Physician validation workbook, LLM label workbook, and deduplicated codebook.
physician_validation_path = f"{prospective_labeling_root}/Data/Stephen Labels.xlsx"
llm_labels_path = f"{prospective_labeling_root}/Data/LLM Labels.xlsx"
codebook_path = f"{prospective_labeling_root}/Pipeline_v2_data/dedup_codebook.csv"

In [4]:
# Load the physician-label workbook, the LLM-label workbook, and four columns
# of the deduplicated codebook.
codebook_columns = ["Dedup Error Code", "Dedup Subdomain", "Dedup Domain", "Dedup Definition"]

Physician_label_all = pd.read_excel(physician_validation_path)
LLM_label_all = pd.read_excel(llm_labels_path)
codebook_df = pd.read_csv(codebook_path)[codebook_columns]

In [5]:
# Keep only the rows of each label table whose "Index" is in first_n_indices.
llm_in_batch = LLM_label_all["Index"].isin(first_n_indices)
physician_in_batch = Physician_label_all["Index"].isin(first_n_indices)

LLM_label = LLM_label_all[llm_in_batch]
Physician_label = Physician_label_all[physician_in_batch]

# Build training sets for the DSPy prompt optimizer

In [6]:
import sys
import pandas as pd
import re

# Directory appended to sys.path so modules under it can be imported.
# Set the EMBEDDING_PILOT_SRC environment variable to point at another
# checkout; when it is unset the original author's local path is used.
repo_src = os.environ.get(
    "EMBEDDING_PILOT_SRC",
    "/Users/sandychen/Desktop/Healthrex_workspace/scripts/antibiotic-susceptibility/Aim 4/AIM4/Embedding_Pilot_Exp/src",
)
# Guarded so re-running the cell does not add the same entry twice.
if repo_src not in sys.path:
    sys.path.append(repo_src)


In [7]:

def _slug(text: str) -> str:
    """
    File/identifier safe key for an error code.
    - Preserves underscores inside each hierarchy segment.
    - Uses "__" only to join Domain / Subdomain / Error Code boundaries.
    """
    segments = [seg.strip().lower() for seg in re.split(r"-+", text) if seg.strip()]
    slugged_segments = []
    for seg in segments:
        cleaned = re.sub(r"[^\w]+", "_", seg)
        cleaned = re.sub(r"_+", "_", cleaned).strip("_")
        if cleaned:
            slugged_segments.append(cleaned)
    return "__".join(slugged_segments)
def _codebook_code_key(row) -> str:
    """Slug one codebook row's Domain, Subdomain, and Error Code joined with hyphens."""
    return _slug(f"{row['Dedup Domain']}-{row['Dedup Subdomain']}-{row['Dedup Error Code']}")


# Load the deduplicated codebook and attach one slug key per row.
codebook = pd.read_csv("../data/input_data/dedup_codebook.csv")
codebook["code_key"] = codebook.apply(_codebook_code_key, axis=1)

# Sorted list of the distinct code keys; later cells use it to name label columns.
distinct_code_keys = codebook["code_key"].unique()
all_code_keys = sorted(distinct_code_keys)

In [8]:
# Both result folders share a name prefix and contain a file with the same
# name; the enhanced folder adds the "_w_ref" suffix.
_results_dir_prefix = "../DSPy_results_batch_previously_labeled_100_dedup_with_prev_msg"
_identifier_input_name = "identifier_results_input.jsonl"

baseline_input_data_path = f"{_results_dir_prefix}/{_identifier_input_name}"
enhanced_input_data_path = f"{_results_dir_prefix}_w_ref/{_identifier_input_name}"

In [9]:
import json
def _read_jsonl(path):
    """
    Read a JSON Lines file into a DataFrame.

    Each non-blank line is parsed with json.loads and becomes one row.
    """
    # Explicit UTF-8: the records contain non-ASCII text, and the platform
    # default encoding is not guaranteed to decode it.
    with open(path, encoding="utf-8") as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])


# One frame per input set, loaded through the same helper.
baseline_input_data_df = _read_jsonl(baseline_input_data_path)
enhanced_input_data_df = _read_jsonl(enhanced_input_data_path)


In [10]:
import pandas as pd
import re
def make_lookup(df, keep="first", name="df"):
    """
    Return `df` keyed by its "index" column with duplicate index values removed.

    Parameters
    ----------
    df : DataFrame with an "index" column.
    keep : passed to drop_duplicates; which duplicate row to retain
        ("first", "last", or False to drop all duplicates).
    name : label used in the duplicate warning message.

    Returns
    -------
    DataFrame sorted by "index", one row per index value, with "index" set
    as the frame's index.

    A warning listing the duplicated index values is printed when any exist.
    """
    dupes = df.loc[df["index"].duplicated(), "index"].unique()
    if len(dupes) > 0:
        print(f"[warn] {name} has duplicate index values: {dupes}")
    return (
        # A stable sort keeps duplicate rows in their original relative order,
        # so keep="first"/"last" refers to the first/last occurrence in `df`.
        # The default quicksort gives no such guarantee.
        df.sort_values("index", kind="mergesort")
          .drop_duplicates(subset="index", keep=keep)
          .set_index("index")
    )

# Build index-keyed lookups for both input sets, retaining the first row for
# any duplicated index value.
baseline_lookup, enhanced_lookup = (
    make_lookup(frame, keep="first", name=label)
    for frame, label in (
        (baseline_input_data_df, "baseline"),
        (enhanced_input_data_df, "enhanced"),
    )
)

In [11]:
# ---------- prepare labels (only valid rows) ----------
# A label row is valid only when all four of these fields are present.
required_label_fields = ["Source", "Domain", "Subdomain", "Error Code"]


def _label_code_key(row) -> str:
    """Slug one label row's Domain, Subdomain, and Error Code joined with hyphens."""
    return _slug(f"{row['Domain']}-{row['Subdomain']}-{row['Error Code']}")


# .copy() so adding the code_key column does not write into a slice of Physician_label.
valid_labels = Physician_label.dropna(subset=required_label_fields).copy()
valid_labels["code_key"] = valid_labels.apply(_label_code_key, axis=1)

In [32]:
# Same lookup construction as the earlier cell, repeated here ahead of
# select_input_row, which reads both lookups as globals.
lookup_options = dict(keep="first")
baseline_lookup = make_lookup(baseline_input_data_df, name="baseline", **lookup_options)
enhanced_lookup = make_lookup(enhanced_input_data_df, name="enhanced", **lookup_options)

def select_input_row(idx: int, labels_df):
    """
    Pick the input row for `idx` from the enhanced or the baseline lookup.

    The enhanced lookup is used when any non-null "Source" among the rows of
    `labels_df` with "Index" == idx is "Enhanced" or "Human"; otherwise the
    baseline lookup is used. Returns the matching row, or None when the chosen
    lookup has no entry for `idx` (there is no fallback to the other lookup).

    Reads the module-level `enhanced_lookup` and `baseline_lookup`.
    """
    sources_for_idx = labels_df.loc[labels_df["Index"] == idx, "Source"].dropna().unique()
    wants_enhanced = any(src in ("Enhanced", "Human") for src in sources_for_idx)
    lookup = enhanced_lookup if wants_enhanced else baseline_lookup
    return lookup.loc[idx] if idx in lookup.index else None

In [37]:
# ---------- build training records ----------
def _blank_if_missing(value):
    """Return "" when `value` is a missing scalar (None/NaN/NA); otherwise return it unchanged."""
    # is_scalar guards pd.isna, which returns an array for list-like values.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return value


known_code_keys = set(all_code_keys)
# code_key -> indices whose label uses a key that is not in the codebook.
unknown_code_keys = {}

train_records = []
for idx in sorted(first_n_indices):
    row = select_input_row(idx, valid_labels)
    if row is None:
        print(f"[warn] missing input row for index {idx}, skipping")
        continue

    record = {
        "index": idx,
        "patient_message": row.patient_message,
        "llm_response": row.llm_response,
        "patient_info": row.patient_info,
        "clinical_notes": row.clinical_notes,
        "previous_messages": row.previous_messages,
        # .get tolerates a row without a retrieved_pairs field; the previous
        # getattr(..., "") check passed in that case and the attribute access
        # that followed raised AttributeError.
        "retrieved_pairs": _blank_if_missing(row.get("retrieved_pairs", "")),
    }
    # init all codes to negatives/empty
    for ck in all_code_keys:
        record[f"label_{ck}"] = False
        record[f"rationale_{ck}"] = ""
        record[f"excerpt_{ck}"] = ""

    # mark positives for this index
    labs = valid_labels[valid_labels["Index"] == idx]
    for _, lab in labs.iterrows():
        ck = lab["code_key"]
        if ck not in known_code_keys:
            # Still recorded (this creates columns outside the codebook set),
            # but reported after the loop instead of passing silently.
            unknown_code_keys.setdefault(ck, []).append(idx)
        record[f"label_{ck}"] = True
        # NaN is truthy, so `value or ""` alone would leave NaN in place.
        record[f"rationale_{ck}"] = _blank_if_missing(lab.get("Rationale", "")) or ""
        # Add excerpt if you have one; else leave blank (optimizer will default to Not Applicable for negatives)
    train_records.append(record)

for ck, indices in sorted(unknown_code_keys.items()):
    print(f"[warn] code_key {ck!r} (indices {indices}) is not in the codebook; its label column is not among the codebook label columns")


In [38]:
# One row per index, with a label_/rationale_/excerpt_ column trio per code key.
train_df = pd.DataFrame(train_records)

# Label columns in all_code_keys order (the original note assumes 59 code keys).
label_cols = ["label_" + ck for ck in all_code_keys]


def _codes_flagged(flags):
    """List the code keys whose label flag is truthy in one row of the label columns."""
    return [ck for ck, v in zip(all_code_keys, flags.values) if v]


# True when at least one code is positive for the row.
train_df["any_error_present"] = train_df[label_cols].any(axis=1)

# The positive code keys for each row.
train_df["positive_codes"] = train_df[label_cols].apply(_codes_flagged, axis=1)

In [39]:
# Number of distinct indices that have at least one positive label.
train_df.loc[train_df["any_error_present"], "index"].nunique(dropna=False)

56

In [40]:
# Number of distinct indices that appear in the valid physician labels.
valid_labels["Index"].nunique(dropna=False)

57

## The two counts above should match. After digging, I found that one of the codes labelled by the human reviewer is not in the codebook.
### Manually fixed in the hard-fix cell below.


In [79]:

# Print every column containing "label_" whose remaining name is not one of
# the codebook-derived code keys.
label_like_columns = [col for col in train_df.columns if "label_" in col]
for col in label_like_columns:
    if col.replace("label_", "") not in all_code_keys:
        print(col)

label_clinical_reasoning__comprehension_of_patient_query__incorrect_clinical_information


In [None]:
# Check whether the "query" spelling of the code key exists in the codebook-derived keys.
"clinical_reasoning__comprehension_of_patient_query__incorrect_clinical_information" in all_code_keys

False

In [82]:
# Sum the stray "query" label column to see how many rows carry it.
stray_label_col = "label_clinical_reasoning__comprehension_of_patient_query__incorrect_clinical_information"
train_df[stray_label_col].sum()

1

In [71]:
# Check whether the "context" spelling of the code key exists in the codebook-derived keys.
context_code_key = "clinical_reasoning__comprehension_of_patient_context__incorrect_clinical_information"
context_code_key in all_code_keys

True

In [83]:
# --- Hard fix for index 140373 missing the ICC code_key mapping ---
# The physician label for this index slugged to the "..._patient_query_..." key,
# which is not in the codebook; the codebook key is "..._patient_context_...".
fix_index = 140373
override_ck = "clinical_reasoning__comprehension_of_patient_context__incorrect_clinical_information"
ck_col = f"label_{override_ck}"

# Ensure the column exists
if ck_col not in train_df.columns:
    raise ValueError(f"Expected column {ck_col} not found in train_df")

# Fail before mutating anything if the target row is absent or duplicated;
# otherwise the .loc assignments below would silently match nothing and the
# problem would only surface at the final .item() calls.
fix_rows = train_df["index"] == fix_index
if fix_rows.sum() != 1:
    raise ValueError(
        f"Expected exactly one row with index {fix_index}, found {int(fix_rows.sum())}"
    )

# Set the label (and rationale/optional excerpt) for this index
train_df.loc[fix_rows, ck_col] = True
train_df.loc[fix_rows, f"rationale_{override_ck}"] = (
    "The instruction missed that the maintenance dose was incorrect."
)
# optional excerpt if you have one:
# train_df.loc[train_df["index"] == 140373, f"excerpt_{override_ck}"] = "<verbatim span>"

# --- Drop columns containing the incorrect code_key (query instead of context) ---
wrong_code_key = "clinical_reasoning__comprehension_of_patient_query__incorrect_clinical_information"
cols_to_drop = [c for c in train_df.columns if wrong_code_key in c]
print(f"Dropping columns with incorrect code_key: {cols_to_drop}")
train_df = train_df.drop(columns=cols_to_drop)

# Recompute any_error_present and positive_codes
label_prefix = "label_"
label_cols = [c for c in train_df.columns if c.startswith(label_prefix)]
train_df["any_error_present"] = train_df[label_cols].any(axis=1)
train_df["positive_codes"] = train_df[label_cols].apply(
    # Slice off only the leading prefix; str.replace would also remove any
    # later "label_" occurrence inside a code key.
    lambda r: [c[len(label_prefix):] for c, v in zip(label_cols, r.values) if v], axis=1
)

print("Index 140373 any_error_present:",
      train_df.loc[train_df["index"] == fix_index, "any_error_present"].item())
print("Positive codes:",
      train_df.loc[train_df["index"] == fix_index, "positive_codes"].item())

Dropping columns with incorrect code_key: ['label_clinical_reasoning__comprehension_of_patient_query__incorrect_clinical_information', 'rationale_clinical_reasoning__comprehension_of_patient_query__incorrect_clinical_information']
Index 140373 any_error_present: True
Positive codes: ['clinical_reasoning__comprehension_of_patient_context__incorrect_clinical_information']


In [90]:
# Display the assembled training frame.
# NOTE(review): the rendered output includes patient message text and names — clear outputs before sharing.
train_df

Unnamed: 0,index,patient_message,llm_response,patient_info,clinical_notes,previous_messages,retrieved_pairs,label_accessibility__language_accommodation__lack_of_language_accommodation,rationale_accessibility__language_accommodation__lack_of_language_accommodation,excerpt_accessibility__language_accommodation__lack_of_language_accommodation,...,rationale_privacy_security__proxy_disclosure__inappropriate_proxy_disclosure,excerpt_privacy_security__proxy_disclosure__inappropriate_proxy_disclosure,label_privacy_security__proxy_disclosure__missing_proxy_access_instructions,rationale_privacy_security__proxy_disclosure__missing_proxy_access_instructions,excerpt_privacy_security__proxy_disclosure__missing_proxy_access_instructions,label_privacy_security__restricted_information_disclosure__disallowed_disclosure_restricted_info_shared_with_patient,rationale_privacy_security__restricted_information_disclosure__disallowed_disclosure_restricted_info_shared_with_patient,excerpt_privacy_security__restricted_information_disclosure__disallowed_disclosure_restricted_info_shared_with_patient,any_error_present,positive_codes
0,2432,I’d like to schedule a virtual visit with Dr C...,"Hi Micah, I'd be happy to help you with that. ...","Healthcare Provider Name: Chen, Eunice Shin De...",Clinical Note for Encounter on 11/20/2024 Tele...,I’d like to schedule a virtual visit with Dr C...,"[{""patient_message"": ""Awesome; thank you so mu...",False,,,...,,,False,,,False,,,True,[clinical_reasoning__workflow_recommendations_...
1,5598,Thank you Cherry. Yes the 22nd at 11 am is gre...,I don't have access to real-time data. How can...,,,,"[{""patient_message"": ""Hi Kasslen,That date/tim...",False,,,...,,,False,,,False,,,True,[clinical_reasoning__comprehension_of_patient_...
2,11928,"Dr. Powell, from my blood vials can the lab te...","Rafi, For the CBC and Uric Acid tests, you wil...","Healthcare Provider Name: Powell, Anthony Mich...",Clinical Note for Encounter on 5/22/2024 61 Y ...,"Dr. Powell, from my blood vials can the lab te...","[{""patient_message"": ""CBC was not done yesterd...",False,,,...,,,False,,,False,,,True,[clinical_reasoning__clinical_recommendations_...
3,14858,Marshall needs this faxed to them,"Carol, For any paperwork requests, please foll...","Healthcare Provider Name: Esparza, Ana Departm...",,,"[{""patient_message"": ""The fax number is 313576...",False,,,...,,,False,,,False,,,True,[clinical_reasoning__clinical_recommendations_...
4,19667,Seizure 1/6/2025 8:33 am & 4:16 pmNo pulling o...,Thank you for providing the details of your re...,"Healthcare Provider Name: Villanueva, Eleanor ...",Patient is a 44 year old female with left fron...,Seizure 1/6/2025 8:33 am & 4:16 pm No pulling ...,"[{""patient_message"": ""Hello Eleanor and Dr Nag...",False,,,...,,,False,,,False,,,True,[clinical_reasoning__comprehension_of_patient_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,285732,Sorry about that. Attached here.,"Hi Amanda, Thank you for the update. It's good...","Healthcare Provider Name: Jiang, Can Departmen...",Clinical Note for Encounter on 12/4/2024 STANF...,"Hi Dr. Jiang, FYI I just saw an allergist outs...","[{""patient_message"": ""This message is being se...",False,,,...,,,False,,,False,,,True,[clinical_reasoning__assessment__omitted_verif...
96,290398,"Hi Dr. Fresco,I have my annual check with you ...","Hi Sharon, Thank you for reaching out. I have ...","Healthcare Provider Name: Fresco, Linda Hannah...",Clinical Note for Encounter on 10/23/2023 Shar...,"Hi Dr. Fresco, I have my annual check with you...",,False,,,...,,,False,,,False,,,False,[]
97,295864,Hi there! The pharmacy told me unless my dosag...,"Hi Kalie, Thank you for reaching out. I unders...","Healthcare Provider Name: Fernandez-Becker, Ni...",Patient is a 28 year old female with history o...,Hi there! The pharmacy told me unless my dosag...,,False,,,...,,,False,,,False,,,False,[]
98,305128,Sounds good. We can take a look at the new tes...,"Hi, John- Thanks for letting me know. I will g...","Healthcare Provider Name: Suffoletto, Jo-Anne ...",Clinical Note for Encounter on 9/23/2024 Ambul...,"Hi, John- Thanks for your speedy response. I h...",,False,,,...,,,False,,,False,,,False,[]


In [85]:
# Write the full labeled frame to CSV without the row index.
full_labeled_csv = "../data/input_data/labeled_full_100_data.csv"
train_df.to_csv(full_labeled_csv, index=False)

In [84]:
# Number of columns whose name contains "label_".
label_column_mask = train_df.columns.str.contains("label_")
label_column_mask.sum()

59

In [51]:
# Sample positives and negatives (up to 20 each)
# NOTE(review): this cell's recorded execution count (51) precedes the hard-fix
# cell (83); re-run it after the fix so the subset reflects the corrected
# labels — confirm.
def _sample_up_to(group, cap):
    """Return (sample, n): n = min(cap, len(group)) rows drawn with random_state=42, or an empty frame when n is 0."""
    n = min(cap, len(group))
    if n > 0:
        return group.sample(n=n, random_state=42), n
    return pd.DataFrame(columns=train_df.columns), n


pos_df = train_df[train_df["any_error_present"]]
neg_df = train_df[~train_df["any_error_present"]]

# The sample size shrinks automatically when a group has fewer than 20 rows.
pos_sample, n_pos = _sample_up_to(pos_df, 20)
neg_sample, n_neg = _sample_up_to(neg_df, 20)

# Stack both samples, then shuffle the combined rows and renumber them.
stacked_samples = pd.concat([pos_sample, neg_sample], axis=0)
train_subset = stacked_samples.sample(frac=1, random_state=42).reset_index(drop=True)

out_path = "../data/input_data/labeled_modular_training_20pos_20neg.csv"
train_subset.to_csv(out_path, index=False)
print(f"Saved {len(train_subset)} rows to {out_path} (pos={n_pos}, neg={n_neg})")

Saved 40 rows to ../data/input_data/labeled_modular_training_20pos_20neg.csv (pos=20, neg=20)


In [52]:
# Relative path of the raw 100-message sample CSV.
raw_data_path = (
    "../data/input_data/"
    "prospective_sample_100_unified_previously_labeled_from_colbert_cleaned.csv"
)

In [53]:
# Load the raw sample CSV with pandas' default parsing options.
raw_data_df = pd.read_csv(filepath_or_buffer=raw_data_path)

In [57]:
# Display the raw sample frame.
# NOTE(review): the rendered output includes patient and staff names and message text — clear outputs before sharing.
raw_data_df

Unnamed: 0.1,Unnamed: 0,index,Thread ID,Date Sent,Subject,Patient Message,Message Sender,Actual Response Sent to Patient,Recipient Names,Recipient IDs,...,Thread ID_6,Patient Message_6,Message Sender_6,Message Department_6,Department Specialty Title_6,Actual Response Sent to Patient_6,cosine_similarity_6,Data,Last Note,Previous Messages
0,82307,106367,252594691,2024-12-27,RE:Non-urgent Medical Question,Agreed and understood. My question was more ar...,"TACMO, WENDY [ S0282892]",The brain MRI did not show any obvious or acut...,CCSB MEDICAL ONCOLOGY CLINICAL,POOL 1150000007,...,250487333,I have a question about Lab Unlisted 1 resulte...,"MYCHART, GENERIC [ T200162]",STANFORD PRIMARY CARE - PORTOLA VALLEY,Primary Care,"Hi Nicole,Dr. Angelo is out of the office and ...",18.890625,"Healthcare Provider Name: Pham, Alexander Darw...",Patient Name: William Louis Hayward Visit Date...,"Good Morning Mr. Hayward, I will try to contac..."
1,1521,111440,255766322,2025-01-15,RE: Scheduling Question,I can do Tuesdays Thursdays 8am. Thank you.,"CALESTAR, MEGAN [ S0180617]","Hi Winnie,Would you like to move your visit to...","CC THOR ONC MED CLINICAL,","POOL 10419,",...,249164804,"Yes, thank you I can come in at 9:40 on Thursd...","MYCHART, GENERIC [ T200162]",PRIMARY CARE STANFORD MEDICINE PARTNERS PLEASA...,Primary Care,"Hi Susan,I scheduled you for Thursday at 9:40a...",22.750000,"Healthcare Provider Name: Das, Millie Snigdha ...",Patient is a 55-year-old female with history o...,"Hi, I cannot do 1/28 9:45am anymore in person...."
2,72634,258084,253020612,2024-12-11,Ordered Test Question,Please send the imagers of the chest to Dr Hak...,"MYCHART, GENERIC [ T200162]","Hello Ms. Brown-Quintero,For images, our clini...",CC SARCOMA CLINICAL,POOL 10428,...,251916767,it was done this morning at Kaweah Imaging in ...,"LAFORGA, JACQUELINE [ S0201538]",GYN ONCOLOGY,Oncology,A request has been sent to this email.Thank you!,14.531250,"Healthcare Provider Name: Ganjoo, Kristen Noos...",,
3,24384,44402,254850612,2025-01-05,Non-urgent Medical Question,"Hi Jeremiah,Utkarsh here, we met last week and...","MYCHART, GENERIC [ T200162]","Hi Utkarsh,Thank you for reaching out. I'm tem...",LCHC MYHEALTH POOL,POOL 11193,...,248373790,"Hi Dr.Shah,How are you. I got blood test done ...","DELGADO, NICOLE [ S0367163]",FAMILY MEDICINE SAMARITAN LOS GATOS,Family Medicine,"Hi Korney, Dr Shah, Rina Biren is not in the o...",15.906250,"Healthcare Provider Name: Moon, Jeremiah Hyuk ...",Clinical Note for Encounter on 1/2/2025 Utkars...,"Hi Jeremiah, Utkarsh here, we met last week an..."
4,52838,166720,253812916,2024-12-19,Non-urgent Medical Question,"Hi all,Hope you are doing well!Two quick quest...","MONTEZ, ANDREA [ S0285483]","Hi Amanda,I believe Dr. Berube has reviewed yo...",CC HEME CLINICAL,POOL 10422,...,251735513,Hi Dr. EkekezieI hope you’re doing well. Thank...,"LOPEZ, STEPHANIE [ S0367952]",DIGESTIVE HEALTH CENTER - OPC,Gastroenterology,"Hi Jessica,I hope you're doing well too! For t...",20.406250,"Healthcare Provider Name: Berube, Caroline Dep...",Patient is a 31-year-old Ashkenazi Jewish fema...,"Hi all, Hope you are doing well! Two quick que..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,162577,173987,249867916,2024-12-19,RE:Scheduling Question,Good morning PetaCould you please inform the i...,"JENKINS, PATRICIA [ S0322590]","Good morning Patty,I'm so sorry to hear this n...","CCSB ITC SCHEDULING COORDINATOR CRM,","POOL 11051,",...,248026764,"Good morning Mrs.Jackson,Thank you for your me...","TUQUERO, KIRKPATRICK [ S0358789]",HEAD/NECK ONCOLOGY BW,Oncology,"Dear Ms. Jackson , You are receiving this mess...",19.500000,"Healthcare Provider Name: Morris, Alison Joy D...",Edwin Morrison is an 80 year old male with a h...,"Peta, Thank you for scheduling Ed.'s appointme..."
96,51014,159295,253876392,2024-12-20,Test Results Question,I received my MRI results of my hip. I’d like ...,"ORTEGA, STACEY [ S0099441]",Hi CaroleThey are referred to the finding on t...,WCC GYN ONC CLINICAL,POOL 1153450987,...,254648090,"Alan,Happy New Year. This is in regard to my h...","MYCHART, GENERIC [ T200162]",INTERNAL MEDICINE MMC MENLO PARK CRANE,Internal Medicine,"Hi Theodore,Thank you for your response. I rev...",21.359375,"Healthcare Provider Name: Dorigo, Oliver Depar...",Patient is a 69-year-old female with a history...,I received my MRI results of my hip. I’d like ...
97,3830,223678,255645110,2025-01-14,RE:Visit Follow-up Question,Thank you for the recommendations. I am defini...,"FRANCIS, RAGNI [ S0065581]",The COVID home tests are good so I would just ...,"HUIE, DANIEL",S0027954,...,252998191,"Thank you for the recommendations, Dr. Mein. T...","PAK, JENNIFER [ S0322792]",GI MMC MENLO PARK MID,Gastroenterology,"Hi Sofia,Unfortunately, I am triple over booke...",18.953125,"Healthcare Provider Name: Huie, Daniel Allen D...",Clinical Note for Encounter on 11/8/2023 HISTO...,Thank you for keeping me updated on your sympt...
98,80329,290398,252668581,2024-12-09,Ordered Test Question,"Hi Dr. Fresco,I have my annual check with you ...","MYCHART, GENERIC [ T200162]","Hi Sharon,Thank you for your response and chec...",MMC INT MED MYHEALTH CLINIC MESSAGING,POOL 2100000438,...,249295871,"Hi Dr. Suffoletto, I am scheduled to see you f...","MYCHART, GENERIC [ T200162]",STANFORD INTERNAL MEDICINE WEST,Primary Care,"Hello, Meg-Congratulations! Thanks for reachin...",20.937500,"Healthcare Provider Name: Fresco, Linda Hannah...",Clinical Note for Encounter on 10/23/2023 Shar...,"Hi Dr. Fresco, I have my annual check with you..."
