# Import

In [6]:
import re
import ast
import time
import random
import requests
import pandas as pd
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import sent_tokenize

import sys
sys.path.append("../src")
import util.preprocessing_util as util

Download tokenizer

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/paul.schmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Setup

In [114]:
# Directories and constants
VLLM_URL       = "http://localhost:8000/v1/completions"
PROMPTS_DIR    = Path("prompts")
DATA_DIR       = Path("../data")
NOTES_DIR      = DATA_DIR / "synthetic" / "note-excerpts"
OUTPUT_DIR     = DATA_DIR / "synthetic" / "questions"
PROMPT_FILE    = PROMPTS_DIR / "generate_questions.txt"
NOTE_FILE_NAME = "few_shot_gpt4_separated_V2.csv"

# Ensure output dir exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [115]:
prompt_template = PROMPT_FILE.read_text()
note_file       = NOTES_DIR / NOTE_FILE_NAME
notes           = pd.read_csv(note_file)

In [116]:
print(f"Number of Notes: {len(notes)}")

Number of Notes: 1200


In [117]:
notes.head()

Unnamed: 0,note_excerpt
0,plan patient is a 67yearold female with past m...
1,diagnosis and assessment 64yearold female wit...
2,brief hospital course | the patient was admitt...
3,brief hospital course the patient was admitte...
4,procedures and operations right femoral-poplit...


Apply standard text cleaning

In [118]:
notes.iloc[0]["note_excerpt"]

'plan patient is a 67yearold female with past medical history significant for atrial fibrillation on coumadin presented with acute heart failure exacerbation| hospital course patient was admitted with complaints of increasing dyspnea orthopnea and lower extremity swelling over the past week| on admission patient was found to be in rapid afib with rates in the 150s and bnp significantly elevated| she was diuresed with iv lasix with significant improvement in symptoms and reduction in weight by approximately 3 kg| rate control was attempted with metoprolol but patient remained in rapid afib and was therefore cardioverted on hospital day 3| postcardioversion the patient was in sinus rhythm and her coumadin was resumed| during hospitalization an echo was performed revealing an ef of 30% with moderate mitral regurgitation| a heart failure consult was obtained and patient was started on an ace inhibitor carvedilol and spironolactone| discharge medication list includes coumadin dose to be man

**Clean notes**

In [119]:
def clean_note_text(text: str) -> str:
    text = text.strip()
    text = text.replace('\\n', '\n')               # un-escape newlines
    text = re.sub(r'\n{2,}', '\n', text)           # collapse multiple newlines
    text = re.sub(r'\*{2}(.*?)\*{2}', r'\1', text) # remove **bold**
    return text

In [120]:
notes['note_excerpt'] = notes['note_excerpt'].apply(clean_note_text)

**Tokenize on sentence level**

In [121]:
def split_sentences_by_line(note: str, delimiter="|") -> list[str]:
    lines = note.split(delimiter)
    return [l.strip() for l in lines if l.strip()]

In [122]:
notes["sentences"] = notes["note_excerpt"].apply(split_sentences_by_line)

In [123]:
test = notes.iloc[0]

In [124]:
test.sentences

['plan patient is a 67yearold female with past medical history significant for atrial fibrillation on coumadin presented with acute heart failure exacerbation',
 'hospital course patient was admitted with complaints of increasing dyspnea orthopnea and lower extremity swelling over the past week',
 'on admission patient was found to be in rapid afib with rates in the 150s and bnp significantly elevated',
 'she was diuresed with iv lasix with significant improvement in symptoms and reduction in weight by approximately 3 kg',
 'rate control was attempted with metoprolol but patient remained in rapid afib and was therefore cardioverted on hospital day 3',
 'postcardioversion the patient was in sinus rhythm and her coumadin was resumed',
 'during hospitalization an echo was performed revealing an ef of 30% with moderate mitral regurgitation',
 'a heart failure consult was obtained and patient was started on an ace inhibitor carvedilol and spironolactone',
 'discharge medication list include

# Build few‐shot examples

In [125]:
arch_csv = DATA_DIR / "dev" / "processed" / "medical_data.csv"
arch_data = pd.read_csv(arch_csv)

In [126]:
arch_data.head()

Unnamed: 0,case_id,patient_question,clinician_question,note_excerpt,sentences,sentence_text,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,Brief Hospital Course: During the ERCP a pancr...,"['Brief Hospital Course:', 'During the ERCP a ...",Brief Hospital Course:,"[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,2,dad given multiple shots of lasciks after he w...,Why was he given lasix and his oxygen flow rat...,Brief Hospital Course: Acute diastolic heart f...,"['Brief Hospital Course:', 'Acute diastolic he...",Brief Hospital Course:,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0]"
2,3,he is continously irritated and has headache w...,What is the expected course of recovery for him?,Discharge Instructions: You were admitted to t...,['Discharge Instructions: You were admitted to...,Discharge Instructions: You were admitted to t...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
3,4,My doctor performed a cardiac catherization.,Why was cardiac catheterization recommended to...,History of Present Illness: On the cardiology ...,"['History of Present Illness:', 'On the cardio...",History of Present Illness:,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"I overdosed October 4th on trihexyphenidyl, th...",Is the pain connected to the overdose or somet...,"Brief Hospital Course: # Bipolar d/o, PTSD, sc...","['Brief Hospital Course:', ""# Bipolar d/o, PTS...",Brief Hospital Course:,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ..."


Sentences and labels are still stringified lists --> back to actual Python lists

In [127]:
arch_data["sentences"] = arch_data["sentences"].apply(ast.literal_eval)
arch_data["labels"] = arch_data["labels"].apply(ast.literal_eval)

In [128]:
# select a few handpicked cases
case_ids = [1, 3, 14, 19]
num_examples = 4
few = (arch_data[arch_data.case_id.isin(case_ids)]
       .groupby('case_id')
       .apply(lambda df: pd.Series({
           'question': df.patient_question.iloc[0],
           'clinician_question': df.clinician_question.iloc[0],
           'sentences': df.sentences.iloc[0],
           'labels': df.labels.iloc[0]
       }))
       .reset_index()
       .head(num_examples))

  .apply(lambda df: pd.Series({


In [129]:
few

Unnamed: 0,case_id,question,clinician_question,sentences,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,"[Brief Hospital Course:, During the ERCP a pan...","[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,3,he is continously irritated and has headache w...,What is the expected course of recovery for him?,[Discharge Instructions: You were admitted to ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
2,14,What would you say about cancer in the stomach?,Was there any evidence for stomach cancer?,[Discharge Instructions: You were admitted to ...,"[1, 1, 0, 0, 0, 0, 0, 1, 1]"
3,19,I went to ER for a bladder infection. The doct...,Are her symptoms related to anxiety or cardiov...,"[Discharge Instructions:, Why was I admitted t...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


# Cleaning for Few-Shot Prompt

In [130]:
def clean_question(raw: str) -> str:
    """
    - Strip leading/trailing whitespace and stray newlines
    - Collapse any internal newlines to spaces
    - Capitalize the first character
    - Remove any trailing punctuation (., !, ?)
    - Add exactly one question mark at the end
    """
    # strip & collapse newlines
    s = raw.strip().replace("\n", " ")
    if not s:
        return s
    # normalize spaces
    s = re.sub(r"\s+", " ", s)
    # capitalize first char
    s = s[0].upper() + s[1:]
    # strip trailing punctuation
    s = re.sub(r"[?\.!]+$", "", s)
    # append single question mark
    return s + "?"

def clean_sentence(raw: str) -> str:
    return re.sub(r"\s+", " ", raw.strip())

In [131]:
few["question"]           = few["question"].apply(clean_question)
few["clinician_question"] = few["clinician_question"].apply(clean_question)
few["sentences"]          = few["sentences"].apply(lambda lst: [clean_sentence(s) for s in lst])

In [132]:
few.head()

Unnamed: 0,case_id,question,clinician_question,sentences,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,"[Brief Hospital Course:, During the ERCP a pan...","[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,3,He is continously irritated and has headache w...,What is the expected course of recovery for him?,[Discharge Instructions: You were admitted to ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
2,14,What would you say about cancer in the stomach?,Was there any evidence for stomach cancer?,[Discharge Instructions: You were admitted to ...,"[1, 1, 0, 0, 0, 0, 0, 1, 1]"
3,19,I went to ER for a bladder infection. The doct...,Are her symptoms related to anxiety or cardiov...,"[Discharge Instructions:, Why was I admitted t...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [133]:
for i, q in enumerate(few.question):
    print(f"{i+1}: {q}")

1: My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?
2: He is continously irritated and has headache when awake what do ido?
3: What would you say about cancer in the stomach?
4: I went to ER for a bladder infection. The doctor asked if I had irregular heartbeats or a-fib, which lead me getting very panicky?


In [134]:
for i, sentence in enumerate(few.sentences[3]):
    print(f"{i+1}: {sentence}")

1: Discharge Instructions:
2: Why was I admitted to the hospital?
3: You were admitted to the hospital because you were having chest pain.
4: We wanted to rule out a ___ cause.
5: What happened while I was here?
6: - We did a very thorough work up to make sure your heart isn't the cause of your chest pain.
7: This included EKGs, blood tests that look at whether the heart is under stress, and a CAT scan to make sure you didn't have a tear in any major blood vessels or a blood clot in your lungs.
8: These were all normal.
9: You did a stress test and another special heart study called a MIBI, which did not show any heart problems.
10: - We gave you Ativan and Tylenol, which improved your pain.
11: - We also started you on a medicine called sertraline which helps with anxiety, which might be the cause of your chest pain.
12: - We checked your thyroid function which was normal.
13: - We checked your cholesterol which was normal.
14: What should I do when I get home?
15: - Continue taking a

**Format few‐shot block**

In [135]:
def format_few_shot(df: pd.DataFrame) -> str:
    exs = []
    for _, row in df.iterrows():
        numbered = "\n".join(f"{i+1}. {s}" for i, s in enumerate(row.sentences))
        relevant = [i+1 for i, lbl in enumerate(row.labels) if lbl]
        exs.append(
            f"Note Excerpt:\n{numbered}\n\n"
            f"Patient Question: {row.question}\n"
            f"Clinician Question: {row.clinician_question}\n"
            f"Relevant Sentences: {relevant}"
        )
    return "\n\n---\n\n".join(exs)

few_shot_block = format_few_shot(few)

# Build Prompt

In [136]:
prompt_template = (PROMPTS_DIR/"generate_questions.txt").read_text()

In [137]:
prompt_template

'Here are examples on how to turn a clinical note excerpt into a patient‑style question, a clinician‑style question, and the sentences needed to answer it:\n\n{example_qas}\n\n---\n\nNow you are given a new clinical note excerpt from a patient’s electronic health record (EHR):\n\n---\n{note}\n---\n\nYour task is to:\n1. Write exactly one realistic patient‑style question someone might ask after reading this note.\n2. Rewrite that question in a formal, clinician‑friendly format.\n3. Identify which sentences in the note are directly relevant to answering the question. Use sentence numbers starting from 1 and only include the ones you need.\n\nOnly include sentences you would quote when directly answering the patient’s question—omit any that merely describe admission details, lab results, or context unless those facts are literally part of your answer.\nDon’t repeat the note itself in your answer—only output the three fields, exactly in this format:\n\nPatient Question: <your patient quest

# Generate

**Helper functions**

In [138]:
def is_valid_generation(text):
    t = text.lower()
    return ("patient question:" in t
            and "clinician question:" in t
            and "relevant sentences" in t
            and "[" in t and "]" in t)

def generate_from_vllm(prompt: str,
                       temperature: float = 0.7,
                       max_tokens: int = 1024,
                       retries: int = 3,
                       delay: int = 2) -> str:
    payload = {
        "prompt":     prompt,
        "temperature": temperature,
        "max_tokens":  max_tokens,
        "top_p":       0.9,
    }
    for attempt in range(retries):
        try:
            r = requests.post(VLLM_URL, json=payload)
            r.raise_for_status()            
            return r.json()["choices"][0]["text"].strip()        
        except Exception as e:
            print(f"[Warning] Attempt {attempt+1} failed: {e}")
            time.sleep(delay)
    return ""

In [139]:
def clean_output(raw: str) -> str:
    # Split at the first occurrence of “Patient Question:”
    sep = "Patient Question:"
    parts = raw.split(sep, 1)
    if len(parts) < 2:
        raise ValueError("No Patient Question: in output")
    return sep + " " + parts[1].strip()

**Quick sanity check**

In [140]:
notes.head()

Unnamed: 0,note_excerpt,sentences
0,plan patient is a 67yearold female with past m...,[plan patient is a 67yearold female with past ...
1,diagnosis and assessment 64yearold female wit...,[diagnosis and assessment 64yearold female wi...
2,brief hospital course | the patient was admitt...,"[brief hospital course, the patient was admitt..."
3,brief hospital course the patient was admitte...,[brief hospital course the patient was admitt...
4,procedures and operations right femoral-poplit...,[procedures and operations right femoral-popli...


In [144]:
# Generate questions for each note
outputs = []
for idx, row in tqdm(notes.iterrows(), total=len(notes), desc="Generating"):
    numbered = "\n".join(f"{i+1}. {s}" for i, s in enumerate(row.sentences))
    prompt = prompt_template.format(
        example_qas=few_shot_block,
        note=numbered
    )
    gen = generate_from_vllm(prompt)

    if is_valid_generation(gen):
        outputs.append({
            "id":           idx,
            "note_excerpt": row.note_excerpt,
            "sentences":    row.sentences,
            "output":       clean_output(gen)
        })
    else:
        print(f"[Warning] Invalid output for idx {idx}")
        print("Generation: " + gen)

Generating:   0%|          | 0/1200 [00:00<?, ?it/s]

Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 
Generation: 

# Quick sanity check

In [145]:
print(f"Number of gen output: {len(outputs)}")

Number of gen output: 1074


In [152]:
example = outputs[2]

In [153]:
for i, sentence in enumerate(example["sentences"]):
    print(f"{i+1}: {sentence}")

1: brief hospital course  the patient was admitted with severe dehydration secondary to persistent vomiting and diarrhea which began after consuming spoiled food
2: despite initial fluid resuscitation with normal saline and electrolyte replacement, the patient’s renal function continued to decline
3: creatinine peaked at 3.2 on hospital day 3
4: nephrology was consulted and recommended starting the patient on intravenous fluids with bicarbonate as well as adjusting the electrolyte replacement to address ongoing metabolic acidosis
5: repeated lab work showed improvement in renal function by hospital day 5 with creatinine trending down to 2.1
6: patient was monitored closely for signs of refeeding syndrome and hypophosphatemia was noted on day 4, for which IV phosphorus was administered
7: discharge instructions included avoidance of high potassium and phosphorus foods, and to maintain adequate hydration
8: patient to follow up with nephrology and primary care physician to reassess renal

In [154]:
print("Generated QA:\n\n", example["output"])

Generated QA:

 Patient Question: Why did I need IV fluids with bicarbonate?
Clinician Question: What was the rationale for initiating intravenous fluids with bicarbonate in this patient?
Relevant Sentences: [4]


In [155]:
print("Note excerpt:\n", example["note_excerpt"], "\n")
print("Generated QA:\n", example["output"])

Note excerpt:
 brief hospital course  the patient was admitted with severe dehydration secondary to persistent vomiting and diarrhea which began after consuming spoiled food | despite initial fluid resuscitation with normal saline and electrolyte replacement, the patient’s renal function continued to decline | creatinine peaked at 3.2 on hospital day 3 | nephrology was consulted and recommended starting the patient on intravenous fluids with bicarbonate as well as adjusting the electrolyte replacement to address ongoing metabolic acidosis | repeated lab work showed improvement in renal function by hospital day 5 with creatinine trending down to 2.1 | patient was monitored closely for signs of refeeding syndrome and hypophosphatemia was noted on day 4, for which IV phosphorus was administered | discharge instructions included avoidance of high potassium and phosphorus foods, and to maintain adequate hydration | patient to follow up with nephrology and primary care physician to reassess 

# Post Processing 

In [165]:
df_questions = pd.DataFrame(outputs)

**Rename the output column** 

In [166]:
df_questions = df_questions.rename(columns={'output':'question'})

**Remove Seperators from notes**

In [167]:
df_questions['note_excerpt'] = df_questions['note_excerpt'].str.replace(r'\|', '', regex=True)

In [168]:
df_questions.head()

Unnamed: 0,id,note_excerpt,sentences,question
0,0,plan patient is a 67yearold female with past m...,[plan patient is a 67yearold female with past ...,Patient Question: I feel like I was shocked ba...
1,2,brief hospital course the patient was admitte...,"[brief hospital course, the patient was admitt...",Patient Question: Is my heart surgery still on...
2,3,brief hospital course the patient was admitte...,[brief hospital course the patient was admitt...,Patient Question: Why did I need IV fluids wit...
3,4,procedures and operations right femoral-poplit...,[procedures and operations right femoral-popli...,Patient Question: I have a fever after the sur...
4,5,chief complaint worsening shortness of breath ...,[chief complaint worsening shortness of breath...,Patient Question: I have diabetes and they gav...


**Split question into: Patient Q, Clinician Q and Relevant Sentences**

In [169]:
df_questions[
    ['patient_question','clinician_question','relevant_sentences']
] = df_questions['question'].str.extract(
    r'Patient Question:\s*(.*?)\n'
    r'Clinician Question:\s*(.*?)\n'
    r'Relevant Sentences:\s*(.*)'
)

**Drop old question column**

In [170]:
df_questions = df_questions.drop(columns='question')

**Convert relevant_sentences from decimal to binary** 

In [171]:
# parse the literal list‐strings into actual lists
df_questions['relevant_sentences'] = (
    df_questions['relevant_sentences']
      .apply(ast.literal_eval)
)

In [172]:
df_questions['labels'] = df_questions.apply(
  lambda row: [
    1 if (i+1) in row['relevant_sentences'] else 0
    for i in range(len(row['sentences']))
  ],
  axis=1
)

**Drop old relevant sentences column**

In [173]:
df_questions = df_questions.drop(columns='relevant_sentences')

**Clean generated text columns**

In [174]:
text_columns = ["patient_question", "clinician_question", "note_excerpt"]
list_columns = ["sentences"]

In [175]:
df_questions = util.clean_text_df(df_questions, text_columns = text_columns, list_columns = list_columns)

**Reorder columns to improve readability**

In [176]:
df_questions = df_questions[[
    "patient_question",
    "clinician_question",
    "sentences",
    "note_excerpt",
    "labels"
]]

**Last quick check**

In [177]:
df_questions.head()

Unnamed: 0,patient_question,clinician_question,sentences,note_excerpt,labels
0,I feel like I was shocked back to normal. What...,What is the rationale for initiating rate cont...,[plan patient is a 67yearold female with past ...,plan patient is a 67yearold female with past m...,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
1,Is my heart surgery still on the table or will...,What is the current plan regarding coronary ar...,"[brief hospital course, the patient was admitt...",brief hospital course the patient was admitted...,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
2,Why did I need IV fluids with bicarbonate?,What was the rationale for initiating intraven...,[brief hospital course the patient was admitte...,brief hospital course the patient was admitted...,"[0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,"I have a fever after the surgery, should I be ...",What is the etiology of the patient’s post-ope...,[procedures and operations right femoral-popli...,procedures and operations right femoral-poplit...,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]"
4,"I have diabetes and they gave me steroids, wil...",How was the patient's glycemic control managed...,[chief complaint worsening shortness of breath...,chief complaint worsening shortness of breath ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]"


# Save

In [178]:
out_file  = OUTPUT_DIR / "generated_questions_RUN02.csv"
df_questions.to_csv(out_file, index=False)
print(f"Saved to: {out_file}")

Saved to: ../data/synthetic/questions/generated_questions_RUN02.csv
