# Import

In [2]:
import re
import ast
import time
import random
import requests
import pandas as pd
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import sent_tokenize

import sys
sys.path.append("../src")
import util.preprocessing_util as util

Download tokenizer

In [3]:
# Download the 'punkt' data package for NLTK's sentence tokenizer.
# NOTE(review): sent_tokenize is imported but never called in the cells shown
# here (sentences are split on "|" instead) - confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/paul.schmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Setup

In [4]:
# Directories and constants
# Completions endpoint that generate_from_vllm posts its prompts to.
VLLM_URL       = "http://localhost:8000/v1/completions"
# Folder holding the prompt templates (relative to the notebook's working directory).
PROMPTS_DIR    = Path("prompts")
# Root of the data tree; every input and output path below hangs off it.
DATA_DIR       = Path("../data")
# Input: folder with the synthetic note-excerpt CSV files.
NOTES_DIR      = DATA_DIR / "synthetic" / "note-excerpts"
# Output: folder the generated questions CSV is written to.
OUTPUT_DIR     = DATA_DIR / "synthetic" / "questions"
# Prompt template with the {example_qas} and {note} placeholders.
PROMPT_FILE    = PROMPTS_DIR / "generate_questions.txt"
# File inside NOTES_DIR that is loaded into the `notes` frame.
NOTE_FILE_NAME = "few_shot_gpt4_separated.csv"

# Ensure output dir exists (creates missing parents; no error if already present)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the prompt template and the synthetic note excerpts.
# The template contains non-ASCII punctuation (typographic hyphens/apostrophes),
# so decode it explicitly as UTF-8 instead of relying on the platform default.
# NOTE(review): assumes the template file is stored as UTF-8 - confirm.
prompt_template = PROMPT_FILE.read_text(encoding="utf-8")
note_file       = NOTES_DIR / NOTE_FILE_NAME
notes           = pd.read_csv(note_file)

In [5]:
notes.head()

Unnamed: 0,note_excerpt
0,surgical pathology consult the patient a 72yea...
1,admission date the patient arrived on 471830 w...
2,postoperative assessment patient underwent exp...
3,### Impression and Plan\n\npatient is a 65year...
4,physical exam on admission | the patient is a ...


Apply standard text cleaning

In [6]:
notes.iloc[0]["note_excerpt"]

'surgical pathology consult the patient a 72yearold female presented with hematuria and was found to have a 6 cm complex cystic lesion on her right kidney via ct scan | history includes stage 2 breast cancer sp lumpectomy and adjuvant radiation in 7191 | the renal mass raised suspicion for malignancy and a nephrectomy was advised by her oncologist dr | pathology report confirmed clear cell renal carcinoma grade 2 with negative margins | patient was then referred to medical oncology for adjuvant therapy considerations | brief hospital course patient was admitted for elective right radical nephrectomy performed by urology on | intraoperatively no significant complications occurred | postoperative recovery was uneventful and the patient was advanced to a regular diet by pod 2 | on pod 3 patient developed a fever of 38.5c and tachycardia | blood cultures were positive for klebsiella pneumoniae and pt was started on iv ceftriaxone | fever resolved by pod 5 and patient completed a full 7day 

**Clean notes**

In [7]:
def clean_note_text(text: str) -> str:
    """Tidy a raw note excerpt.

    Trims the ends, turns literal backslash-n sequences into real newlines,
    squeezes runs of two or more newlines into one, and unwraps ``**bold**``
    markers while keeping the text between them.
    """
    # literal "\n" (backslash + n) -> actual newline character
    unescaped = text.strip().replace('\\n', '\n')
    # blank lines carry no information for the prompt
    single_spaced = re.sub(r'\n{2,}', '\n', unescaped)
    # keep the emphasised words, drop the asterisks
    return re.sub(r'\*{2}(.*?)\*{2}', r'\1', single_spaced)

In [8]:
# Element-wise clean-up of every raw excerpt, stored next to the original text.
raw_excerpts = notes['note_excerpt']
notes['cleaned'] = raw_excerpts.map(clean_note_text)

**Tokenize on sentence level**

In [9]:
def split_sentences_by_line(note: str, delimiter="|") -> list[str]:
    """Cut `note` at every `delimiter` and return the non-blank pieces, trimmed."""
    sentences = []
    for chunk in note.split(delimiter):
        trimmed = chunk.strip()
        if trimmed:  # skip pieces that were empty or whitespace only
            sentences.append(trimmed)
    return sentences

In [10]:
# Split each excerpt into its "|"-separated sentences.
# NOTE(review): this reads the raw `note_excerpt` column, not the `cleaned`
# column computed earlier, so the markdown/newline clean-up is not reflected
# in `sentences` - confirm this is intended.
notes["sentences"] = notes["note_excerpt"].apply(split_sentences_by_line)

In [11]:
test = notes.iloc[0]

In [12]:
test.sentences

['surgical pathology consult the patient a 72yearold female presented with hematuria and was found to have a 6 cm complex cystic lesion on her right kidney via ct scan',
 'history includes stage 2 breast cancer sp lumpectomy and adjuvant radiation in 7191',
 'the renal mass raised suspicion for malignancy and a nephrectomy was advised by her oncologist dr',
 'pathology report confirmed clear cell renal carcinoma grade 2 with negative margins',
 'patient was then referred to medical oncology for adjuvant therapy considerations',
 'brief hospital course patient was admitted for elective right radical nephrectomy performed by urology on',
 'intraoperatively no significant complications occurred',
 'postoperative recovery was uneventful and the patient was advanced to a regular diet by pod 2',
 'on pod 3 patient developed a fever of 38.5c and tachycardia',
 'blood cultures were positive for klebsiella pneumoniae and pt was started on iv ceftriaxone',
 'fever resolved by pod 5 and patient c

# Build few‐shot examples

In [13]:
# Processed dev-set table that the few-shot examples are drawn from.
arch_csv = DATA_DIR.joinpath("dev", "processed", "medical_data.csv")
arch_data = pd.read_csv(arch_csv)

In [14]:
# select a few handpicked cases
case_ids = [1, 3, 14, 19]
num_examples = 4

# Columns each few-shot example is assembled from. Selecting them explicitly
# after the groupby keeps the grouping key (`case_id`) out of the frames handed
# to `apply`; operating on the grouping column is deprecated in recent pandas.
example_cols = ['patient_question', 'clinician_question', 'sentence_text', 'relevance']

few = (arch_data[arch_data.case_id.isin(case_ids)]
       .groupby('case_id')[example_cols]
       .apply(lambda grp: pd.Series({
           # the questions are taken from the first row of the case
           'question': grp.patient_question.iloc[0],
           'clinician_question': grp.clinician_question.iloc[0],
           # one list entry per sentence row of the case
           'sentences': grp.sentence_text.tolist(),
           # binary relevance flag per sentence
           'labels': [1 if r in ['essential','relevant'] else 0 for r in grp.relevance]
       }))
       .reset_index()          # bring case_id back as a regular column
       .head(num_examples))

  .apply(lambda df: pd.Series({


In [15]:
few

Unnamed: 0,case_id,question,clinician_question,sentences,labels
0,1,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,"[brief hospital course, during the ercp a panc...","[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,3,he is continously irritated and has headache w...,what is the expected course of recovery for him,[discharge instructions you were admitted to t...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
2,14,what would you say about cancer in the stomach,was there any evidence for stomach cancer,[discharge instructions you were admitted to t...,"[1, 1, 0, 0, 0, 0, 0, 1, 1]"
3,19,i went to er for a bladder infection the docto...,are her symptoms related to anxiety or cardiov...,"[discharge instructions, why was i admitted to...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


# Cleaning for Few-Shot Prompt

In [16]:
def clean_question(raw: str) -> str:
    """
    Normalise a free-text question for use in the few-shot prompt.

    - Strip leading/trailing whitespace and collapse every whitespace run
      (including newlines) to a single space
    - Remove trailing punctuation (., !, ?) together with any whitespace mixed
      into it, so "why ?" becomes "Why?" rather than "Why ?"
    - Capitalize the first character
    - Add exactly one question mark at the end

    Returns "" when nothing but whitespace and terminal punctuation is present,
    instead of producing a lone "?".
    """
    # strip & normalize all whitespace (newlines included) to single spaces
    s = re.sub(r"\s+", " ", raw.strip())
    # strip trailing punctuation plus any whitespace interleaved with it
    s = re.sub(r"[\s?.!]+$", "", s)
    if not s:
        return s
    # capitalize first char and append a single question mark
    return s[0].upper() + s[1:] + "?"

def clean_sentence(raw: str) -> str:
    """Trim the sentence and squeeze each run of whitespace into one space."""
    trimmed = raw.strip()
    return re.sub(r"\s+", " ", trimmed)

In [17]:
# Both question columns go through the same normaliser.
for question_col in ("question", "clinician_question"):
    few[question_col] = few[question_col].apply(clean_question)

# `sentences` holds one list per row, so clean it element by element.
few["sentences"] = few["sentences"].apply(
    lambda sents: list(map(clean_sentence, sents))
)

Fix typo manually 

In [18]:
# Manual wording fix for case 3 ("continously" -> "continuously").
# The row is selected by case_id rather than by its position in `few`, so the
# fix keeps targeting the right case if the hand-picked case list changes.
few.loc[few["case_id"] == 3, "question"] = "He is continuously irritated and has a headache when awake. What should I do?"

In [19]:
for i, q in enumerate(few.question):
    print(f"{i+1}: {q}")

1: My question is if the sludge was there does not the medication help in flushing it out whether ercp was the only cure?
2: He is continuously irritated and has a headache when awake. What should I do?
3: What would you say about cancer in the stomach?
4: I went to er for a bladder infection the doctor asked if i had irregular heartbeats or afib which lead me getting very panicky?


In [20]:
for i, sentence in enumerate(few.sentences[3]):
    print(f"{i+1}: {sentence}")

1: discharge instructions
2: why was i admitted to the hospital
3: you were admitted to the hospital because you were having chest pain
4: we wanted to rule out a cause
5: what happened while i was here
6: we did a very thorough work up to make sure your heart isnt the cause of your chest pain
7: this included ekgs blood tests that look at whether the heart is under stress and a cat scan to make sure you didnt have a tear in any major blood vessels or a blood clot in your lungs
8: these were all normal
9: you did a stress test and another special heart study called a mibi which did not show any heart problems
10: we gave you ativan and tylenol which improved your pain
11: we also started you on a medicine called sertraline which helps with anxiety which might be the cause of your chest pain
12: we checked your thyroid function which was normal
13: we checked your cholesterol which was normal
14: what should i do when i get home
15: continue taking all your normal medications and the ne

**Format few‐shot block**

In [21]:
def format_few_shot(df: pd.DataFrame) -> str:
    """Render each row of `df` as a worked example and join them with '---' rules.

    Every row must provide `sentences`, `labels`, `question` and
    `clinician_question`. Sentences are numbered from 1, and the numbers of
    those with a truthy label are listed as the relevant ones.
    """
    def render_example(row) -> str:
        # 1-based numbering, matching what the prompt asks the model to cite
        numbered = "\n".join(
            f"{pos}. {sent}" for pos, sent in enumerate(row.sentences, start=1)
        )
        relevant = [pos for pos, flag in enumerate(row.labels, start=1) if flag]
        return (
            f"Note Excerpt:\n{numbered}\n\n"
            f"Patient Question: {row.question}\n"
            f"Clinician Question: {row.clinician_question}\n"
            f"Relevant Sentences: {relevant}"
        )

    return "\n\n---\n\n".join(render_example(row) for _, row in df.iterrows())

few_shot_block = format_few_shot(few)

# Build Prompt

In [22]:
# Read the template through the PROMPT_FILE constant (PROMPTS_DIR / "generate_questions.txt")
# so the path is defined in one place, and decode explicitly as UTF-8 because the
# template contains non-ASCII punctuation.
prompt_template = PROMPT_FILE.read_text(encoding="utf-8")

In [23]:
prompt_template

'Here are examples on how to turn a clinical note excerpt into a patient‑style question, a clinician‑style question, and the sentences needed to answer it:\n\n{example_qas}\n\n---\n\nNow you are given a new clinical note excerpt from a patient’s electronic health record (EHR):\n\n---\n{note}\n---\n\nYour task is to:\n1. Write exactly one realistic patient‑style question someone might ask after reading this note.\n2. Rewrite that question in a formal, clinician‑friendly format.\n3. Identify which sentences in the note are directly relevant to answering the question. Use sentence numbers starting from 1 and only include the ones you need.\n\nOnly include sentences you would quote when directly answering the patient’s question—omit any that merely describe admission details, lab results, or context unless those facts are literally part of your answer.\nDon’t repeat the note itself in your answer—only output the three fields, exactly in this format:\n\nPatient Question: <your patient quest

# Generate

**Helper functions**

In [24]:
def is_valid_generation(text):
    """Return True when `text` contains every marker of a complete answer.

    The check is case-insensitive and purely substring based: both question
    labels, the "relevant sentences" label and an opening and a closing square
    bracket must each occur somewhere in the text.
    """
    lowered = text.lower()
    required_markers = (
        "patient question:",
        "clinician question:",
        "relevant sentences",
        "[",
        "]",
    )
    return all(marker in lowered for marker in required_markers)

def generate_from_vllm(prompt: str,
                       temperature: float = 0.7,
                       max_tokens: int = 1024,
                       retries: int = 3,
                       delay: int = 2,
                       timeout: float = 300.0) -> str:
    """Request one completion for `prompt` from the endpoint at VLLM_URL.

    Args:
        prompt: Full prompt text sent to the server.
        temperature: Sampling temperature forwarded in the payload.
        max_tokens: Upper bound on generated tokens forwarded in the payload.
        retries: Number of attempts before giving up.
        delay: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout, `requests` waits indefinitely and one stalled request
            would block the whole generation run.

    Returns:
        The stripped text of the first choice, or "" if every attempt failed.
    """
    payload = {
        "prompt":     prompt,
        "temperature": temperature,
        "max_tokens":  max_tokens,
        "top_p":       0.9,
    }
    # Transport/HTTP errors plus everything a malformed response body can trigger
    # while it is being decoded and indexed.
    recoverable = (requests.RequestException, KeyError, IndexError,
                   TypeError, ValueError, AttributeError)
    for attempt in range(retries):
        try:
            r = requests.post(VLLM_URL, json=payload, timeout=timeout)
            r.raise_for_status()
            return r.json()["choices"][0]["text"].strip()
        except recoverable as e:
            print(f"[Warning] Attempt {attempt+1} failed: {e}")
            # Only back off when another attempt follows.
            if attempt < retries - 1:
                time.sleep(delay)
    return ""

In [25]:
def clean_output(raw: str) -> str:
    """Drop everything before the first "Patient Question:" label.

    The label is located case-insensitively, so any output accepted by the
    case-insensitive `is_valid_generation` check can also be cleaned here.
    The returned text always starts with the canonical "Patient Question: "
    spelling, followed by the stripped remainder of `raw`.

    Raises:
        ValueError: if `raw` contains no "Patient Question:" label at all.
    """
    sep = "Patient Question:"
    # Find the first occurrence of the label, whatever its casing
    found = re.search(re.escape(sep), raw, flags=re.IGNORECASE)
    if found is None:
        raise ValueError("No Patient Question: in output")
    return sep + " " + raw[found.end():].strip()

In [26]:
notes_test = notes

In [27]:
# Generate questions for each note
outputs = []
for idx, row in tqdm(notes_test.iterrows(), total=len(notes_test), desc="Generating"):
    # Number the sentences from 1, the way the prompt asks the model to cite them
    numbered = "\n".join(f"{i}. {s}" for i, s in enumerate(row.sentences, start=1))
    prompt = prompt_template.format(
        example_qas=few_shot_block,
        note=numbered
    )
    gen = generate_from_vllm(prompt)

    cleaned_gen = None
    if is_valid_generation(gen):
        try:
            cleaned_gen = clean_output(gen)
        except ValueError:
            # clean_output raises when it cannot locate the "Patient Question:"
            # label; treat that as an invalid generation instead of aborting
            # the run and losing the results collected so far.
            cleaned_gen = None

    if cleaned_gen is None:
        print(f"[Warning] Invalid output for idx {idx}")
        print("Generation: " + gen)
        continue

    outputs.append({
        "id":           idx,
        "note_excerpt": row.cleaned,
        "sentences":    row.sentences,
        "output":       cleaned_gen
    })

Generating:   0%|          | 0/300 [00:00<?, ?it/s]

# Quick sanity check

In [28]:
print(f"Number of gen output: {len(outputs)}")

Number of gen output: 300


In [29]:
example = outputs[0]

In [30]:
for i, sentence in enumerate(example["sentences"]):
    print(f"{i+1}: {sentence}")

1: surgical pathology consult the patient a 72yearold female presented with hematuria and was found to have a 6 cm complex cystic lesion on her right kidney via ct scan
2: history includes stage 2 breast cancer sp lumpectomy and adjuvant radiation in 7191
3: the renal mass raised suspicion for malignancy and a nephrectomy was advised by her oncologist dr
4: pathology report confirmed clear cell renal carcinoma grade 2 with negative margins
5: patient was then referred to medical oncology for adjuvant therapy considerations
6: brief hospital course patient was admitted for elective right radical nephrectomy performed by urology on
7: intraoperatively no significant complications occurred
8: postoperative recovery was uneventful and the patient was advanced to a regular diet by pod 2
9: on pod 3 patient developed a fever of 38.5c and tachycardia
10: blood cultures were positive for klebsiella pneumoniae and pt was started on iv ceftriaxone
11: fever resolved by pod 5 and patient complete

In [31]:
print("Generated QA:\n\n", example["output"])

Generated QA:

 Patient Question: I had surgery for a kidney tumor and then got a fever. Was the fever related to the surgery, or something else?
Clinician Question: What was the etiology of the patient’s postoperative fever and how was it managed?
Relevant Sentences: [9, 10, 11, 12]


In [35]:
print("Note excerpt:\n", example["note_excerpt"], "\n")
print("Generated QA:\n", example["output"])

Note excerpt:
 surgical pathology consult the patient a 72yearold female presented with hematuria and was found to have a 6 cm complex cystic lesion on her right kidney via ct scan | history includes stage 2 breast cancer sp lumpectomy and adjuvant radiation in 7191 | the renal mass raised suspicion for malignancy and a nephrectomy was advised by her oncologist dr | pathology report confirmed clear cell renal carcinoma grade 2 with negative margins | patient was then referred to medical oncology for adjuvant therapy considerations | brief hospital course patient was admitted for elective right radical nephrectomy performed by urology on | intraoperatively no significant complications occurred | postoperative recovery was uneventful and the patient was advanced to a regular diet by pod 2 | on pod 3 patient developed a fever of 38.5c and tachycardia | blood cultures were positive for klebsiella pneumoniae and pt was started on iv ceftriaxone | fever resolved by pod 5 and patient complete

# Post Processing 

**Rename the output column** 

In [110]:
# `df_questions` is not created in any cell shown in this notebook, so build it
# here from the generation results. That makes the cell work after
# Restart & Run All and keeps it idempotent on re-run.
# NOTE(review): assumes `outputs` (list of dicts from the generation loop) is the
# intended source - confirm no intermediate save/load step was meant.
df_questions = pd.DataFrame(outputs).rename(columns={'output': 'question'})

**Remove separators from notes**

In [111]:
# Delete every literal "|" sentence separator from the excerpt text.
# The spaces around a separator are kept, so "a | b" becomes "a  b".
df_questions['note_excerpt'] = df_questions['note_excerpt'].str.replace(r'\|', '', regex=True)

In [112]:
df_questions.head()

Unnamed: 0,id,note_excerpt,sentences,question
0,0,surgical pathology consult the patient a 72yea...,[surgical pathology consult the patient a 72ye...,Patient Question: I had surgery for a kidney t...
1,1,admission date the patient arrived on 471830 w...,[admission date the patient arrived on 471830 ...,Patient Question: I have kidney problems and h...
2,2,postoperative assessment patient underwent exp...,[postoperative assessment patient underwent ex...,Patient Question: I am worried about infection...
3,3,### Impression and Plan\npatient is a 65yearol...,[### Impression and Plan\n\npatient is a 65yea...,Patient Question: Why do I have to take so man...
4,4,physical exam on admission the patient is a 7...,"[physical exam on admission, the patient is a ...",Patient Question: I am confused about all thes...


**Split question into: Patient Q, Clinician Q and Relevant Sentences**

In [113]:
# One capture group per field, in the order the labels appear in the generated text.
qa_pattern = (
    r'Patient Question:\s*(.*?)\n'
    r'Clinician Question:\s*(.*?)\n'
    r'Relevant Sentences:\s*(.*)'
)
qa_fields = df_questions['question'].str.extract(qa_pattern)

# Rows whose text does not match the pattern end up as NaN in all three columns.
df_questions[
    ['patient_question','clinician_question','relevant_sentences']
] = qa_fields

**Drop old question column**

In [114]:
df_questions = df_questions.drop(columns='question')

**Convert relevant_sentences from 1-based sentence indices to binary labels**

In [115]:
# Turn each textual list (e.g. "[9, 10]") into a real Python list.
index_strings = df_questions['relevant_sentences']
df_questions['relevant_sentences'] = index_strings.map(ast.literal_eval)

In [116]:
# One 0/1 flag per sentence: 1 when its 1-based position was listed as relevant.
df_questions['labels'] = df_questions.apply(
    lambda row: [
        int(position in row['relevant_sentences'])
        for position in range(1, len(row['sentences']) + 1)
    ],
    axis=1
)

**Drop old relevant sentences column**

In [124]:
df_questions = df_questions.drop(columns='relevant_sentences')

**Clean generated text columns**

In [None]:
text_columns = ["patient_question", "clinician_question", "note_excerpt"]
list_columns = ["sentences"]

In [None]:
df_questions = util.clean_text_df(df_questions, text_columns = text_columns, list_columns = list_columns)

**Reorder columns to improve readability**

In [14]:
# Final layout: the two questions first, then the note material, then the labels.
final_columns = [
    "patient_question",
    "clinician_question",
    "sentences",
    "note_excerpt",
    "labels",
]
df_questions = df_questions[final_columns]

**Last quick check**

In [15]:
df_questions.head()

Unnamed: 0,patient_question,clinician_question,sentences,note_excerpt,labels
0,I had surgery for a kidney tumor and then got ...,What was the etiology of the patient’s postope...,[surgical pathology consult the patient a 72ye...,surgical pathology consult the patient a 72yea...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]"
1,I have kidney problems and high blood pressure...,What is the relationship between the patient's...,[admission date the patient arrived on 471830 ...,admission date the patient arrived on 471830 w...,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 0]"
2,"I am worried about infection, and I live alone...",What are the key discharge instructions regard...,[postoperative assessment patient underwent ex...,postoperative assessment patient underwent exp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Why do I have to take so many medications?,What is the rationale for the patient's curren...,[### Impression and Plan patient is a 65yearol...,### Impression and Plan patient is a 65yearold...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,I am confused about all these heart and kidney...,What is the rationale for initiating sacubitri...,"[physical exam on admission, the patient is a ...",physical exam on admission the patient is a 74...,"[0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0]"


# Save

In [12]:
# Write the final frame without the index column and report where it went.
out_file = OUTPUT_DIR.joinpath("generated_questions_RUN01.csv")
df_questions.to_csv(out_file, index=False)
print(f"Saved to: {out_file}")

Saved to: ../data/synthetic/questions/generated_questions_RUN01.csv
