# Import

In [61]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
import re
import ast
import time
import random
import requests
import pandas as pd
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import sent_tokenize
import openai

import sys
sys.path.append("../src")
sys.path.append("../configs")
import util.preprocessing_util as util
from hf_config import hf_token, openai_token

Download tokenizer

In [63]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/paul.schmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Setup

In [64]:
# Directories and constants
VLLM_URL       = "http://localhost:8000/v1/completions"
PROMPTS_DIR    = Path("prompts")
DATA_DIR       = Path("../data")
# NOTES_DIR      = DATA_DIR / "synthetic" / "note-excerpts"
NOTES_DIR      = DATA_DIR / "dev" / "processed"
OUTPUT_DIR     = DATA_DIR / "synthetic" / "questions"
PROMPT_FILE    = PROMPTS_DIR / "generate_questions.txt"
NOTE_FILE_NAME = "medical_data.csv"

# Ensure output dir exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [65]:
prompt_template = PROMPT_FILE.read_text()
note_file       = NOTES_DIR / NOTE_FILE_NAME
notes           = pd.read_csv(note_file)

In [66]:
print(f"Number of Notes: {len(notes)}")

Number of Notes: 20


In [67]:
print(prompt_template)

Here are examples on how to turn a clinical note excerpt into a patient‑style question, a clinician‑style question, and the sentences needed to answer it:

{example_qas}

---

Now you are given a new clinical note excerpt from a patient’s electronic health record (EHR):

---
{note}
---

Your task is to:
1. Write **exactly one** realistic patient‑style question someone might ask after reading this note.
2. Rewrite that question in a formal, clinician‑friendly format.
3. Identify which sentences in the note are **directly and literally** used to answer your question.

Don’t repeat the note itself in your answer—only output the three fields, exactly in this format:

Patient Question: <your patient question>  
Clinician Question: <your clinician question>  
Relevant Sentences: [<num1>, <num2>, …]


In [68]:
notes.head()

Unnamed: 0,case_id,patient_question,clinician_question,note_excerpt,sentences,sentence_text,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,Brief Hospital Course: During the ERCP a pancr...,"['Brief Hospital Course:', 'During the ERCP a ...",Brief Hospital Course:,"[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,2,dad given multiple shots of lasciks after he w...,Why was he given lasix and his oxygen flow rat...,Brief Hospital Course: Acute diastolic heart f...,"['Brief Hospital Course:', 'Acute diastolic he...",Brief Hospital Course:,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0]"
2,3,he is continously irritated and has headache w...,What is the expected course of recovery for him?,Discharge Instructions: You were admitted to t...,['Discharge Instructions: You were admitted to...,Discharge Instructions: You were admitted to t...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
3,4,My doctor performed a cardiac catherization.,Why was cardiac catheterization recommended to...,History of Present Illness: On the cardiology ...,"['History of Present Illness:', 'On the cardio...",History of Present Illness:,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"I overdosed October 4th on trihexyphenidyl, th...",Is the pain connected to the overdose or somet...,"Brief Hospital Course: # Bipolar d/o, PTSD, sc...","['Brief Hospital Course:', ""# Bipolar d/o, PTS...",Brief Hospital Course:,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ..."


In [69]:
notes.iloc[0]["note_excerpt"]

"Brief Hospital Course: During the ERCP a pancreatic stent was required to facilitate access to the biliary system (removed at the end of the procedure), and a common bile duct stent was placed to allow drainage of the biliary obstruction caused by stones and sludge. However, due to the patient's elevated INR, no sphincterotomy or stone removal was performed. Frank pus was noted to be draining from the common bile duct, and post-ERCP it was recommended that the patient remain on IV Zosyn for at least a week. The Vancomycin was discontinued. On hospital day 4 (post-procedure day 3) the patient returned to ERCP for re-evaluation of her biliary stent as her LFTs and bilirubin continued an upward trend. On ERCP the previous biliary stent was noted to be acutely obstructed by biliary sludge and stones. As the patient's INR was normalized to 1.2, a sphincterotomy was safely performed, with removal of several biliary stones in addition to the common bile duct stent. At the conclusion of the p

In [70]:
def split_sentences_by_line(note: str, delimiter="|") -> list[str]:
    """Split ``note`` on ``delimiter`` and return the non-empty, stripped pieces."""
    pieces = []
    for chunk in note.split(delimiter):
        trimmed = chunk.strip()
        # Skip fragments that are empty or whitespace-only.
        if trimmed:
            pieces.append(trimmed)
    return pieces

In [71]:
# notes["sentences"] = notes["note_excerpt"].apply(split_sentences_by_line)

# Build few‐shot examples

In [72]:
arch_csv = DATA_DIR / "dev" / "processed" / "medical_data.csv"
arch_data = pd.read_csv(arch_csv)

In [73]:
arch_data.head()

Unnamed: 0,case_id,patient_question,clinician_question,note_excerpt,sentences,sentence_text,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,Brief Hospital Course: During the ERCP a pancr...,"['Brief Hospital Course:', 'During the ERCP a ...",Brief Hospital Course:,"[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,2,dad given multiple shots of lasciks after he w...,Why was he given lasix and his oxygen flow rat...,Brief Hospital Course: Acute diastolic heart f...,"['Brief Hospital Course:', 'Acute diastolic he...",Brief Hospital Course:,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0]"
2,3,he is continously irritated and has headache w...,What is the expected course of recovery for him?,Discharge Instructions: You were admitted to t...,['Discharge Instructions: You were admitted to...,Discharge Instructions: You were admitted to t...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
3,4,My doctor performed a cardiac catherization.,Why was cardiac catheterization recommended to...,History of Present Illness: On the cardiology ...,"['History of Present Illness:', 'On the cardio...",History of Present Illness:,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"I overdosed October 4th on trihexyphenidyl, th...",Is the pain connected to the overdose or somet...,"Brief Hospital Course: # Bipolar d/o, PTSD, sc...","['Brief Hospital Course:', ""# Bipolar d/o, PTS...",Brief Hospital Course:,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ..."


The `sentences` and `labels` columns are stored as stringified lists in the CSV, so they are parsed back into actual Python lists.

In [74]:
# The CSV stores the list-valued columns as their string representation; parse
# them back into Python lists. The isinstance guard keeps this cell safe to
# re-run: once a column already holds lists, ast.literal_eval would raise on
# the non-string input.
for _list_col in ("sentences", "labels"):
    arch_data[_list_col] = arch_data[_list_col].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

In [75]:
# Select a few handpicked cases to serve as few-shot examples.
case_ids = [1, 3, 14, 19]
num_examples = 4

# Keep the first row of every selected case, ordered by case_id. This yields the
# same frame as grouping by case_id and taking `.iloc[0]` of each column, but
# without calling GroupBy.apply on the grouping column (which is what triggers
# the pandas warning in this cell's output).
few = (
    arch_data.loc[
        arch_data["case_id"].isin(case_ids),
        ["case_id", "patient_question", "clinician_question", "sentences", "labels"],
    ]
    .drop_duplicates(subset="case_id", keep="first")
    .sort_values("case_id")
    .reset_index(drop=True)
    .head(num_examples)
)

  .apply(lambda df: pd.Series({


## Cleaning for Few-Shot Examples

In [76]:
few.head()

Unnamed: 0,case_id,patient_question,clinician_question,sentences,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,"[Brief Hospital Course:, During the ERCP a pan...","[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,3,he is continously irritated and has headache w...,What is the expected course of recovery for him?,[Discharge Instructions: You were admitted to ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
2,14,What would you say about cancer in the stomach?,Was there any evidence for stomach cancer?,[Discharge Instructions: You were admitted to ...,"[1, 1, 0, 0, 0, 0, 0, 1, 1]"
3,19,I went to ER for a bladder infection. The doct...,Are her symptoms related to anxiety or cardiov...,"[Discharge Instructions:, Why was I admitted t...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [77]:
few = util.clean_text_df(few, text_columns = ["patient_question", "clinician_question"])

In [78]:
few.head()

Unnamed: 0,case_id,patient_question,clinician_question,sentences,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,"[Brief Hospital Course:, During the ERCP a pan...","[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,3,he is continously irritated and has headache w...,What is the expected course of recovery for him?,[Discharge Instructions: You were admitted to ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
2,14,What would you say about cancer in the stomach?,Was there any evidence for stomach cancer?,[Discharge Instructions: You were admitted to ...,"[1, 1, 0, 0, 0, 0, 0, 1, 1]"
3,19,I went to ER for a bladder infection. The doct...,Are her symptoms related to anxiety or cardiov...,"[Discharge Instructions:, Why was I admitted t...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [79]:
for i, q in enumerate(few.patient_question):
    print(f"{i+1}: {q}")

1: My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?
2: he is continously irritated and has headache when awake what do ido
3: What would you say about cancer in the stomach?
4: I went to ER for a bladder infection. The doctor asked if I had irregular heartbeats or a-fib, which lead me getting very panicky


In [80]:
for i, sentence in enumerate(few.sentences[3]):
    print(f"{i+1}: {sentence}")

1: Discharge Instructions:
2: Why was I admitted to the hospital?
3: You were admitted to the hospital because you were having chest pain.
4: We wanted to rule out a ___ cause.
5: What happened while I was here?
6: - We did a very thorough work up to make sure your heart isn't the cause of your chest pain.
7: This included EKGs, blood tests that look at whether the heart is under stress, and a CAT scan to make sure you didn't have a tear in any major blood vessels or a blood clot in your lungs.
8: These were all normal.
9: You did a stress test and another special heart study called a MIBI, which did not show any heart problems.
10: - We gave you Ativan and Tylenol, which improved your pain.
11: - We also started you on a medicine called sertraline which helps with anxiety, which might be the cause of your chest pain.
12: - We checked your thyroid function which was normal.
13: - We checked your cholesterol which was normal.
14: What should I do when I get home?
15: - Continue taking a

# Generate

**Helper function**

In [81]:
def is_valid_generation(text):
    """Return True when ``text`` contains every marker of a complete Q&A block.

    The check is case-insensitive and purely substring-based: it looks for the
    two question labels, the "relevant sentences" label and a pair of square
    brackets, without verifying their order.
    """
    lowered = text.lower()
    required_markers = (
        "patient question:",
        "clinician question:",
        "relevant sentences",
        "[",
        "]",
    )
    return all(marker in lowered for marker in required_markers)

In [82]:
def split_output(raw: str) -> str:
    """Drop everything before the first "Patient Question:" marker.

    Returns the marker followed by a single space and the stripped remainder of
    ``raw``. Raises ValueError when the marker does not occur in ``raw``.
    """
    marker = "Patient Question:"
    _, found, remainder = raw.partition(marker)
    if not found:
        raise ValueError("No Patient Question: in output")
    return f"{marker} {remainder.strip()}"

**API-Call function**

In [83]:
def generate_with_openai(
    prompt: str,
    model: str = "gpt-4o-mini",
    temperature: float = 0.9,
    max_tokens: int = 1024,
    top_p: float = 0.9,
    n: int = 1
) -> list[str]:
    """Request ``n`` chat completions for ``prompt`` and return their texts.

    Args:
        prompt: User message sent to the model.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion length limit forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        n: Number of completions to request.

    Returns:
        One stripped string per returned choice. A choice whose message has no
        text content is returned as an empty string.
    """
    resp = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",  "content": "You are a clinical-QA generator."},
            {"role": "user",    "content": prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        # Generation stops at the first blank line, so each completion is a
        # single block of consecutive lines.
        stop=["\n\n"],
        n=n
    )
    # `message.content` may be None; fall back to "" so `.strip()` cannot raise
    # and the caller's validity check simply rejects the empty text.
    return [(c.message.content or "").strip() for c in resp.choices]

**Prompt Building**

**Format few‐shot block**

In [84]:
def format_few_shot(df: pd.DataFrame, n=3, random_state=None) -> str:
    """Render ``n`` randomly sampled rows of ``df`` as a few-shot example block.

    Args:
        df: Frame with the columns ``sentences`` (list of strings), ``labels``
            (list of 0/1 flags, one per sentence), ``patient_question`` and
            ``clinician_question``.
        n: Number of rows to sample (without replacement).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` keeps the sampling unseeded.

    Returns:
        The formatted examples joined by a ``---`` separator line. Sentences are
        numbered from 1 and "Relevant Sentences" lists the numbers of the
        sentences whose label is truthy.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when ``n`` exceeds the
            number of rows in ``df``.
    """
    sampled_data = df.sample(n=n, random_state=random_state)
    exs = []
    for _, row in sampled_data.iterrows():
        numbered = "\n".join(f"{i+1}. {s}" for i, s in enumerate(row.sentences))
        # Convert the 0/1 label vector into 1-based sentence numbers.
        relevant = [i+1 for i, lbl in enumerate(row.labels) if lbl]
        exs.append(
            f"Note Excerpt:\n{numbered}\n\n"
            f"Patient Question: {row.patient_question}\n"
            f"Clinician Question: {row.clinician_question}\n"
            f"Relevant Sentences: {relevant}"
        )
    return "\n\n---\n\n".join(exs)

**Generation Loop**

In [85]:
openai.api_key = openai_token

In [86]:
def safe_eval(x):
    """Parse ``x`` with ``ast.literal_eval`` if it is a string; otherwise return it unchanged."""
    if not isinstance(x, str):
        return x
    return ast.literal_eval(x)

In [87]:
notes["sentences"] = notes["sentences"].apply(safe_eval)

In [88]:
outputs = []
num_qa_pairs = 6   # number of Q&A generations requested per note
num_examples = 1   # number of few-shot examples placed in each prompt

for idx, row in tqdm(notes.iterrows(), total=len(notes), desc="Generating"):

    # Number the note's sentences from 1 so the model can reference them.
    numbered = "\n".join(f"{i+1}. {s}" for i, s in enumerate(row.sentences))

    # `notes` and `arch_data` are read from the same CSV, so the note being
    # processed could otherwise be sampled as its own few-shot example (together
    # with its reference question). Exclude it from the example pool whenever
    # the note carries a case_id.
    current_case = row.get("case_id")
    if current_case is None:
        example_pool = arch_data
    else:
        example_pool = arch_data[arch_data["case_id"] != current_case]

    for _ in range(num_qa_pairs):

        # Draw fresh examples for every request.
        few_shot_block = format_few_shot(example_pool, n=num_examples)

        prompt = prompt_template.format(
            example_qas=few_shot_block,
            note=numbered
        )

        gens = generate_with_openai(prompt, n=1)

        # gens holds one Q&A block per requested completion (n=1 here)
        for qa_text in gens:
            if not is_valid_generation(qa_text):
                print(f"[Warning] Bad generation for idx={idx!r}: {qa_text}")
                continue
            outputs.append({
                "id":            idx,
                "note_excerpt":  row.note_excerpt,
                "sentences":     row.sentences,
                "output":        qa_text,
            })

Generating:   0%|          | 0/20 [00:00<?, ?it/s]

In [89]:
first = outputs[0]
second = outputs[1]
third = outputs[2]

In [90]:
first["output"]

'Patient Question: Why did I need a stent for my bile duct, and what was found during the procedure?  \nClinician Question: What findings were observed during the ERCP regarding the biliary obstruction and subsequent management?  \nRelevant Sentences: [2, 4, 6, 7, 8]'

In [91]:
second["output"]

'Patient Question: Why did I need another procedure after the first ERCP?  \nClinician Question: What were the reasons for the re-evaluation of the biliary stent and subsequent procedures?  \nRelevant Sentences: [2, 3, 4, 6, 7, 8]'

In [92]:
third["output"]

'Patient Question: Why did I need a stent placed in my bile duct?  \nClinician Question: What was the indication for placing a stent in the common bile duct during the ERCP?  \nRelevant Sentences: [2, 3]'

In [93]:
for i, row in enumerate(first["sentences"]):
    print(f"{i+1}: {row}")
    print("\n")

1: Brief Hospital Course:


2: During the ERCP a pancreatic stent was required to facilitate access to the biliary system (removed at the end of the procedure), and a common bile duct stent was placed to allow drainage of the biliary obstruction caused by stones and sludge.


3: However, due to the patient's elevated INR, no sphincterotomy or stone removal was performed.


4: Frank pus was noted to be draining from the common bile duct, and post-ERCP it was recommended that the patient remain on IV Zosyn for at least a week.


5: The Vancomycin was discontinued.


6: On hospital day 4 (post-procedure day 3) the patient returned to ERCP for re-evaluation of her biliary stent as her LFTs and bilirubin continued an upward trend.


7: On ERCP the previous biliary stent was noted to be acutely obstructed by biliary sludge and stones.


8: As the patient's INR was normalized to 1.2, a sphincterotomy was safely performed, with removal of several biliary stones in addition to the common bile d

In [94]:
second["sentences"]

['Brief Hospital Course:',
 'During the ERCP a pancreatic stent was required to facilitate access to the biliary system (removed at the end of the procedure), and a common bile duct stent was placed to allow drainage of the biliary obstruction caused by stones and sludge.',
 "However, due to the patient's elevated INR, no sphincterotomy or stone removal was performed.",
 'Frank pus was noted to be draining from the common bile duct, and post-ERCP it was recommended that the patient remain on IV Zosyn for at least a week.',
 'The Vancomycin was discontinued.',
 'On hospital day 4 (post-procedure day 3) the patient returned to ERCP for re-evaluation of her biliary stent as her LFTs and bilirubin continued an upward trend.',
 'On ERCP the previous biliary stent was noted to be acutely obstructed by biliary sludge and stones.',
 "As the patient's INR was normalized to 1.2, a sphincterotomy was safely performed, with removal of several biliary stones in addition to the common bile duct sten

In [95]:
third["sentences"]

['Brief Hospital Course:',
 'During the ERCP a pancreatic stent was required to facilitate access to the biliary system (removed at the end of the procedure), and a common bile duct stent was placed to allow drainage of the biliary obstruction caused by stones and sludge.',
 "However, due to the patient's elevated INR, no sphincterotomy or stone removal was performed.",
 'Frank pus was noted to be draining from the common bile duct, and post-ERCP it was recommended that the patient remain on IV Zosyn for at least a week.',
 'The Vancomycin was discontinued.',
 'On hospital day 4 (post-procedure day 3) the patient returned to ERCP for re-evaluation of her biliary stent as her LFTs and bilirubin continued an upward trend.',
 'On ERCP the previous biliary stent was noted to be acutely obstructed by biliary sludge and stones.',
 "As the patient's INR was normalized to 1.2, a sphincterotomy was safely performed, with removal of several biliary stones in addition to the common bile duct sten

In [96]:
for i, row in enumerate(outputs):
    print(row["output"])
    print()
    print("\n")

Patient Question: Why did I need a stent for my bile duct, and what was found during the procedure?  
Clinician Question: What findings were observed during the ERCP regarding the biliary obstruction and subsequent management?  
Relevant Sentences: [2, 4, 6, 7, 8]



Patient Question: Why did I need another procedure after the first ERCP?  
Clinician Question: What were the reasons for the re-evaluation of the biliary stent and subsequent procedures?  
Relevant Sentences: [2, 3, 4, 6, 7, 8]



Patient Question: Why did I need a stent placed in my bile duct?  
Clinician Question: What was the indication for placing a stent in the common bile duct during the ERCP?  
Relevant Sentences: [2, 3]



Patient Question: Why did I need a stent if I had stones and sludge in my bile duct?  
Clinician Question: What was the rationale for placing a stent during the ERCP despite the presence of biliary stones and sludge?  
Relevant Sentences: [2, 3, 7]



Patient Question: Why did I need a stent duri

In [97]:
example = split_output(outputs[0]["output"])

In [98]:
example

'Patient Question: Why did I need a stent for my bile duct, and what was found during the procedure?  \nClinician Question: What findings were observed during the ERCP regarding the biliary obstruction and subsequent management?  \nRelevant Sentences: [2, 4, 6, 7, 8]'

**Quick sanity check**

In [99]:
notes.head()

Unnamed: 0,case_id,patient_question,clinician_question,note_excerpt,sentences,sentence_text,labels
0,1,My question is if the sludge was there does no...,Why was ERCP recommended to him over continuin...,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Brief Hospital Course:,"[0, 1, 0, 0, 0, 1, 1, 1, 0]"
1,2,dad given multiple shots of lasciks after he w...,Why was he given lasix and his oxygen flow rat...,Brief Hospital Course: Acute diastolic heart f...,"[Brief Hospital Course:, Acute diastolic heart...",Brief Hospital Course:,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0]"
2,3,he is continously irritated and has headache w...,What is the expected course of recovery for him?,Discharge Instructions: You were admitted to t...,[Discharge Instructions: You were admitted to ...,Discharge Instructions: You were admitted to t...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
3,4,My doctor performed a cardiac catherization.,Why was cardiac catheterization recommended to...,History of Present Illness: On the cardiology ...,"[History of Present Illness:, On the cardiolog...",History of Present Illness:,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"I overdosed October 4th on trihexyphenidyl, th...",Is the pain connected to the overdose or somet...,"Brief Hospital Course: # Bipolar d/o, PTSD, sc...","[Brief Hospital Course:, # Bipolar d/o, PTSD, ...",Brief Hospital Course:,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ..."


In [100]:
print(prompt_template)

Here are examples on how to turn a clinical note excerpt into a patient‑style question, a clinician‑style question, and the sentences needed to answer it:

{example_qas}

---

Now you are given a new clinical note excerpt from a patient’s electronic health record (EHR):

---
{note}
---

Your task is to:
1. Write **exactly one** realistic patient‑style question someone might ask after reading this note.
2. Rewrite that question in a formal, clinician‑friendly format.
3. Identify which sentences in the note are **directly and literally** used to answer your question.

Don’t repeat the note itself in your answer—only output the three fields, exactly in this format:

Patient Question: <your patient question>  
Clinician Question: <your clinician question>  
Relevant Sentences: [<num1>, <num2>, …]


# Post Processing 

In [101]:
df_questions = pd.DataFrame(outputs)

**Rename the output column** 

In [102]:
df_questions = df_questions.rename(columns={'output':'question'})

**Remove Separators from notes**

In [103]:
df_questions['note_excerpt'] = df_questions['note_excerpt'].str.replace(r'\|', '', regex=True)

In [104]:
df_questions.head()

Unnamed: 0,id,note_excerpt,sentences,question
0,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Patient Question: Why did I need a stent for m...
1,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Patient Question: Why did I need another proce...
2,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Patient Question: Why did I need a stent place...
3,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Patient Question: Why did I need a stent if I ...
4,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Patient Question: Why did I need a stent durin...


**Split question into: Patient Q, Clinician Q and Relevant Sentences**

In [105]:
# Split the raw generation into its three fields.
# - IGNORECASE and the optional colon after "Relevant Sentences" mirror the
#   case-insensitive marker check in `is_valid_generation`, so a generation
#   accepted there is not silently turned into NaN here.
# - `\s*` before each newline drops the trailing spaces the model emits at the
#   end of each line.
# - Only the bracketed list itself is captured, so any text after the closing
#   bracket cannot break the later literal parsing.
df_questions[
    ['patient_question','clinician_question','relevant_sentences']
] = df_questions['question'].str.extract(
    r'Patient Question:\s*(.*?)\s*\n'
    r'\s*Clinician Question:\s*(.*?)\s*\n'
    r'\s*Relevant Sentences:?\s*(\[.*?\])',
    flags=re.IGNORECASE,
)

**Drop old question column**

In [106]:
df_questions = df_questions.drop(columns='question')

**Convert relevant_sentences from 1-based sentence numbers to a binary label vector** 

In [107]:
# Parse the literal list-strings into actual lists.
def _parse_sentence_indices(cell):
    """Return the list of sentence numbers encoded in ``cell``.

    Lists pass through unchanged so this cell can be re-run. A value that is
    missing or cannot be parsed as a list/tuple literal is reported and
    replaced by an empty list instead of aborting the whole conversion.
    """
    if isinstance(cell, list):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    print(f"[Warning] Could not parse relevant sentences: {cell!r}")
    return []


df_questions['relevant_sentences'] = (
    df_questions['relevant_sentences']
      .apply(_parse_sentence_indices)
)

In [108]:
def _to_binary_labels(record):
    """Build a 0/1 vector with one entry per sentence; 1 marks a relevant sentence."""
    chosen = record['relevant_sentences']
    n_sentences = len(record['sentences'])
    # Sentence numbers are 1-based, hence the range starting at 1.
    return [int(position in chosen) for position in range(1, n_sentences + 1)]


df_questions['labels'] = df_questions.apply(_to_binary_labels, axis=1)

**Drop old relevant sentences column**

In [109]:
df_questions.head()

Unnamed: 0,id,note_excerpt,sentences,patient_question,clinician_question,relevant_sentences,labels
0,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...","Why did I need a stent for my bile duct, and w...",What findings were observed during the ERCP re...,"[2, 4, 6, 7, 8]","[0, 1, 0, 1, 0, 1, 1, 1, 0]"
1,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Why did I need another procedure after the fir...,What were the reasons for the re-evaluation of...,"[2, 3, 4, 6, 7, 8]","[0, 1, 1, 1, 0, 1, 1, 1, 0]"
2,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Why did I need a stent placed in my bile duct?,What was the indication for placing a stent in...,"[2, 3]","[0, 1, 1, 0, 0, 0, 0, 0, 0]"
3,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Why did I need a stent if I had stones and slu...,What was the rationale for placing a stent dur...,"[2, 3, 7]","[0, 1, 1, 0, 0, 0, 1, 0, 0]"
4,0,Brief Hospital Course: During the ERCP a pancr...,"[Brief Hospital Course:, During the ERCP a pan...",Why did I need a stent during the procedure?,What were the indications for placing a biliar...,"[2, 3]","[0, 1, 1, 0, 0, 0, 0, 0, 0]"


In [110]:
df_questions = df_questions.drop(columns='relevant_sentences')

**Clean generated text columns**

In [111]:
text_columns = ["patient_question", "clinician_question", "note_excerpt"]
list_columns = ["sentences"]

In [112]:
df_questions = util.clean_text_df(df_questions, text_columns = text_columns, list_columns = list_columns)

**Reorder columns to improve readability**

In [113]:
df_questions = df_questions[[
    "patient_question",
    "clinician_question",
    "sentences",
    "note_excerpt",
    "labels"
]]

**Last quick check**

In [114]:
df_questions.head()

Unnamed: 0,patient_question,clinician_question,sentences,note_excerpt,labels
0,"Why did I need a stent for my bile duct, and w...",What findings were observed during the ERCP re...,"[Brief Hospital Course:, During the ERCP a pan...",Brief Hospital Course: During the ERCP a pancr...,"[0, 1, 0, 1, 0, 1, 1, 1, 0]"
1,Why did I need another procedure after the fir...,What were the reasons for the re-evaluation of...,"[Brief Hospital Course:, During the ERCP a pan...",Brief Hospital Course: During the ERCP a pancr...,"[0, 1, 1, 1, 0, 1, 1, 1, 0]"
2,Why did I need a stent placed in my bile duct?,What was the indication for placing a stent in...,"[Brief Hospital Course:, During the ERCP a pan...",Brief Hospital Course: During the ERCP a pancr...,"[0, 1, 1, 0, 0, 0, 0, 0, 0]"
3,Why did I need a stent if I had stones and slu...,What was the rationale for placing a stent dur...,"[Brief Hospital Course:, During the ERCP a pan...",Brief Hospital Course: During the ERCP a pancr...,"[0, 1, 1, 0, 0, 0, 1, 0, 0]"
4,Why did I need a stent during the procedure?,What were the indications for placing a biliar...,"[Brief Hospital Course:, During the ERCP a pan...",Brief Hospital Course: During the ERCP a pancr...,"[0, 1, 1, 0, 0, 0, 0, 0, 0]"


# Save

In [115]:
out_file  = OUTPUT_DIR / "real-notes-6-qs.csv"
df_questions.to_csv(out_file, index=False)
print(f"Saved to: {out_file}")

Saved to: ../data/synthetic/questions/real-notes-6-qs.csv


# Backup

In [101]:
def generate_from_vllm(prompt: str,
                       temperature: float = 0.7,
                       max_tokens: int = 1024,
                       retries: int = 3,
                       delay: int = 2,
                       timeout: float = 60.0) -> str:
    """POST ``prompt`` to the completions endpoint at ``VLLM_URL`` and return the text.

    Args:
        prompt: Prompt text placed in the request payload.
        temperature: Sampling temperature placed in the request payload.
        max_tokens: Completion length limit placed in the request payload.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server before an attempt is treated as
            failed; without it a stalled server would block the call forever.

    Returns:
        The stripped text of the first choice, or an empty string when every
        attempt failed.
    """
    payload = {
        "prompt":     prompt,
        "temperature": temperature,
        "max_tokens":  max_tokens,
        "top_p":       0.9,
    }
    for attempt in range(1, retries + 1):
        try:
            r = requests.post(VLLM_URL, json=payload, timeout=timeout)
            r.raise_for_status()
            return r.json()["choices"][0]["text"].strip()
        # Deliberately broad: this helper is best-effort and reports any failure
        # (network, HTTP status, unexpected response shape) before retrying.
        except Exception as e:
            print(f"[Warning] Attempt {attempt} failed: {e}")
            # No point in waiting once the last attempt has failed.
            if attempt < retries:
                time.sleep(delay)
    return ""

In [None]:
# Generate questions for each note (vLLM backup path)
# NOTE(review): `notes_test` is not defined anywhere in the visible notebook —
# confirm which frame this loop is meant to iterate over before running it.
outputs = []

# Build the few-shot block explicitly instead of relying on whatever value an
# earlier loop happened to leave behind in `few_shot_block`.
few_shot_block = format_few_shot(arch_data, n=num_examples)

for idx, row in tqdm(notes_test.iterrows(), total=len(notes_test), desc="Generating"):
    numbered = "\n".join(f"{i+1}. {s}" for i, s in enumerate(row.sentences))
    prompt = prompt_template.format(
        example_qas=few_shot_block,
        note=numbered
    )
    gen = generate_from_vllm(prompt)

    if is_valid_generation(gen):
        # The leftover debug `print(gen)` / `break` made this append unreachable
        # and stopped the loop after the first valid generation.
        outputs.append({
            "id":           idx,
            "note_excerpt": row.note_excerpt,
            "sentences":    row.sentences,
            "output":       split_output(gen)
        })
    else:
        print(f"[Warning] Invalid output for idx {idx}")
        print("Generation: " + gen)