In [2]:
import random
import re
import sys
import time
from datetime import datetime
from pathlib import Path

import openai
import pandas as pd
import requests
from tqdm.notebook import tqdm

# Extend the module search path with ../configs before importing hf_config
# (presumably where hf_config.py lives — verify against the repo layout).
sys.path.append("../configs")
# hf_config supplies the API tokens used by this notebook.
from hf_config import hf_token, openai_token

# Setup

In [10]:
# URL of a vLLM completions endpoint on localhost.
VLLM_URL = "http://localhost:8000/v1/completions"
# Directory holding the prompt templates and the few-shot template used below.
PROMPTS_DIR = Path("prompts")
PROMPT_FILE = PROMPTS_DIR / "generate_notes-few-shot.txt"
# Destination directory for the generated note excerpts.
OUTPUT_DIR = Path("../data/synthetic/note-excerpts")
# parents=True also creates missing intermediate directories (e.g. on a fresh
# checkout); without it mkdir raises FileNotFoundError when the parent is absent.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Build Prompt

**Load Prompt Template**

In [11]:
# Read the few-shot prompt template. The encoding is pinned to UTF-8 so the
# loaded text does not depend on the platform's default locale encoding.
prompt = PROMPT_FILE.read_text(encoding="utf-8")

In [12]:
# Echo the loaded prompt so its contents can be checked before generation.
prompt

'Below are 4 note excerpts. Replicate their style, header selection, noise patterns and scenario variety.\n\n--- Example 1 ---  \nbrief hospital course during the ercp a pancreatic stent was required to facilitate access to the biliary system removed at the end of the procedure and a common bile duct stent was placed to allow drainage of the biliary obstruction caused by stones and sludge however due to the patients elevated inr no sphincterotomy or stone removal was performed frank pus was noted to be draining from the common bile duct and postercp it was recommended that the patient remain on iv zosyn for at least a week the vancomycin was discontinued on hospital day 4 postprocedure day 3 the patient returned to ercp for reevaluation of her biliary stent as her lfts and bilirubin continued an upward trend on ercp the previous biliary stent was noted to be acutely obstructed by biliary sludge and stones as the patients inr was normalized to 12 a sphincterotomy was safely performed wi

**Generating notes with the OpenAI API**

In [204]:
# Set the API key on the openai module; the generation helper below calls
# the module-level openai.chat.completions API.
openai.api_key = openai_token

In [205]:
# Disabled snippet: wrapped in a string literal so it does not execute.
# When run as code it would print the id of every model returned by
# openai.models.list(). The trailing `;` suppresses the cell's echo of the string.
'''
resp = openai.models.list()
for m in resp.data:
    print(m.id)
''';

In [206]:
def generate_with_openai(
    prompt: str,
    model: str = "gpt-4o",
    temperature: float = 0.7,
    max_tokens: int = 1024,
    top_p: float = 0.95,
    n: int = 1,             # total number of completions you want
    batch_size: int = 1,    # how many to ask for per API call (<= model limit)
    retries: int = 3,
    backoff: float = 2.0,
) -> list[str]:
    """
    Generate `n` completions for the given prompt, in batches of at most `batch_size`.

    Each batch is one chat-completions call (fixed system message plus `prompt`
    as the user message). A failed call is retried up to `retries` times in
    total, sleeping `backoff ** attempt` seconds between attempts.

    Args:
        prompt: User message sent with every request.
        model, temperature, max_tokens, top_p: Forwarded to the API unchanged.
        n: Total number of completions to return; `n <= 0` yields an empty list.
        batch_size: Maximum completions requested per API call (must be >= 1).
        retries: Maximum attempts per batch (must be >= 1).
        backoff: Base of the exponential wait between attempts.

    Returns:
        Exactly `n` whitespace-stripped completion strings.

    Raises:
        ValueError: If `batch_size` or `retries` is smaller than 1.
        Exception: The last error of a batch once all its attempts have failed
            (including a response that carries no usable text).
    """
    if n <= 0:
        return []
    # Both of these would otherwise make the loop below spin forever:
    # no attempt is ever made, or zero completions are requested per call.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")

    results: list[str] = []
    while len(results) < n:
        to_request = min(batch_size, n - len(results))
        for attempt in range(1, retries + 1):
            try:
                resp = openai.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "You are a clinical-note generator."},
                        {"role": "user",   "content": prompt},
                    ],
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=top_p,
                    n=to_request,           # ask for this many completions
                )
                chunk: list[str] = []
                for choice in resp.choices:
                    content = choice.message.content
                    if content is None:
                        # Surface a readable error instead of an AttributeError on None.
                        raise RuntimeError("OpenAI returned a choice without text content")
                    chunk.append(content.strip())
                if not chunk:
                    # An empty response makes no progress; treat it as a failed attempt.
                    raise RuntimeError("OpenAI returned no choices")
                results.extend(chunk)
                break  # success: break out of the retry loop
            except Exception as e:
                print(f"[Warning] OpenAI API error (attempt {attempt}/{retries}): {e}")
                if attempt == retries:
                    raise
                time.sleep(backoff ** attempt)
    # Guard the "exactly n" contract even if a call returned extra choices.
    return results[:n]


In [207]:
def strip_example_intro(text: str) -> str:
    """
    Drop "--- Example N ---"-style separator lines and whatever precedes them.

    The text is split on every line that begins (after optional whitespace)
    with three or more "-" or "—" characters. The piece following the last
    such line is returned with leading whitespace removed. When that piece is
    blank (for instance the output ends with a trailing separator line), the
    nearest earlier non-blank piece is returned instead, so the note is not
    thrown away.

    Returns "" if the text holds nothing besides separators and whitespace.
    """
    # (?:\n|\Z) lets a separator match even when it is the final line and has
    # no terminating newline.
    pieces = re.split(r'^\s*[-—]{3,}.*(?:\n|\Z)', text, flags=re.MULTILINE)
    # Walk backwards: prefer the text after the last separator, but skip
    # blank tails left behind by trailing separators.
    for piece in reversed(pieces):
        if piece.strip():
            return piece.lstrip()
    return ""

In [208]:
def remove_end_tag(raw_output: str, end_token: str = "***END NOTE***") -> str:
    """
    Return the part of `raw_output` that precedes the first `end_token`,
    with surrounding whitespace trimmed.

    If `end_token` does not occur, the whole input is returned trimmed.
    """
    pos = raw_output.find(end_token)
    # find() reports -1 when the token is absent; keep the full text then.
    kept = raw_output if pos < 0 else raw_output[:pos]
    return kept.strip()

In [209]:
def clean_note(text: str) -> str:
    """Clean one raw completion: strip the example header first, then the end tag."""
    without_intro = strip_example_intro(text)
    return remove_end_tag(without_intro)

In [219]:
# Total number of completions to generate in the call below.
n = 300
# Number of completions requested per API call.
batch_size = 10

In [220]:
# Generate all n raw completions from the single few-shot prompt.
# temperature=0.9 overrides the helper's 0.7 default.
notes = generate_with_openai(prompt, model="gpt-4o", temperature=0.9, max_tokens=1024, n=n, batch_size=batch_size)

In [223]:
# Draw up to five raw completions for a manual spot check. The bound uses
# len(notes) rather than n, so the cell still works when `notes` holds fewer
# items than n (random.sample raises ValueError if asked for more than exist).
sampled_outputs = random.sample(notes, min(len(notes), 5))

In [None]:
# Print each sampled completion after cleaning, followed by a row of x's
# as a visual divider between notes.
for note in sampled_outputs:
    print(clean_note(note))
    print("\n")
    print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
    print("\n")

# Save

In [229]:
# File name of the CSV written in the save cell (joined with OUTPUT_DIR there).
name = "few_shot_gpt4_separated.csv"

In [225]:
# Apply clean_note to every raw completion, preserving order.
cleaned_notes = [clean_note(note) for note in notes]

In [232]:
# Peek at the first cleaned note only: the loop exits after one iteration,
# and prints nothing when the list is empty.
for note in cleaned_notes:
    print(note)
    break

surgical pathology consult the patient a 72yearold female presented with hematuria and was found to have a 6 cm complex cystic lesion on her right kidney via ct scan | history includes stage 2 breast cancer sp lumpectomy and adjuvant radiation in 7191 | the renal mass raised suspicion for malignancy and a nephrectomy was advised by her oncologist dr | pathology report confirmed clear cell renal carcinoma grade 2 with negative margins | patient was then referred to medical oncology for adjuvant therapy considerations | brief hospital course patient was admitted for elective right radical nephrectomy performed by urology on | intraoperatively no significant complications occurred | postoperative recovery was uneventful and the patient was advanced to a regular diet by pod 2 | on pod 3 patient developed a fever of 38.5c and tachycardia | blood cultures were positive for klebsiella pneumoniae and pt was started on iv ceftriaxone | fever resolved by pod 5 and patient completed a full 7day a

In [233]:
# Write the cleaned notes to OUTPUT_DIR/name as a CSV with a single
# "note_excerpt" column.
out_path = OUTPUT_DIR / name
df = pd.DataFrame(cleaned_notes, columns=["note_excerpt"])
# index=False: do not write the DataFrame's row index as an extra column.
df.to_csv(out_path, index=False)
print(f"✅ Saved: {out_path}")

✅ Saved: ../data/synthetic/note-excerpts/few_shot_gpt4_separated.csv
