In [1]:
import re
import time
import requests
import random
from pathlib import Path
from datetime import datetime
import pandas as pd
from tqdm.notebook import tqdm
import openai

# `sys` is not imported by the import block above, so bring it into scope here;
# without it the next line raises NameError on a fresh kernel.
import sys

# Extend the module search path with ../configs (presumably where hf_config.py
# lives -- verify against the repo layout) and load the API tokens from it.
sys.path.append("../configs")
from hf_config import hf_token, openai_token

# Setup

In [2]:
# Completions endpoint on localhost (presumably a vLLM server; not referenced by
# the cells visible in this notebook).
VLLM_URL = "http://localhost:8000/v1/completions"

# Few-shot prompt template, resolved relative to the notebook's working directory.
PROMPTS_DIR = Path("prompts")
PROMPT_FILE = PROMPTS_DIR / "generate_notes-few-shot.txt"

# Destination directory for the generated notes. `parents=True` creates any
# missing intermediate directories, so the cell also works on a fresh checkout
# where ../data/synthetic does not exist yet.
OUTPUT_DIR = Path("../data/synthetic/note-excerpts")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Build Prompt

**Define Prompt Template**

In [3]:
# Read the few-shot template explicitly as UTF-8: the default encoding is
# locale-dependent and can mis-decode non-ASCII characters (e.g. em dashes).
prompt = PROMPT_FILE.read_text(encoding="utf-8")

**Configure the OpenAI client and define generation helpers**

In [4]:
# Set the module-level API key on the openai package; the token is the one
# imported from hf_config in the first cell.
openai.api_key = openai_token

In [5]:
# Disabled helper: the snippet is wrapped in a string literal, so it is not executed.
# Run as code, it would print the id of every entry returned by openai.models.list().
# The trailing semicolon keeps the notebook from echoing the string as cell output.
'''
resp = openai.models.list()
for m in resp.data:
    print(m.id)
''';

In [6]:
def generate_with_openai(
    prompt: str,
    model: str = "gpt-4o",
    temperature: float = 0.7,
    max_tokens: int = 1024,
    top_p: float = 0.95,
    n: int = 1,             # total number of completions you want
    batch_size: int = 1,    # how many to ask for per API call (<= model limit)
    retries: int = 3,
    backoff: float = 2.0,
) -> list[str]:
    """
    Generate `n` completions for `prompt` through the OpenAI chat-completions API.

    Requests are issued in batches of at most `batch_size` completions. Each
    batch is attempted up to `retries` times; after a failed attempt the
    function sleeps `backoff ** attempt` seconds before trying again.

    Args:
        prompt: User message sent with a fixed clinical-note system message.
        model: Model identifier passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Per-completion token limit passed to the API.
        top_p: Nucleus-sampling parameter passed to the API.
        n: Total number of completions to return.
        batch_size: Maximum completions requested per API call (must be >= 1).
        retries: Maximum attempts per batch (must be >= 1).
        backoff: Base of the exponential sleep between failed attempts.

    Returns:
        Exactly `n` whitespace-stripped completion strings (an empty list when
        `n <= 0`).

    Raises:
        ValueError: If `batch_size` or `retries` is smaller than 1.
        Exception: The error of the final attempt is re-raised once a batch
            has failed `retries` times; results gathered so far are discarded.
    """
    # With retries < 1 the retry loop below would never run and the outer
    # while-loop would spin forever; batch_size < 1 would request 0 completions.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")

    results: list[str] = []
    while len(results) < n:
        to_request = min(batch_size, n - len(results))
        for attempt in range(1, retries + 1):
            try:
                resp = openai.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "You are a clinical-note generator."},
                        {"role": "user",   "content": prompt},
                    ],
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=top_p,
                    n=to_request,           # ask for this many completions
                )
                # A choice may carry no text (content is None); drop it instead of
                # failing the whole batch. The outer loop re-requests the shortfall.
                chunk = [
                    c.message.content.strip()
                    for c in resp.choices
                    if c.message.content is not None
                ]
                if not chunk:
                    # Treat an empty batch as a failed attempt so it is retried with
                    # backoff instead of looping forever without progress.
                    raise RuntimeError("OpenAI returned no usable completions")
                # Never keep more than requested, so the result has exactly n items.
                results.extend(chunk[:to_request])
                break  # success: break out of the retry loop
            except Exception as e:
                print(f"[Warning] OpenAI API error (attempt {attempt}/{retries}): {e}")
                if attempt == retries:
                    raise
                time.sleep(backoff ** attempt)
    return results


In [8]:
def strip_example_intro(text: str) -> str:
    """
    Remove a leading example header from a raw completion.

    A header line is one that starts with a single em dash (e.g.
    "— Example 5 —") or with a run of three or more dashes (e.g.
    "--- Example 5 ---" or a bare "-----" rule). Consecutive header lines at
    the very start of `text`, and the blank lines after them, are dropped.

    Only the start of the text is inspected, so a dash rule that appears in
    the middle of a note no longer discards the content before it. A header
    line with nothing after it (no newline) is left untouched.

    Returns the remaining text with leading whitespace removed.
    """
    # Hyphens need a run of 3+ so that "- item" bullet lines are not mistaken
    # for headers; a single em dash is enough.
    header = re.compile(r'(?:[-—]{3,}|—)')
    remaining = text.lstrip()
    while True:
        first_line, newline, rest = remaining.partition("\n")
        # Stop at the first line that is not a header, or when nothing follows it.
        if not newline or not header.match(first_line):
            return remaining
        remaining = rest.lstrip()

In [9]:
def remove_end_tag(raw_output: str, end_token: str = "***END NOTE***") -> str:
    """
    Return the part of `raw_output` that precedes the first `end_token`,
    with surrounding whitespace trimmed.

    When `end_token` does not occur, the whole (trimmed) input is returned.
    """
    try:
        cut = raw_output.index(end_token)
    except ValueError:
        # No end token present: keep the full text.
        cut = len(raw_output)
    return raw_output[:cut].strip()

In [10]:
def remove_intro_and_tags(text: str) -> str:
    """Clean a raw completion: strip the example intro first, then cut at the end tag."""
    without_intro = strip_example_intro(text)
    return remove_end_tag(without_intro)

In [11]:
# Run size, passed to generate_with_openai in the next cell.
n = 1200  # total number of notes to generate
batch_size = 10  # completions requested per API call

In [12]:
# Generate all `n` raw notes, requesting `batch_size` completions per API call.
notes = generate_with_openai(
    prompt,
    model="gpt-4o",
    temperature=0.9,
    max_tokens=1024,
    n=n,
    batch_size=batch_size,
)

In [37]:
# Apply the intro/end-tag cleanup to every raw note, preserving order.
cleaned_notes = list(map(remove_intro_and_tags, notes))

In [38]:
# Pick up to 10 raw notes for a manual spot check. The sample size is bounded by
# the number of notes actually held in memory rather than the requested `n`, so
# random.sample cannot raise ValueError when fewer notes are available.
sampled_outputs = random.sample(notes, min(len(notes), 10))

In [39]:
# Print each sampled note after cleanup, followed by a row of x's so that
# consecutive notes are easy to tell apart in the cell output.
for note in sampled_outputs:
    print(remove_intro_and_tags(note))
    print("\n")
    print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
    print("\n")

operative report the patient presented with worsening abdominal pain nausea and vomiting over the course of three days | an exploratory laparotomy revealed a perforated diverticulum in the sigmoid colon | sigmoid resection with primary anastomosis was performed without complications by dr lastname on | brief hospital course postoperatively the patient required a prolonged course of iv antibiotics due to peritoneal contamination as evidenced by leukocytosis and fever persisting despite initial therapy | the patient was placed on tpn on hospital day 2 due to ileus and was gradually advanced to oral intake by day 7 | wound care was performed daily due to copious serosanguinous drainage from the surgical site | the patient’s hemoglobin decreased postoperatively requiring transfusion of two units of packed red blood cells on hospital day 5 | on hospital day 10 the patient developed atrial fibrillation with rapid ventricular response necessitating initiation of metoprolol and anticoagulation

# Save

In [17]:
# File name of the output CSV; joined onto OUTPUT_DIR in the save cell below.
name = "few_shot_gpt4_separated_V2.csv"

In [19]:
# Sanity check: number of raw notes currently held in memory.
len(notes)

1200

In [40]:
# Write the cleaned notes to OUTPUT_DIR as a single-column CSV (no index column).
out_path = OUTPUT_DIR.joinpath(name)
df = pd.DataFrame(data=cleaned_notes, columns=["note_excerpt"])
df.to_csv(path_or_buf=out_path, index=False)
print(f"✅ Saved: {out_path}")

✅ Saved: ../data/synthetic/note-excerpts/few_shot_gpt4_separated_V2.csv
