In [1]:
pip install faker


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
"""
generate_nurse_reports.py
-------------------------------------------------
Creates a CSV of synthetic nurse shift reports for
NLP experiments (summarisation, anomaly detection, etc.).
No paid API or cloud service required.

Author: (your name)
"""

from faker import Faker
import numpy as np
import pandas as pd
import random
import datetime as dt
import textwrap

# ------------------------------------------------------------------
# 1.  Initial setup
# ------------------------------------------------------------------
SEED = 42                       # Re-run with the same seed for identical output
random.seed(SEED)
np.random.seed(SEED)
fake = Faker("en_AU")           # Australian dates/spelling
Faker.seed(SEED)

# ------------------------------------------------------------------
# 2.  Vocabulary pools – edit to match your clinical context
# ------------------------------------------------------------------
WARDS = ["Cardiology", "Oncology", "Orthopaedics", "ICU"]
SHIFTS = ["AM", "PM", "Night"]
CONDITIONS = [
    "post-op knee replacement",
    "COPD exacerbation",
    "diabetic ketoacidosis",
    "sepsis",
    "stroke"
]
ACTIONS = [
    "administered IV antibiotics",
    "changed wound dressing",
    "mobilised with physiotherapist",
    "monitored blood glucose",
    "provided education on inhaler use"
]
VITALS = [
    "within normal range",
    "BP slightly elevated",
    "tachycardic overnight",
    "SpO₂ 92 % on room air"
]
PLANS = [
    "continue current antibiotics",
    "review analgesia",
    "chase blood culture results",
    "physio to assess mobility"
]

# ------------------------------------------------------------------
# 3.  Helper to build one patient paragraph
# ------------------------------------------------------------------
rng = np.random.default_rng(SEED)

def make_patient_block() -> str:
    """Return one paragraph describing a single patient."""
    alias  = f"P{rng.integers(1000, 1999)}"
    age    = rng.integers(25, 95)
    cond   = random.choice(CONDITIONS)
    action = random.choice(ACTIONS)
    vit    = random.choice(VITALS)
    plan   = random.choice(PLANS)

    return (
        f"{alias} ({age} y o) admitted with {cond}. "
        f"During shift, {action}. "
        f"Vitals {vit}. "
        f"Plan: {plan}."
    )

# ------------------------------------------------------------------
# 4.  Build an entire note (header + 1-n patient blocks + footer)
# ------------------------------------------------------------------
def generate_note(note_id: int) -> str:
    date   = fake.date_between(start_date="-14d", end_date="today")
    ward   = random.choice(WARDS)
    shift  = random.choice(SHIFTS)
    nurse  = fake.name()

    n_patients = max(1, rng.poisson(2))   # mean ≈ 2 patients per nurse
    patient_sections = "\n".join(make_patient_block() for _ in range(n_patients))

    footer = f"– {nurse.split()[0]} (RN)  {dt.datetime.now().strftime('%H:%M')}"

    note = textwrap.dedent(f"""
        ### Nurse Shift Report {note_id}
        Date: {date}    Shift: {shift}    Ward: {ward}
        ------------------------------------------------------------
        {patient_sections}
        ------------------------------------------------------------
        {footer}
    """).strip()

    return note

# ------------------------------------------------------------------
# 5.  Build a whole dataset and save to disk
# ------------------------------------------------------------------
def build_dataset(n_notes: int = 500,
                  out_path: str = "synthetic_nurse_notes.csv") -> None:
    """Generate *n_notes* reports and write them to *out_path*."""
    notes = [generate_note(i + 1) for i in range(n_notes)]
    df = pd.DataFrame({"note_id": range(1, n_notes + 1), "text": notes})
    df.to_csv(out_path, index=False)
    print(f"✔  Saved {n_notes} notes to '{out_path}'")

# ------------------------------------------------------------------
# 6.  Script entry-point
# ------------------------------------------------------------------
if __name__ == "__main__":
    build_dataset(500, "synthetic_nurse_notes.csv")   # <-- adjust here if needed


✔  Saved 500 notes to 'synthetic_nurse_notes.csv'


In [6]:
import pandas as pd

df = pd.read_csv("synthetic_nurse_notes.csv")    # ← points to the file you saved
print("\n=== Preview of first 20 synthetic notes ===")
print(df.head(20).to_string(index=False))



=== Preview of first 20 synthetic notes ===
 note_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [7]:
import pandas as pd
import re

df = pd.read_csv("synthetic_nurse_notes.csv")

# pull the YYYY-MM-DD that appears right after 'Date:'
df['date'] = pd.to_datetime(
    df['text'].str.extract(r'Date:\s*([\d-]+)')[0],   # first capture group
    format='%Y-%m-%d'
)

# now sort by date (and maybe by shift too, if you like)
df_sorted = df.sort_values('date', ascending=True)  # newest last

# preview
print("\n=== First 20 notes in DATE order ===")
print(df_sorted[['note_id', 'date']].head(20).to_string(index=False))



=== First 20 notes in DATE order ===
 note_id       date
     296 2025-05-02
     466 2025-05-02
     240 2025-05-02
     313 2025-05-02
     123 2025-05-02
     125 2025-05-02
      26 2025-05-02
     481 2025-05-02
     341 2025-05-02
      32 2025-05-02
     487 2025-05-02
     164 2025-05-02
      35 2025-05-02
     348 2025-05-02
     294 2025-05-02
     147 2025-05-02
     117 2025-05-02
     491 2025-05-02
     120 2025-05-02
     427 2025-05-02


In [8]:
for _, row in df_sorted.head(5).iterrows():   # change 5 to however many you want
    print("\n" + "="*80)
    print(row['text'])



### Nurse Shift Report 296
Date: 2025-05-02    Shift: AM    Ward: ICU
------------------------------------------------------------
P1327 (62 y o) admitted with stroke. During shift, provided education on inhaler use. Vitals within normal range. Plan: physio to assess mobility.
------------------------------------------------------------
– Robert (RN)  12:49

### Nurse Shift Report 466
Date: 2025-05-02    Shift: Night    Ward: Cardiology
------------------------------------------------------------
P1451 (73 y o) admitted with diabetic ketoacidosis. During shift, changed wound dressing. Vitals SpO₂ 92 % on room air. Plan: physio to assess mobility.
------------------------------------------------------------
– Eric (RN)  12:49

### Nurse Shift Report 240
        Date: 2025-05-02    Shift: PM    Ward: Oncology
        ------------------------------------------------------------
        P1414 (39 y o) admitted with diabetic ketoacidosis. During shift, monitored blood glucose. Vitals tachy