In [1]:
import argparse
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

def parse_args():
    """Build the command-line interface and return the parsed options.

    Arguments the parser does not recognise are ignored instead of causing
    an error, because ``parse_known_args`` is used and the leftover list is
    discarded (presumably so the script also runs inside a notebook kernel,
    which passes its own argv entries).

    Returns:
        argparse.Namespace with ``patients``, ``start_date``, ``src``,
        ``out`` and ``seed``.
    """
    parser = argparse.ArgumentParser(
        description="Strict GMAlerts synthetic data generator"
    )
    # (flag, type, default, help) for every supported option.
    option_specs = (
        ("--patients", int, 50, "Number of patients"),
        ("--start-date", str, None, "YYYY-MM-DD (default: today @ 00:00)"),
        ("--src", str, "GMAlertsDataset.csv", "Source CSV for column order/types"),
        ("--out", str, "Synthetic_Output.csv", "Output CSV"),
        ("--seed", int, 42, "Random seed"),
    )
    for flag, kind, fallback, blurb in option_specs:
        parser.add_argument(flag, type=kind, default=fallback, help=blurb)

    known, _unknown = parser.parse_known_args()
    return known

def compute_start_date(arg):
    """Return the generation start as a naive datetime at midnight.

    Args:
        arg: A ``YYYY-MM-DD`` string, or a falsy value (``None`` / ``""``)
            to use the current local date.

    Returns:
        datetime with hour, minute, second and microsecond all set to zero.

    Raises:
        ValueError: if ``arg`` is non-empty and does not match ``%Y-%m-%d``.
    """
    moment = datetime.strptime(arg, "%Y-%m-%d") if arg else datetime.now()
    # Truncate to the start of the day in both cases.
    return moment.replace(hour=0, minute=0, second=0, microsecond=0)

def main():
    """Generate a synthetic patient-observation CSV shaped like the source file.

    Steps:
      1. Parse CLI options and seed the random generators.
      2. Read ``--src`` only to learn the output column names and order.
      3. Assign each patient a risk level (roughly one third each of
         low / medium / high, shuffled) and place patient ``i`` on day
         ``start + i``.
      4. For every patient emit one row per six-hour observation window
         (starting 00:00, 06:00, 12:00, 18:00) with risk-dependent vitals
         and activity values plus randomly chosen text fields.
      5. Write the rows to ``--out`` and print a short summary.

    Columns present in the source file but not populated here stay as empty
    strings; generated keys that are not source columns are left out of the
    output because the DataFrame is built with ``columns=src_cols``.

    SpO2 is capped at 100: the underlying normal draw is unbounded, and a
    saturation above 100 % is not a valid reading.
    """
    args = parse_args()
    rng = np.random.default_rng(args.seed)
    # The legacy global NumPy RNG is seeded as well; nothing below draws from
    # it, but the call is kept so the function's side effects are unchanged.
    np.random.seed(args.seed)

    src = pd.read_csv(args.src, dtype=str)
    src_cols = list(src.columns)
    base_start = compute_start_date(args.start_date)

    # Patient risk for a bit of variety; "high" absorbs the remainder when
    # the patient count is not divisible by three.
    thirds = args.patients // 3
    risks = (["low"] * thirds) + (["medium"] * thirds) + (["high"] * (args.patients - 2 * thirds))
    rng.shuffle(risks)

    # Per-risk parameters: means for the normally distributed vitals and
    # half-open [low, high) bounds for the uniformly drawn integer fields.
    profiles = {
        "low": {
            "heart_rate": 75, "spo2": 98, "temperature": 36.7,
            "systolic": 120, "diastolic": 80, "sleep": 7.2,
            "steps": (7000, 12000), "calories": (1800, 2600),
            "water": (1500, 3000), "meals_skipped": (0, 2),
            "exercise": (25, 60),
        },
        "medium": {
            "heart_rate": 90, "spo2": 95, "temperature": 37.5,
            "systolic": 135, "diastolic": 88, "sleep": 6,
            "steps": (3500, 7000), "calories": (1300, 1800),
            "water": (1000, 1800), "meals_skipped": (1, 3),
            "exercise": (10, 30),
        },
        "high": {
            "heart_rate": 110, "spo2": 90, "temperature": 38.2,
            "systolic": 150, "diastolic": 97, "sleep": 4.8,
            "steps": (500, 3500), "calories": (900, 1300),
            "water": (400, 1000), "meals_skipped": (2, 4),
            "exercise": (0, 15),
        },
    }

    # Constant vocabularies for the categorical / free-text columns.
    nursing_notes = [
        "Stable, no new complaints.",
        "Reports mild headache.",
        "Resting, mood normal.",
        "Observed sleeping well.",
        "No abnormal findings.",
        "Patient had a meal.",
    ]
    medications = [
        "None",
        "Paracetamol",
        "Aspirin",
        "Antibiotic",
        "Multivitamin",
        "Ibuprofen",
    ]
    behaviour_tags = ["calm", "anxious", "active", "restless", "cooperative"]
    emotion_tags = ["happy", "sad", "neutral", "frustrated", "calm", "tense"]
    clinical_summaries = [
        "No acute distress. Continue current plan.",
        "Vitals stable. Follow up in the morning.",
        "Mild elevation in blood pressure.",
        "Patient responding well to medications.",
        "Observe for further symptoms.",
    ]
    entities = [
        "headache; fatigue", "none", "fever", "cough; cold", "hypertension", "anxiety",
    ]
    baseline_stats = [
        "baseline normal", "elevated BP", "reduced sleep", "increased steps", "reduced appetite",
    ]
    states = ["active", "resting", "asleep", "alert", "discharged"]

    window_start_hours = [0, 6, 12, 18]
    rows = []
    for i in range(args.patients):
        pid = f"P{i+1:04d}"
        profile = profiles[risks[i]]
        day0 = base_start + timedelta(days=i)

        # Per-patient attributes, shared by all of that patient's rows.
        # NOTE: the order of rng calls below is significant; it keeps the
        # random stream (and thus the output for a given seed) stable.
        age = int(rng.integers(25, 85))
        gender = rng.choice(["Male", "Female"])
        bathroom_visits = int(rng.integers(2, 7))

        for hour in window_start_hours:
            start_ts = day0 + timedelta(hours=hour)
            end_ts = start_ts + timedelta(hours=6)
            row = {c: "" for c in src_cols}

            row["patientId"] = pid
            row["age"] = age
            row["gender"] = gender
            row["observationStart"] = start_ts.strftime("%Y-%m-%d %H:%M")
            row["observationEnd"] = end_ts.strftime("%Y-%m-%d %H:%M")
            row["nursingNote"] = rng.choice(nursing_notes)
            row["medications"] = rng.choice(medications)

            row["heartRate"] = int(rng.normal(profile["heart_rate"], 6))
            # Oxygen saturation is a percentage, so clamp the unbounded draw.
            row["spo2"] = min(100, int(rng.normal(profile["spo2"], 2)))
            row["temperature"] = round(rng.normal(profile["temperature"], 0.5), 1)
            systolic = int(rng.normal(profile["systolic"], 10))
            diastolic = int(rng.normal(profile["diastolic"], 5))
            row["bloodPressure"] = f"{systolic}/{diastolic}"

            row["stepsTaken"] = int(rng.integers(*profile["steps"]))
            row["calorieIntake"] = int(rng.integers(*profile["calories"]))
            row["sleepHours"] = round(rng.normal(profile["sleep"], 0.8), 1)
            row["waterIntakeMl"] = int(rng.integers(*profile["water"]))
            row["mealsSkipped"] = int(rng.integers(*profile["meals_skipped"]))
            row["exerciseMinutes"] = int(rng.integers(*profile["exercise"]))
            row["bathroomVisits"] = bathroom_visits

            row["behaviourTags"] = rng.choice(behaviour_tags)
            row["emotionTags"] = rng.choice(emotion_tags)
            row["clinicalSummary"] = rng.choice(clinical_summaries)
            row["entitiesExtracted"] = rng.choice(entities)
            row["baselineStats"] = rng.choice(baseline_stats)
            row["state"] = rng.choice(states)

            rows.append(row)

    # columns=src_cols fixes both the column set and its order to the source's.
    out = pd.DataFrame(rows, columns=src_cols)
    out.to_csv(args.out, index=False)
    print(f"Saved {len(out)} rows ({args.patients} × {len(window_start_hours)}) to: {args.out}")
    print("Start date:", base_start.strftime("%Y-%m-%d"),
          "(execution-based)" if args.start_date is None else "(overridden)")

# Run the generator only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Saved 200 rows (50 × 4) to: Synthetic_Output.csv
Start date: 2025-09-20 (execution-based)
