In [15]:
import numpy as np
import pandas as pd
import random
from datetime import timedelta, date


In [16]:
# ----------------------------
# Config
# ----------------------------
# Cohort size and sampling schedule.
NUM_PATIENTS = 500                  # number of synthetic patients to generate
SAMPLES_PER_PATIENT = 12            # records generated per patient
START_DATE = date(2024, 2, 15)      # date stamped on each patient's first record

# Demographics.
MIN_AGE = 16
MAX_AGE = 65
GENDER_DIST = dict(Male=0.5, Female=0.5)   # sampling weight per gender label

# Data-quality switch: set True to inject occasional blanks/typos.
MESSY_MODE = False


In [17]:
# ----------------------------
# Clinical ranges
# ----------------------------
# Body temperature bands in degrees Celsius, each as (low, high).
TEMP_NORMAL_C = (36.5, 37.2)
TEMP_FEVER_C = (38.0, 41.0)
TEMP_HYPOTHERMIA_C = (32.0, 35.0)

# Baseline blood pressure bounds, each as (low, high).
BP_NORMAL_SYSTOLIC = (90, 120)
BP_NORMAL_DIASTOLIC = (60, 80)

# Baseline heart rate (bpm) and respiration rate (breaths/min) bounds.
HR_RANGE = (60, 100)
RR_RANGE = (12, 16)

# ECG paper speed; emitted unchanged on every record as metadata.
ECG_PAPER_SPEED_MM_PER_SEC = 25

# SpO2 bands: name -> (low %, high %, sampling weight).
SPO2_BANDS = dict(
    normal=(95, 100, 0.85),
    mild=(90, 94, 0.12),
    low=(80, 89, 0.03),
)

# Height bounds in centimetres per gender, as (low, high).
HEIGHT_RANGE_CM = dict(
    Male=(160, 195),
    Female=(150, 185),
)

# BMI category -> (low, high); None marks an open end that the sampler
# replaces with its own fallback bound.
BMI_CATEGORIES = dict(
    Underweight=(None, 18.5),
    Normal=(18.5, 24.9),
    Overweight=(25, 29.9),
    Obese=(30, None),
)

# Per-gender sampling weights, positionally aligned with the key order of
# BMI_CATEGORIES (Underweight, Normal, Overweight, Obese).
BMI_WEIGHTS = dict(
    Male=(0.08, 0.62, 0.20, 0.10),
    Female=(0.12, 0.58, 0.20, 0.10),
)

ACTIVITY_LEVELS = [
    "Sedentary",
    "Active",
    "Highly_Active",
]


In [18]:
# ----------------------------
# Helpers
# ----------------------------
def choose_gender():
    """Draw one gender label, weighted by the probabilities in GENDER_DIST."""
    labels = list(GENDER_DIST)
    probabilities = [GENDER_DIST[label] for label in labels]
    (picked,) = random.choices(labels, weights=probabilities)
    return picked

def generate_age():
    """Return a uniformly drawn integer age between MIN_AGE and MAX_AGE, inclusive."""
    youngest, oldest = MIN_AGE, MAX_AGE
    return random.randint(youngest, oldest)

def generate_temperature(age):
    """Simulate a body temperature in degrees Celsius, rounded to one decimal.

    A status is drawn first (85% normal, 10% fever, 5% hypothermia) and the
    temperature is then sampled uniformly from that status's range.

    Args:
        age: Accepted for signature symmetry with the other generators; it is
            not used by the current implementation.

    Returns:
        float: The simulated temperature.
    """
    range_for_status = {
        'normal': TEMP_NORMAL_C,
        'fever': TEMP_FEVER_C,
        'hypothermia': TEMP_HYPOTHERMIA_C,
    }
    statuses = ['normal', 'fever', 'hypothermia']
    (status,) = random.choices(statuses, weights=[0.85, 0.10, 0.05])
    low, high = range_for_status[status]
    return round(random.uniform(low, high), 1)

def generate_blood_pressure(age, gender, week_num):
    """Simulate one blood-pressure reading formatted as "systolic/diastolic".

    A baseline is drawn from the normal systolic/diastolic ranges and then
    shifted by fixed offsets for sex, age and the sample's position in the
    series.

    Args:
        age: Patient age in years.
        gender: "Male" or "Female".
        week_num: Index of the sample within the patient's series.

    Returns:
        str: The reading, e.g. "118/76".
    """
    base_systolic = random.randint(*BP_NORMAL_SYSTOLIC)
    base_diastolic = random.randint(*BP_NORMAL_DIASTOLIC)

    systolic_shift = 0
    diastolic_shift = 0

    # Sex difference: men under 60 and women aged 60+ get the same small bump.
    younger_male = gender == "Male" and age < 60
    older_female = gender == "Female" and age >= 60
    if younger_male or older_female:
        systolic_shift += 2
        diastolic_shift += 1

    # Age effect.
    if age > 50:
        systolic_shift += 8
        diastolic_shift += 4

    # Trend: later samples drift upward, systolic by the sample index itself.
    if week_num > 6:
        systolic_shift += week_num
        diastolic_shift += 2

    systolic = base_systolic + systolic_shift
    diastolic = base_diastolic + diastolic_shift
    return f"{systolic}/{diastolic}"

def parse_bp(bp_str):
    """Split a "systolic/diastolic" string into a pair of ints.

    Raises:
        ValueError: If the string does not contain exactly one '/' or either
            side is not an integer.
    """
    systolic_text, diastolic_text = bp_str.split('/')
    systolic = int(systolic_text)
    diastolic = int(diastolic_text)
    return systolic, diastolic

def generate_height_weight(gender):
    """Sample a height, then derive a weight from a randomly drawn BMI.

    The BMI category is picked with the gender-specific BMI_WEIGHTS, a BMI is
    drawn uniformly inside that category (open ends fall back to 15.0 / 40.0),
    and the weight is back-computed from BMI and height.

    Args:
        gender: Key into HEIGHT_RANGE_CM and BMI_WEIGHTS.

    Returns:
        tuple: (height_cm, weight_kg rounded to 1 decimal, bmi rounded to 1 decimal).
    """
    shortest, tallest = HEIGHT_RANGE_CM[gender]
    height_cm = random.randint(shortest, tallest)

    category_names = list(BMI_CATEGORIES)
    (category,) = random.choices(category_names, weights=BMI_WEIGHTS[gender])

    bmi_low, bmi_high = BMI_CATEGORIES[category]
    if bmi_low is None:
        bmi_low = 15.0
    if bmi_high is None:
        bmi_high = 40.0

    bmi = random.uniform(bmi_low, bmi_high)
    height_m = height_cm / 100
    weight_kg = bmi * height_m ** 2
    return height_cm, round(weight_kg, 1), round(bmi, 1)

def calculate_bmi(weight_kg, height_cm):
    """Return body-mass index (kg / m^2) rounded to one decimal place."""
    height_m = height_cm / 100
    bmi = weight_kg / height_m ** 2
    return round(bmi, 1)

def generate_spo2_percent():
    """Sample an integer SpO2 percentage.

    A band is chosen using the weights stored in SPO2_BANDS, then a value is
    drawn uniformly from that band's inclusive (low, high) range.
    """
    band_specs = list(SPO2_BANDS.values())
    ranges = [(low, high) for low, high, _weight in band_specs]
    probabilities = [weight for _low, _high, weight in band_specs]
    (chosen,) = random.choices(ranges, weights=probabilities)
    low, high = chosen
    return random.randint(low, high)

def generate_ecg_speed():
    """Return the fixed ECG paper speed (the ECG_PAPER_SPEED_MM_PER_SEC constant)."""
    paper_speed = ECG_PAPER_SPEED_MM_PER_SEC
    return paper_speed

def generate_activity_level(age, gender):
    """Pick an activity level with age- and sex-dependent probabilities.

    Args:
        age: Patient age in years; under 30 and over 55 use their own base mix.
        gender: "Male" shifts 0.02 of weight from Sedentary to Highly_Active.

    Returns:
        str: One entry of ACTIVITY_LEVELS.
    """
    # Base shares, in ACTIVITY_LEVELS order: Sedentary, Active, Highly_Active.
    if age < 30:
        sedentary, active, highly_active = 0.40, 0.40, 0.20
    elif age > 55:
        sedentary, active, highly_active = 0.60, 0.32, 0.08
    else:
        sedentary, active, highly_active = 0.50, 0.35, 0.15

    if gender == "Male":
        sedentary = sedentary - 0.02
        highly_active = highly_active + 0.02

    shares = [sedentary, active, highly_active]
    total = sum(shares)
    # Normalise, keeping every level at a floor of 0.01.
    weights = [max(0.01, share / total) for share in shares]
    (level,) = random.choices(ACTIVITY_LEVELS, weights=weights)
    return level

def generate_step_count(activity, age, gender):
    """Sample a non-negative integer step count.

    The count is drawn from a normal distribution whose mean and spread depend
    on the activity level. The mean drops by 40 steps per year of age above 30
    and rises by 250 for males. Negative draws are floored at zero.

    Args:
        activity: "Sedentary", "Active", or anything else (treated as the
            most active tier).
        age: Patient age in years.
        gender: "Male" adds the sex offset; any other value adds nothing.

    Returns:
        int: Simulated step count, >= 0.
    """
    if activity == "Sedentary":
        mean_steps, spread = 3000, 1200
    elif activity == "Active":
        mean_steps, spread = 7000, 2500
    else:
        mean_steps, spread = 12000, 3500

    years_over_30 = max(0, age - 30)
    age_penalty = -40 * years_over_30
    sex_delta = 250 if gender == "Male" else 0

    drawn = random.gauss(mean_steps + age_penalty + sex_delta, spread)
    return int(max(0, drawn))

def generate_sleep_hours(activity, age, stress_hint=None):
    """Sample a sleep duration in hours, clipped to [3.5, 10.0], 3 decimals.

    The expected value starts at 7.2 h, shifts +0.2 for "Highly_Active" and
    -0.2 for "Sedentary", drops 0.15 above age 55, and drops a further 0.12
    per stress point above 5 when a stress hint is supplied.

    Args:
        activity: Activity level label.
        age: Patient age in years.
        stress_hint: Optional numeric stress estimate; ignored when None.

    Returns:
        float: Simulated hours of sleep.
    """
    if activity == "Highly_Active":
        activity_shift = 0.2
    elif activity == "Sedentary":
        activity_shift = -0.2
    else:
        activity_shift = 0.0

    expected = 7.2 + activity_shift
    if age > 55:
        expected -= 0.15
    if stress_hint is not None:
        expected -= 0.12 * (stress_hint - 5)

    drawn = random.gauss(expected, 0.8)
    bounded = np.clip(drawn, 3.5, 10.0)
    return round(float(bounded), 3)

def generate_stress_level(steps, sleep, age):
    """Sample an integer stress score between 1 and 10.

    The expected score starts at 5.0, falls with step count and with sleep
    beyond 7 hours, and rises by 0.4 above age 55. A normal draw (sd 1.1)
    around that mean is clipped to [1, 10] and rounded.

    Args:
        steps: Step count for the sample.
        sleep: Sleep duration in hours.
        age: Patient age in years.

    Returns:
        int: Simulated stress level.
    """
    step_term = -0.00015 * steps
    sleep_term = -0.35 * (sleep - 7.0)
    age_term = 0.4 if age > 55 else 0.0
    mean = 5.0 + step_term + sleep_term + age_term

    drawn = random.gauss(mean, 1.1)
    bounded = np.clip(drawn, 1, 10)
    return int(round(bounded))

def generate_heart_rate(age, gender, activity, stress):
    """Sample a heart rate in bpm, clipped to [45, 200].

    Starts from a uniform draw over HR_RANGE and applies random integer
    adjustments: up for females, down for the highly active, up above age 50,
    and up by at most (stress - 5) when stress exceeds 5.

    Args:
        age: Patient age in years.
        gender: "Female" triggers the sex adjustment.
        activity: "Highly_Active" triggers the fitness adjustment.
        stress: Integer stress score.

    Returns:
        int: Simulated heart rate.
    """
    resting_low, resting_high = HR_RANGE
    bpm = random.randint(resting_low, resting_high)

    if gender == "Female":
        bpm = bpm + random.randint(2, 5)
    if activity == "Highly_Active":
        bpm = bpm - random.randint(3, 8)
    if age > 50:
        bpm = bpm + random.randint(0, 5)

    # Always drawn, even when the ceiling is 0, so the RNG stream is stable.
    stress_ceiling = max(0, stress - 5)
    bpm = bpm + random.randint(0, stress_ceiling)

    return int(np.clip(bpm, 45, 200))

def generate_respiration_rate(age, gender, activity):
    """Sample a respiration rate in breaths/min, clipped to [10, 24].

    Starts from a uniform draw over RR_RANGE, then applies coin-flip
    adjustments of one breath: up for females, down for the highly active,
    and up above age 60.

    Returns:
        int: Simulated respiration rate.
    """
    coin = [0, 1]
    low, high = RR_RANGE
    breaths = random.randint(low, high)

    if gender == "Female":
        breaths = breaths + random.choice(coin)
    if activity == "Highly_Active":
        breaths = breaths - random.choice(coin)
    if age > 60:
        breaths = breaths + random.choice(coin)

    return int(np.clip(breaths, 10, 24))

def rr_interval_ms_from_hr(hr_bpm, jitter_ms=20):
    """Convert a heart rate to an RR interval in milliseconds, with jitter.

    Args:
        hr_bpm: Heart rate in beats per minute; values below 1 are treated as 1.
        jitter_ms: Half-width of the uniform integer jitter added to the
            nominal interval.

    Returns:
        int: The interval in ms, never below 300.
    """
    effective_bpm = max(hr_bpm, 1)
    nominal_ms = 60000.0 / effective_bpm
    noise_ms = random.randint(-jitter_ms, jitter_ms)
    return int(max(300, nominal_ms + noise_ms))

def determine_risk_level(bmi, bp_str, age):
    """Classify a sample into a coarse risk tier from BMI, blood pressure and age.

    The BMI cut-offs are inclusive (>= 30, >= 25) so that they line up with
    the lower bounds of the "Obese" and "Overweight" entries in BMI_CATEGORIES
    and with the inclusive blood-pressure cut-offs. With strict comparisons, a
    patient whose rounded BMI is exactly 30.0 or 25.0 was graded one tier too
    low.

    Args:
        bmi: Body-mass index.
        bp_str: Blood pressure as a "systolic/diastolic" string.
        age: Patient age in years.

    Returns:
        str: "High Risk for CVD", "Moderate Risk for CVD or Diabetes",
        or "Low Risk".

    Raises:
        ValueError: If bp_str cannot be parsed by parse_bp.
    """
    systolic, diastolic = parse_bp(bp_str)

    if bmi >= 30 or systolic >= 140 or diastolic >= 90 or age > 60:
        return "High Risk for CVD"
    if bmi >= 25 or systolic >= 120 or diastolic >= 80 or age > 50:
        return "Moderate Risk for CVD or Diabetes"
    return "Low Risk"

def simulate_treatment_action(risk_level):
    """Map a risk tier to an action: medication for high/moderate risk, else monitor."""
    medicated_levels = {"High Risk for CVD", "Moderate Risk for CVD or Diabetes"}
    if risk_level in medicated_levels:
        return "Administer Medication"
    return "Monitor"

def maybe_make_messy(value, kind="num"):
    """Optionally corrupt a value to mimic dirty real-world data.

    Does nothing unless MESSY_MODE is true. Otherwise a single uniform roll
    decides the outcome:
      * roll < 0.03: the value becomes an empty string (any kind);
      * 0.03 <= roll < 0.05 and kind == "num": the value becomes "ERROR";
      * 0.03 <= roll < 0.06 and kind == "activity": the value is replaced by
        a random entry from a small list of label variants. That list also
        contains the valid label "Highly_Active".

    Args:
        value: The clean value.
        kind: "num" or "activity"; selects which corruption applies.

    Returns:
        The original value, or its corrupted replacement.
    """
    if not MESSY_MODE:
        return value

    roll = random.random()
    if roll < 0.03:
        return ""           # blank
    if kind == "num" and roll < 0.05:
        return "ERROR"
    if kind == "activity" and roll < 0.06:
        variants = ["Actve", "Seddentary", "Highly Active", "Highly_Active"]
        return random.choice(variants)
    return value



In [19]:
# ----------------------------
# Data generation
# ----------------------------
# Seed the stdlib RNG at the top of this cell so that every run of the cell
# (including Restart & Run All) regenerates the same dataset. All sampling
# helpers draw from the `random` module; NumPy is only used for clipping.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

records = []
for patient_id in range(1, NUM_PATIENTS + 1):
    # Per-patient attributes: drawn once and repeated on every record.
    gender = choose_gender()
    age = generate_age()
    height_cm, weight_kg, bmi = generate_height_weight(gender)

    # Every patient's series starts on START_DATE and advances one week per sample.
    base_date = START_DATE

    for i in range(SAMPLES_PER_PATIENT):
        # Lifestyle signals are chained: activity -> steps -> sleep -> stress.
        activity = generate_activity_level(age, gender)
        steps = generate_step_count(activity, age, gender)
        # Preliminary stress estimate (no sleep term yet) used to bias sleep.
        stress_hint = 5.0 + (-0.00015 * steps) + (0.4 if age > 55 else 0.0)
        sleep_h = generate_sleep_hours(activity, age, stress_hint=stress_hint)
        stress = generate_stress_level(steps, sleep_h, age)

        # Vitals; the sample index drives the blood-pressure trend.
        bp = generate_blood_pressure(age, gender, i)
        temp_c = generate_temperature(age)
        spo2 = generate_spo2_percent()
        hr = generate_heart_rate(age, gender, activity, stress)
        rr = generate_respiration_rate(age, gender, activity)
        ecg_speed = generate_ecg_speed()
        ecg_rr_ms = rr_interval_ms_from_hr(hr)

        # Labels are computed from the clean values, before any messy-mode
        # corruption is applied to the emitted fields below.
        risk_level = determine_risk_level(bmi, bp, age)
        action = simulate_treatment_action(risk_level)

        rec = {
            "Patient ID": f"PAT_{patient_id:05d}",
            "Gender": gender,
            "Date": base_date.strftime('%Y-%m-%d'),
            "Age": age,
            "Temperature_C": maybe_make_messy(round(temp_c, 1), "num"),
            "Blood Pressure_mmHg": maybe_make_messy(bp, "num"),
            "Heart Rate_bpm": maybe_make_messy(hr, "num"),
            "Respiration Rate_brpm": maybe_make_messy(rr, "num"),  # breaths/min
            "SpO2_percent": maybe_make_messy(spo2, "num"),
            "ECG_PaperSpeed_mm_per_s": ecg_speed,
            "ECG_RR_interval_ms": ecg_rr_ms,
            "Height_cm": height_cm,
            "Weight_kg": weight_kg,
            "BMI": bmi,
            "Activity Level": maybe_make_messy(activity, "activity"),
            "Step Count": maybe_make_messy(steps, "num"),
            "Sleep Duration_hours": maybe_make_messy(sleep_h, "num"),
            "Stress Level": maybe_make_messy(stress, "num"),
            "Risk Level": risk_level,
            "Action": action
        }

        records.append(rec)
        base_date += timedelta(days=7)


In [20]:
# Build the frame in one expression; the extra column is an alias of
# SpO2_percent, kept in case that header is needed downstream.
df = pd.DataFrame(records).assign(
    **{"Blood Oxygen Level (%)": lambda frame: frame["SpO2_percent"]}
)
df.to_csv('medical_dataset.csv', index=False)
df.head(20)

Unnamed: 0,Patient ID,Gender,Date,Age,Temperature_C,Blood Pressure_mmHg,Heart Rate_bpm,Respiration Rate_brpm,SpO2_percent,ECG_PaperSpeed_mm_per_s,...,Height_cm,Weight_kg,BMI,Activity Level,Step Count,Sleep Duration_hours,Stress Level,Risk Level,Action,Blood Oxygen Level (%)
0,PAT_00001,Female,2024-02-15,44,37.1,117/69,81,15,87,25,...,175,90.0,29.4,Sedentary,1720,7.056,5,Moderate Risk for CVD or Diabetes,Administer Medication,87
1,PAT_00001,Female,2024-02-22,44,36.8,102/72,63,12,97,25,...,175,90.0,29.4,Highly_Active,11813,6.923,4,Moderate Risk for CVD or Diabetes,Administer Medication,97
2,PAT_00001,Female,2024-02-29,44,36.6,118/68,87,13,96,25,...,175,90.0,29.4,Sedentary,4340,7.118,4,Moderate Risk for CVD or Diabetes,Administer Medication,96
3,PAT_00001,Female,2024-03-07,44,37.1,115/70,77,14,100,25,...,175,90.0,29.4,Sedentary,1840,6.876,3,Moderate Risk for CVD or Diabetes,Administer Medication,100
4,PAT_00001,Female,2024-03-14,44,36.5,105/71,88,16,97,25,...,175,90.0,29.4,Active,3594,8.794,4,Moderate Risk for CVD or Diabetes,Administer Medication,97
5,PAT_00001,Female,2024-03-21,44,36.8,93/75,89,13,93,25,...,175,90.0,29.4,Active,2831,7.983,4,Moderate Risk for CVD or Diabetes,Administer Medication,93
6,PAT_00001,Female,2024-03-28,44,36.9,96/80,95,13,90,25,...,175,90.0,29.4,Sedentary,1907,7.234,6,Moderate Risk for CVD or Diabetes,Administer Medication,90
7,PAT_00001,Female,2024-04-04,44,36.9,118/67,70,15,95,25,...,175,90.0,29.4,Active,6407,6.849,5,Moderate Risk for CVD or Diabetes,Administer Medication,95
8,PAT_00001,Female,2024-04-11,44,37.0,102/67,69,14,96,25,...,175,90.0,29.4,Sedentary,2256,5.787,6,Moderate Risk for CVD or Diabetes,Administer Medication,96
9,PAT_00001,Female,2024-04-18,44,39.7,114/68,68,14,95,25,...,175,90.0,29.4,Highly_Active,8070,7.056,4,Moderate Risk for CVD or Diabetes,Administer Medication,95
