In [6]:
import pandas as pd
import numpy as np

# Load original dataset (~93k rows)
df = pd.read_parquet("diabetes-v2/data/predictions.parquet")

# Probabilistic hospital distribution
locations = {
    "Urban Medical Center": 0.30,
    "Regional Health System": 0.25,
    "Community Hospital": 0.20,
    "Specialty Diabetes Institute": 0.15,
    "Veterans Medical Facility": 0.10,
}

departments_by_location = {
    "Urban Medical Center": ["Endocrinology", "Cardiology", "Nephrology", "Emergency Department", "Geriatrics"],
    "Regional Health System": ["Internal Medicine", "Endocrinology", "Podiatry", "Pharmacy"],
    "Community Hospital": ["Internal Medicine", "Endocrinology", "Cardiology"],
    "Specialty Diabetes Institute": ["Endocrinology", "Nutrition Services", "Pharmacy"],
    "Veterans Medical Facility": ["Geriatrics", "Cardiology", "Pharmacy"],
}

specialties_by_department = {
    "Endocrinology": ["Insulin Management", "Glycemic Control", "Diabetes Education"],
    "Internal Medicine": ["Chronic Kidney Disease", "Post-Discharge Follow-Up"],
    "Cardiology": ["Cardiac Complications", "Peripheral Artery Disease"],
    "Nephrology": ["Diabetic Nephropathy", "Kidney Function Monitoring"],
    "Geriatrics": ["Elderly Diabetes Care", "Wound Healing"],
    "Nutrition Services": ["Inpatient Nutrition"],
    "Pharmacy": ["Medication Safety", "Hypoglycemia Monitoring"],
    "Emergency Department": ["Trauma Response", "Acute Complications"],
    "Podiatry": ["Diabetic Foot Care", "Wound Healing"],
}

# Irrelevant specialties for new appended rows (will be filtered out)
extra_specialties = [
    "Dermatology – Skin Allergy", "Ophthalmology – Vision Correction", "ENT – Sinus Surgery",
    "Orthopedics – Sports Injury", "Rheumatology – Joint Pain", "Pulmonology – Asthma Care",
    "Psychiatry – Depression Treatment", "Oncology – Chemotherapy", "Urology – Kidney Stones",
    "Obstetrics & Gynecology – Prenatal Care", "Infectious Disease – Travel Vaccines",
    "Hematology – Blood Disorders", "Surgery – Appendectomy", "Plastic Surgery – Cosmetic Procedures",
    "Neurology – Seizure Management", "Immunology – Allergy Testing", "Radiology – Imaging",
    "Anesthesiology – Pain Management", "Dentistry – Oral Surgery", "Speech Therapy – Language Disorders",
]

# === Assign hierarchy to existing rows (same as before) ===
rng = np.random.default_rng(seed=42)
location_choices = rng.choice(list(locations.keys()), size=len(df), p=list(locations.values()))

departments, specialties = [], []
for loc, age in zip(location_choices, df["age"]):
    dep = rng.choice(departments_by_location[loc])
    spec_options = specialties_by_department.get(dep, ["General Diabetes Care"])

    # Apply age rules (unchanged)
    if age.strip() in ["[0-10)", "[10-20)"]:
        spec_options = [s for s in spec_options if "Pediatric" in s or "Child" in s] or ["Pediatric Diabetes Care"]
    elif age.strip() in ["[70-80)", "[80-90)", "[90-100)"]:
        spec_options = [s for s in spec_options if "Elderly" in s or "Geri" in s] or ["Geriatric Diabetes Care"]

    departments.append(dep)
    specialties.append(rng.choice(spec_options))

df["location"] = location_choices
df["department"] = departments
df["specialty"] = specialties

# === Append NEW rows with irrelevant specialties ===
n_extra = int(0.05 * len(df))  # e.g., 5% additional rows
extra_df = df.sample(n=n_extra, random_state=42).copy()

# Assign irrelevant specialties to these new rows
extra_df["location"] = rng.choice(list(locations.keys()), size=n_extra)
extra_df["department"] = "Endocrinology"  # reuse a valid dept
extra_df["specialty"] = rng.choice(extra_specialties, size=n_extra)
# Ensure appended rows have unique IDs
extra_df["encounter_id"] = [f"X{i}" for i in range(len(df), len(df) + len(extra_df))]
extra_df["patient_nbr"] = [f"PX{i}" for i in range(len(df), len(df) + len(extra_df))]

# Combine original + new noise rows
df_augmented = pd.concat([df, extra_df], ignore_index=True)

df_augmented.to_parquet("predictionsHierarchy.parquet")


In [7]:
df_augmented["specialty"].value_counts()

specialty
Peripheral Artery Disease                  7958
Cardiac Complications                      7905
Glycemic Control                           7886
Diabetes Education                         7806
Insulin Management                         7690
Wound Healing                              7630
Hypoglycemia Monitoring                    7117
Medication Safety                          7073
Post-Discharge Follow-Up                   6516
Chronic Kidney Disease                     6510
Inpatient Nutrition                        4884
Elderly Diabetes Care                      4635
Diabetic Foot Care                         3104
Acute Complications                        3013
Trauma Response                            2936
Kidney Function Monitoring                 2935
Diabetic Nephropathy                       2892
Pediatric Diabetes Care                     850
Pulmonology – Asthma Care                   284
Surgery – Appendectomy                      263
Oncology – Chemotherapy       