In [None]:
import numpy as np
import pandas as pd
import os
import random

import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import seaborn as sns

RNG_SEED = 42

def seed_all(seed: int = RNG_SEED) -> None:
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


In [None]:
seed_all()

Data002_Pat = pd.read_csv("/content/drive/MyDrive/Paper(2025Dec)_SimulatedCVD/Data002_PatientEMR_MasterSummary.csv")
Data002_Pat = Data002_Pat.drop(['Unnamed: 0'], axis = 1)

Data002_Chro = pd.read_csv("/content/drive/MyDrive/Paper(2025Dec)_SimulatedCVD/Data002_PatientEMR_ChronicDiseases.csv")
Data002_Chro = Data002_Chro.drop(['Unnamed: 0'], axis = 1)

Data002_Meas = pd.read_csv("/content/drive/MyDrive/Paper(2025Dec)_SimulatedCVD/Data002_PatientEMR_MeasAndPath.csv")
Data002_Meas = Data002_Meas.drop(['Unnamed: 0'], axis = 1)

In [None]:
import pandas as pd
import re

# -----------------------------
# 1. Select 500 smallest Patient_IDs
# -----------------------------
demo_ids = (
    Data002_Pat
    .sort_values("Patient_ID")
    .head(500)["Patient_ID"]
)

demo_pat = Data002_Pat[Data002_Pat["Patient_ID"].isin(demo_ids)].copy()
demo_chro = Data002_Chro[Data002_Chro["Patient_ID"].isin(demo_ids)].copy()

print("Demo cohort size:", demo_pat.shape[0])
print("Diagnosis rows:", demo_chro.shape[0])

# -----------------------------
# 2. Inspect raw diagnosis labels
# -----------------------------
raw_counts = (
    demo_chro["Category"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "Raw label", "Category": "Count"})
)

raw_counts.head(10)


Demo cohort size: 500
Diagnosis rows: 47


Unnamed: 0,Count,count
0,Diabetes,16
1,ICD9:250,9
2,T2DM,7
3,ICD10: E11,6
4,High blood sugar,4
5,Chronic kidney disease,1
6,Atrial fibrillation,1
7,A-fib,1
8,CKD,1
9,AF,1


In [None]:
# -----------------------------
# 3. Explicit disease vocabularies
# -----------------------------
DIABETES_TERMS = [
    "diabetes",
    "t2dm",
    "icd9:250",
    "icd10: e11",
    "high blood sugar"
]

CKD_TERMS = [
    "chronic kidney disease",
    "ckd"
]

AF_TERMS = [
    "atrial fibrillation",
    "a-fib",
    "af"
]

def classify_disease(label):
    t = str(label).lower()
    if any(term in t for term in DIABETES_TERMS):
        return "Diabetes"
    if any(term in t for term in CKD_TERMS):
        return "CKD"
    if any(term in t for term in AF_TERMS):
        return "AF"
    return None

demo_chro["Disease"] = demo_chro["Category"].apply(classify_disease)

# Keep only recognised diseases
demo_chro = demo_chro.dropna(subset=["Disease"])


In [10]:
# -----------------------------
# 4. Collapse to patient-level flags (incorrect version)
# -----------------------------
patient_disease_flags = (
    demo_chro
    .groupby(["Patient_ID", "Disease"])
    .size()
    .unstack(fill_value=0)
)

# Convert counts → presence/absence
patient_disease_flags = (patient_disease_flags > 0).astype(int)

# Prevalence (% of patients)
prevalence = (
    patient_disease_flags
    .mean()
    .mul(100)
    .round(1)
    .rename("Prevalence (%)")
)


In [None]:
prevalence

Unnamed: 0_level_0,Prevalence (%)
Disease,Unnamed: 1_level_1
AF,6.5
CKD,4.3
Diabetes,91.3


In [11]:
# demo_ids must be the 500 smallest Patient_IDs selected earlier
# demo_chro must be filtered to those demo_ids and have a 'Disease' column

# -----------------------------
# 4. Collapse to patient-level flags (CORRECT DENOMINATOR = 500)
# -----------------------------
patient_disease_flags = (
    demo_chro
    .groupby(["Patient_ID", "Disease"])
    .size()
    .unstack(fill_value=0)
)

# Convert counts → presence/absence
patient_disease_flags = (patient_disease_flags > 0).astype(int)

# >>> Critical fix: reindex to include ALL 500 patients (including those with zero rows)
patient_disease_flags = patient_disease_flags.reindex(demo_ids.values, fill_value=0)
patient_disease_flags.index.name = "Patient_ID"

# Prevalence (% of the 500 patients)
prevalence = (
    patient_disease_flags
    .mean()
    .mul(100)
    .round(1)
    .rename("Prevalence (%)")
)

prevalence


Unnamed: 0_level_0,Prevalence (%)
Disease,Unnamed: 1_level_1
AF,0.6
CKD,0.4
Diabetes,8.4
