In [4]:
# ==============================
# Realistic Synthetic Patient Bed Dataset
# ==============================

import numpy as np
import pandas as pd
import random

np.random.seed(42)

N = 5000

def clip(v, low, high):
    return max(low, min(v, high))

def generate_condition(o2, hr, temp):
    if o2 < 88 or hr > 125 or temp > 39.5:
        return "critical"
    elif o2 < 94 or hr > 105 or temp > 38:
        return "stable"
    else:
        return "improving"

def generate_priority(o2, hr, temp, bp_sys):
    score = 0

    # Oxygen contribution
    if o2 < 85:
        score += 3
    elif o2 < 90:
        score += 2
    elif o2 < 94:
        score += 1

    # Heart rate contribution
    if hr > 130:
        score += 2
    elif hr > 110:
        score += 1

    # Temperature contribution
    if temp > 39.5:
        score += 2
    elif temp > 38:
        score += 1

    # Blood pressure contribution
    if bp_sys < 85:
        score += 2
    elif bp_sys < 95:
        score += 1

    # Clinical uncertainty (randomness)
    score += np.random.choice([0, 0, 1])

    if score >= 5:
        return 2
    elif score >= 3:
        return 1
    else:
        return 0

data = []

for i in range(N):
    temp = clip(np.random.normal(37.2, 1.5), 34, 41)
    hr = clip(int(np.random.normal(90, 25)), 40, 180)
    bp_sys = clip(int(np.random.normal(115, 25)), 60, 200)
    bp_dia = clip(int(np.random.normal(75, 15)), 40, 130)
    o2 = clip(int(np.random.normal(95, 6)), 70, 100)

    condition = generate_condition(o2, hr, temp)
    priority = generate_priority(o2, hr, temp, bp_sys)

    # Doctor notes correlated with severity
    if priority == 2:
        notes = random.choice([
            "severe breathing difficulty",
            "unconscious on arrival",
            "oxygen support needed",
            "critical chest pain",
            "possible cardiac arrest"
        ])
    elif priority == 1:
        notes = random.choice([
            "requires monitoring",
            "moderate fever",
            "irregular heartbeat",
            "mild respiratory distress"
        ])
    else:
        notes = random.choice([
            "patient stable",
            "recovering well",
            "routine observation",
            "no major complaints"
        ])

    data.append([
        f"P{i+1}",
        round(temp,1), hr, bp_sys, bp_dia, o2,
        condition,
        notes,
        priority
    ])

columns = [
    "Patient_ID", "Temperature", "Heart_Rate",
    "BP_Systolic", "BP_Diastolic",
    "Oxygen_Level", "Overall_Condition",
    "Doctor_Notes", "Priority"
]

df = pd.DataFrame(data, columns=columns)

# ðŸ”¥ Add 4% label noise (real-world mis-triage)
noise_idx = np.random.choice(df.index, size=int(0.04*N), replace=False)
df.loc[noise_idx, "Priority"] = np.random.choice([0,1,2], size=len(noise_idx))

df.head()

Unnamed: 0,Patient_ID,Temperature,Heart_Rate,BP_Systolic,BP_Diastolic,Oxygen_Level,Overall_Condition,Doctor_Notes,Priority
0,P1,37.9,86,131,97,93,stable,no major complaints,0
1,P2,36.8,40,102,80,89,stable,patient stable,0
2,P3,36.3,76,100,61,79,critical,requires monitoring,1
3,P4,38.6,40,130,71,94,stable,patient stable,0
4,P5,37.3,54,101,76,88,stable,irregular heartbeat,1


In [5]:
# =====================================
# Optimized ML Pipeline - XGBoost
# =====================================

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
import numpy as np

# Drop ID
df_model = df.drop(columns=["Patient_ID"])

X = df_model.drop("Priority", axis=1)
y = df_model["Priority"]

# Feature groups
numeric_features = [
    "Temperature", "Heart_Rate",
    "BP_Systolic", "BP_Diastolic",
    "Oxygen_Level"
]

categorical_features = ["Overall_Condition"]
text_feature = "Doctor_Notes"

# Preprocessing
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

text_transformer = TfidfVectorizer(max_features=100)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("txt", text_transformer, text_feature)
    ]
)

# Model
model = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42
)

# Full Pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", model)
])

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Hyperparameter Tuning
param_grid = {
    "classifier__n_estimators": [200, 300, 400],
    "classifier__max_depth": [4, 6, 8],
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    "classifier__subsample": [0.8, 1.0],
    "classifier__colsample_bytree": [0.8, 1.0]
}

search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=15,
    cv=3,
    scoring="f1_weighted",
    verbose=2,
    n_jobs=-1
)

search.fit(X_train, y_train)

best_model = search.best_estimator_

# Evaluation
y_pred = best_model.predict(X_test)

print("Best Parameters:", search.best_params_)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Fitting 3 folds for each of 15 candidates, totalling 45 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'classifier__subsample': 0.8, 'classifier__n_estimators': 300, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.05, 'classifier__colsample_bytree': 0.8}

Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       662
           1       0.99      0.98      0.98       270
           2       0.96      0.79      0.87        68

    accuracy                           0.97      1000
   macro avg       0.97      0.92      0.95      1000
weighted avg       0.97      0.97      0.97      1000


Confusion Matrix:

[[657   3   2]
 [  6 264   0]
 [ 14   0  54]]
