In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score,
                             roc_curve, precision_recall_fscore_support, accuracy_score)


In [17]:
# Read the preprocessed table and trim surrounding whitespace from the column names.
df = pd.read_csv("Preprocessed ICU data.csv")
df.columns = df.columns.str.strip()

# List every column whose name contains "ICUType".
icu_flag_columns = [name for name in df.columns if "ICUType" in name]
print("Available ICU columns:", icu_flag_columns)

# ICUType3 cohort: rows whose indicator equals 1, written to its own CSV.
in_icu3 = df["ICUType3"].eq(1)
icu3_df = df.loc[in_icu3].copy()
icu3_df.to_csv("ICU3_data.csv", index=False)
print(f"ICU3 data extracted. Rows: {icu3_df.shape[0]}, Columns: {icu3_df.shape[1]}")

# ICUType4 cohort: same procedure.
in_icu4 = df["ICUType4"].eq(1)
icu4_df = df.loc[in_icu4].copy()
icu4_df.to_csv("ICU4_data.csv", index=False)
print(f"ICU4 data extracted. Rows: {icu4_df.shape[0]}, Columns: {icu4_df.shape[1]}")


Available ICU columns: ['ICUType2', 'ICUType3', 'ICUType4']
ICU3 data extracted. Rows: 690, Columns: 232
ICU4 data extracted. Rows: 342, Columns: 232


In [13]:
import pandas as pd
import numpy as np

# Step 1 — Read the preprocessed table and trim whitespace around the headers
df = pd.read_csv("Preprocessed ICU data.csv")
df.columns = df.columns.str.strip()

# Step 2 — One independent copy per cohort (rows whose indicator equals 1)
icu3 = df.loc[df["ICUType3"] == 1].copy()
icu4 = df.loc[df["ICUType4"] == 1].copy()

# Step 3 — Variables to compare; only those actually present in the table are kept
cols_of_interest = [
    "Age",
    "Gender",
    "Mean_GCS.x",
    "Mean_HR.x",
    "Mean_SysABP.x",
    "Mean_Lactate.y",
    "In.hospital_death",
]
available_columns = set(df.columns)
cols_existing = [name for name in cols_of_interest if name in available_columns]

# Step 4 — Calculate summary statistics
def summary_stats(data, name, cols=None, target="In.hospital_death"):
    """
    Summarise one ICU cohort as a flat dict of headline numbers.

    Parameters
    ----------
    data : pd.DataFrame
        Cohort rows; must contain `target` and every averaged column.
    name : str
        Label stored under the "ICU Type" key.
    cols : iterable of str, optional
        Columns to consider for per-column means. When omitted, the
        notebook-level `cols_existing` list is used (original behaviour).
    target : str, default "In.hospital_death"
        Outcome column used for the death count and mortality rate.

    Returns
    -------
    dict
        Keys, in order: "ICU Type", "Patients", "Deaths",
        "Mortality Rate (%)", then one "Mean <col>" entry per averaged
        column (rounded to 2 decimals).
    """
    if cols is None:
        # Fall back to the notebook global so existing two-argument calls keep working.
        cols = cols_existing

    outcome = data[target]
    stats = {
        "ICU Type": name,
        "Patients": len(data),
        "Deaths": outcome.sum(),
        "Mortality Rate (%)": round(100 * outcome.mean(), 2),
    }

    # The outcome is already reported above; Gender is skipped as in the original
    # (presumably because the mean of a coded category is not meaningful).
    skipped = {"Gender", target}
    for col in cols:
        if col not in skipped:
            stats[f"Mean {col}"] = round(data[col].mean(), 2)
    return stats

# Summarise each cohort with the same helper
icu3_stats, icu4_stats = (
    summary_stats(cohort, label)
    for cohort, label in ((icu3, "ICU3"), (icu4, "ICU4"))
)

# Step 5 — Lay the two summaries side by side: one column per ICU, one row per statistic
comparison_df = pd.DataFrame([icu3_stats, icu4_stats]).transpose()
comparison_df.columns = ["ICU3", "ICU4"]
comparison_df


Unnamed: 0,ICU3,ICU4
ICU Type,ICU3,ICU4
Patients,690,342
Deaths,275,155
Mortality Rate (%),39.86,45.32
Mean Age,68.74,71.12
Mean Mean_GCS.x,11.31,10.01
Mean Mean_HR.x,90.33,84.27
Mean Mean_SysABP.x,115.37,125.33
Mean Mean_Lactate.y,2.27,2.06


In [3]:
# Path of the preprocessed ICU table; used as the default argument of load_data().
CSV_PATH = "Preprocessed ICU data.csv"
# Outcome column: cast to int and used as the classification label downstream.
TARGET = "In.hospital_death"
# Non-"Mean_" variables added to the feature set; each one is kept only if the
# column is actually present in the loaded table.
GENERAL_VARS = ["Age", "Gender", "Height", "SAPS.I", "SOFA"]  # keep if present
# Single seed shared by the train/test split, the CV fold shuffling and the random forest.
RANDOM_STATE = 42

def load_data(path=CSV_PATH):
    """
    Read the ICU table at `path` and return it with whitespace-trimmed column names.

    Raises
    ------
    ValueError
        If the TARGET column is absent after trimming; the message shows the
        first 12 column names to help spot a naming mismatch.
    """
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip()

    # Happy path first: the label column is there, hand the frame back.
    if TARGET in df.columns:
        return df

    raise ValueError(f"Target '{TARGET}' not found. Available: {list(df.columns)[:12]}")

def prepare_subset_mean(df: pd.DataFrame, icu_type: int,
                        missing_thresh: float = 0.5):
    """
    Build a train/test split for one ICU type from Mean_* features + general vars.

    Steps
    -----
    1. Keep rows where ``ICUType{icu_type} == 1``.
    2. Keep the GENERAL_VARS that exist, every ``Mean_*`` column, and TARGET.
    3. Drop rows whose TARGET is missing (they cannot be cast to int).
    4. Keep numeric feature columns only, then drop columns whose NaN fraction
       is > ``missing_thresh`` and columns that are entirely zero/NaN.
    5. Stratified 70/30 split, then median-impute both parts using medians
       computed on the training part only.

    Parameters
    ----------
    df : pd.DataFrame
        Full table containing the ICU indicator columns and TARGET.
    icu_type : int
        Suffix of the indicator column to filter on.
    missing_thresh : float, default 0.5
        Maximum allowed fraction of missing values per feature column.

    Returns
    -------
    X_train, X_test, y_train, y_test, feature_names

    Raises
    ------
    ValueError
        If the ``ICUType{icu_type}`` column is not present.
    """
    icu_col = f"ICUType{icu_type}"
    if icu_col not in df.columns:
        raise ValueError(f"{icu_col} not found in columns.")

    sub = df[df[icu_col] == 1]
    mean_cols = [c for c in sub.columns if c.startswith("Mean_")]
    general_cols = [c for c in GENERAL_VARS if c in sub.columns]
    use = sub[general_cols + mean_cols + [TARGET]].copy()

    # rows without a label cannot be cast to int below, so drop them up front
    use = use.dropna(subset=[TARGET])

    # separate X, y; the numeric-only filter runs BEFORE the column checks so
    # that abs() in the all-zero test never sees a text column
    y = use[TARGET].astype(int)
    X = use.drop(columns=[TARGET]).select_dtypes(include=[np.number])

    # drop columns with too many NaNs
    X = X.loc[:, X.isna().mean() <= missing_thresh]

    # drop all-zero columns (NaN is treated as 0 for this test)
    all_zero = [c for c in X.columns if X[c].fillna(0).abs().sum() == 0]
    if all_zero:
        X = X.drop(columns=all_zero)

    # split before imputation to avoid leakage
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.30, random_state=RANDOM_STATE, stratify=y
    )

    # median imputation (fit on train only)
    med = X_tr.median()
    X_tr = X_tr.fillna(med)
    X_te = X_te.fillna(med)

    return X_tr, X_te, y_tr, y_te, X.columns.tolist()


In [5]:
def train_eval_lr(X_tr, X_te, y_tr, y_te):
    """
    Tune a standardised, class-balanced logistic regression and score it on the test split.

    The regularisation strength ``C`` is selected by 5-fold stratified
    cross-validation on the training data, optimising ROC AUC.

    Parameters
    ----------
    X_tr, X_te : pd.DataFrame
        Train / test feature matrices (``X_tr.columns`` labels the coefficients).
    y_tr, y_te : array-like
        Train / test labels.

    Returns
    -------
    best_lr : Pipeline
        The refitted best pipeline (steps "scaler" and "clf").
    report : dict
        Keys: "best_params", "metrics" (accuracy, precision, recall, f1, auc),
        "confusion_matrix", "classification_report", "proba" (positive-class
        probabilities on the test split) and "coef_std" (coefficients sorted by
        absolute value, largest first).
    """
    pipe_lr = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=3000, class_weight="balanced", solver="liblinear"))
    ])
    param_lr = {"clf__C": [0.05, 0.1, 0.5, 1.0, 3.0, 10.0]}
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    gs_lr = GridSearchCV(pipe_lr, param_grid=param_lr, scoring="roc_auc", cv=cv, n_jobs=-1)
    gs_lr.fit(X_tr, y_tr)

    best_lr = gs_lr.best_estimator_
    y_pred = best_lr.predict(X_te)
    y_prob = best_lr.predict_proba(X_te)[:, 1]

    acc = accuracy_score(y_te, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_te, y_pred, average="binary", zero_division=0)
    auc = roc_auc_score(y_te, y_prob)

    # coefficients in standardized space (the classifier is fitted on scaled inputs)
    clf = best_lr.named_steps["clf"]
    coefs = pd.Series(clf.coef_[0], index=X_tr.columns).sort_values(key=np.abs, ascending=False)

    report = {
        "best_params": gs_lr.best_params_,
        "metrics": {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc},
        "confusion_matrix": confusion_matrix(y_te, y_pred),
        "classification_report": classification_report(y_te, y_pred, zero_division=0),
        "proba": y_prob,
        "coef_std": coefs
    }
    return best_lr, report

def train_eval_rf(X_tr, X_te, y_tr, y_te):
    """
    Grid-search a class-balanced random forest and score the winner on the test split.

    Hyper-parameters (number of trees, depth, leaf size) are chosen by 5-fold
    stratified cross-validation on the training data, optimising ROC AUC.

    Returns
    -------
    best_rf : Pipeline
        The refitted best pipeline (steps "scaler" and "clf").
    report : dict
        Keys: "best_params", "metrics" (accuracy, precision, recall, f1, auc),
        "confusion_matrix", "classification_report", "proba" (positive-class
        probabilities on the test split) and "feature_importances" (sorted,
        largest first).
    """
    forest = RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced")
    # The scaler is kept only so this pipeline has the same step layout as the LR one.
    pipeline = Pipeline([("scaler", StandardScaler()), ("clf", forest)])

    grid = {
        "clf__n_estimators": [300, 600],
        "clf__max_depth": [None, 8, 16],
        "clf__min_samples_leaf": [1, 2, 4],
    }
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    search = GridSearchCV(pipeline, param_grid=grid, scoring="roc_auc", cv=folds, n_jobs=-1)
    search.fit(X_tr, y_tr)
    best_rf = search.best_estimator_

    # Hard labels for the threshold metrics, positive-class scores for the AUC.
    predicted = best_rf.predict(X_te)
    positive_scores = best_rf.predict_proba(X_te)[:, 1]

    accuracy = accuracy_score(y_te, predicted)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_te, predicted, average="binary", zero_division=0
    )
    area = roc_auc_score(y_te, positive_scores)

    # Importances come from the fitted forest inside the pipeline, labelled by training column.
    ranked = pd.Series(
        best_rf.named_steps["clf"].feature_importances_, index=X_tr.columns
    ).sort_values(ascending=False)

    report = {
        "best_params": search.best_params_,
        "metrics": {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f_score,
            "auc": area,
        },
        "confusion_matrix": confusion_matrix(y_te, predicted),
        "classification_report": classification_report(y_te, predicted, zero_division=0),
        "proba": positive_scores,
        "feature_importances": ranked,
    }
    return best_rf, report


In [7]:
def _show_report(header, icu, rep, ranking_label, ranking_key):
    """Print one model's tuned parameters, test metrics and its top-15 ranking."""
    print(header, icu, ") ---")
    print("Best params:", rep["best_params"])
    print("Metrics:", rep["metrics"])
    print("Confusion Matrix:\n", rep["confusion_matrix"])
    print("Classification Report:\n", rep["classification_report"])
    print(ranking_label, rep[ranking_key].head(15))


df = load_data()

# results[icu_type][model_name] -> report dict returned by the train_eval_* helpers
results = {}

for icu in (3, 4):
    print(f"\n================ ICU Type {icu} (Mean-only features) ================\n")
    X_tr, X_te, y_tr, y_te, feat_names = prepare_subset_mean(df, icu_type=icu, missing_thresh=0.5)
    print(f"Train shape: {X_tr.shape}, Test shape: {X_te.shape}, Positives in train: {y_tr.sum()} ({y_tr.mean():.1%})")

    # Logistic Regression
    best_lr, rep_lr = train_eval_lr(X_tr, X_te, y_tr, y_te)
    _show_report("\n--- Logistic Regression (ICU", icu, rep_lr,
                 "Top coefficients (standardized):\n", "coef_std")

    # Random Forest
    best_rf, rep_rf = train_eval_rf(X_tr, X_te, y_tr, y_te)
    _show_report("\n--- Random Forest (ICU", icu, rep_rf,
                 "Top feature importances:\n", "feature_importances")

    results[icu] = {"LR": rep_lr, "RF": rep_rf}




Train shape: (483, 79), Test shape: (207, 79), Positives in train: 193 (40.0%)

--- Logistic Regression (ICU 3 ) ---
Best params: {'clf__C': 0.05}
Metrics: {'accuracy': 0.6859903381642513, 'precision': 0.5955056179775281, 'recall': 0.6463414634146342, 'f1': 0.6198830409356725, 'auc': 0.7265365853658536}
Confusion Matrix:
 [[89 36]
 [29 53]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.71      0.73       125
           1       0.60      0.65      0.62        82

    accuracy                           0.69       207
   macro avg       0.67      0.68      0.68       207
weighted avg       0.69      0.69      0.69       207

Top coefficients (standardized):
 Mean_GCS.y         -0.464953
Mean_Mg.y           0.356953
Mean_WBC.y          0.313779
Mean_BUN.y          0.259301
Mean_Lactate.y      0.231454
Mean_Mg.x          -0.227409
Mean_PaO2.y        -0.178331
Mean_TroponinI.y    0.178250
Mean_SaO2.x         0.177369
Mean_BUN.

In [9]:
# Build a compact comparison of key metrics
# (display column -> key inside each report's "metrics" dict, in display order)
metric_columns = {
    "AUC": "auc",
    "Recall": "recall",
    "Precision": "precision",
    "F1": "f1",
    "Accuracy": "accuracy",
}

summary_rows = []
for icu in [3, 4]:
    for model in ["LR", "RF"]:
        m = results[icu][model]["metrics"]
        row = {"ICUType": icu, "Model": model}
        # every metric is shown with three decimals
        row.update({column: round(m[key], 3) for column, key in metric_columns.items()})
        summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows).sort_values(by=["ICUType", "Model"])
summary_df


Unnamed: 0,ICUType,Model,AUC,Recall,Precision,F1,Accuracy
0,3,LR,0.727,0.646,0.596,0.62,0.686
1,3,RF,0.729,0.488,0.667,0.563,0.7
2,4,LR,0.684,0.596,0.583,0.589,0.621
3,4,RF,0.796,0.617,0.674,0.644,0.689


In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Step 1 — Read the preprocessed table and trim whitespace around every header
df = pd.read_csv("Preprocessed ICU data.csv").rename(columns=str.strip)

# Step 2 — Function to process one ICU type
def top10_features_by_importance(df, icu_type, top_n=10, missing_thresh=0.5):
    """
    Rank the Mean_* features of one ICU cohort by random-forest importance.

    The forest is fitted on ALL rows of the cohort (no hold-out), so the
    ranking is descriptive rather than an out-of-sample estimate.

    Parameters
    ----------
    df : pd.DataFrame
        Full table with the ``ICUType{icu_type}`` indicator, the ``Mean_*``
        columns and the "In.hospital_death" outcome.
    icu_type : int
        Suffix of the indicator column to filter on.
    top_n : int, default 10
        Number of features to return.
    missing_thresh : float, default 0.5
        Feature columns whose NaN fraction is greater than this are dropped.

    Returns
    -------
    pd.DataFrame
        Columns "Feature", "Importance" (rounded to 4 decimals) and
        "Mean Value" (cohort mean after median imputation, rounded to
        2 decimals), ordered by decreasing importance.
    """
    target = "In.hospital_death"
    icu_col = f"ICUType{icu_type}"
    df_sub = df[df[icu_col] == 1]

    # Keep only Mean_ columns and target
    mean_cols = [c for c in df_sub.columns if c.startswith("Mean_")]
    df_sub = df_sub[mean_cols + [target]]

    # The label is never imputed: rows without an outcome are dropped instead.
    df_sub = df_sub.dropna(subset=[target])

    # Separate X and y
    X = df_sub.drop(columns=[target])
    y = df_sub[target].astype(int)

    # Drop feature columns with too many NaNs. A fraction test is used because
    # an integer `thresh=int(0.5 * n)` rounds down for odd n and would keep
    # columns that are slightly more than half empty.
    X = X.loc[:, X.isna().mean() <= missing_thresh]
    X = X.fillna(X.median())  # fill missing feature values with the column median

    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=500, random_state=42, class_weight="balanced")
    rf.fit(X, y)

    # Feature importance
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    top = importances.sort_values(ascending=False).head(top_n)

    # Mean (post-imputation) values of the selected features
    mean_values = X[top.index].mean().round(2)
    result = pd.DataFrame({
        "Feature": top.index,
        "Importance": top.values.round(4),
        "Mean Value": mean_values.values
    }).reset_index(drop=True)
    return result

# Step 3 — Rank features separately for the ICU3 and ICU4 cohorts
icu3_top10, icu4_top10 = (top10_features_by_importance(df, unit) for unit in (3, 4))

# Step 4 — Show each table under its banner
for banner, table in (
    ("========== ICU3 Top 10 Mortality-Related Features ==========", icu3_top10),
    ("\n========== ICU4 Top 10 Mortality-Related Features ==========", icu4_top10),
):
    print(banner)
    display(table)




Unnamed: 0,Feature,Importance,Mean Value
0,Mean_GCS.y,0.0391,11.35
1,Mean_HR.y,0.0287,88.47
2,Mean_Urine.y,0.025,110.87
3,Mean_NISysABP.y,0.0246,116.77
4,Mean_HR.x,0.0233,90.33
5,Mean_WBC.y,0.0227,12.82
6,Mean_Lactate.y,0.0202,2.27
7,Mean_Mg.y,0.0201,2.04
8,Mean_NIMAP.y,0.0193,75.18
9,Mean_Urine.x,0.0183,120.1





Unnamed: 0,Feature,Importance,Mean Value
0,Mean_GCS.y,0.0731,10.31
1,Mean_GCS.x,0.0634,10.01
2,Mean_Glucose.y,0.0322,127.16
3,Mean_Urine.y,0.0275,111.73
4,Mean_PaO2.x,0.024,154.2
5,Mean_PaO2.y,0.0229,128.03
6,Mean_NIMAP.y,0.0212,77.99
7,Mean_PaCO2.x,0.0195,39.25
8,Mean_MAP.x,0.0188,84.04
9,Mean_Mg.y,0.0187,2.01
