In [None]:
import os
import dill
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import (matthews_corrcoef,auc,roc_auc_score, average_precision_score, accuracy_score,
                             precision_score, recall_score, f1_score, brier_score_loss,
                             roc_curve, precision_recall_curve)
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

import shap
import matplotlib.pyplot as plt

In [None]:
# ICD lookup tables (code -> long title) built from the MIMIC-III and MIMIC-IV dictionary files.
#
# Every file is read with dtype=str. With pandas' default type inference a digit-only ICD-9 code
# such as '0014' is parsed as the integer 14, so the leading zeros are lost and the code can no
# longer be matched against feature names like 'Pro_0014'.
# NOTE(review): the absolute paths below are machine-specific; consider a configurable DATA_DIR.

# --- Diagnoses: MIMIC-III (ICD-9 only) ---
D_ICD_DIAGNOSES = pd.read_csv('/mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz', dtype=str)
D_ICD_DIAGNOSES.columns = D_ICD_DIAGNOSES.columns.str.upper()
D_ICD_DIAGNOSES['ICD_VERSION'] = 9
D_ICD_DIAGNOSES = D_ICD_DIAGNOSES[['ICD9_CODE','ICD_VERSION','LONG_TITLE']]
D_ICD_DIAGNOSES.columns = ['ICD_CODE', 'ICD_VERSION', 'ICD_TEXT']
print(D_ICD_DIAGNOSES.shape)

# --- Diagnoses: MIMIC-IV ---
# The three columns are renamed positionally, so the file is assumed to hold exactly
# code, version, title in that order.
D_4_DIAGNOSES = pd.read_csv('C:/MIMIC IV/3.1/hosp/D_ICD_DIAGNOSES.csv.gz', dtype=str)
D_4_DIAGNOSES.columns = ['ICD_CODE', 'ICD_VERSION', 'ICD_TEXT']
# Back to integers so the version column has the same dtype as the MIMIC-III frame (where it is the int 9).
D_4_DIAGNOSES['ICD_VERSION'] = D_4_DIAGNOSES['ICD_VERSION'].astype('int64')
print(D_4_DIAGNOSES.shape)

D_ICD_DIAGNOSES = pd.concat([D_ICD_DIAGNOSES,D_4_DIAGNOSES])
D_ICD_DIAGNOSES = D_ICD_DIAGNOSES.drop_duplicates(keep='first')
print(D_ICD_DIAGNOSES.shape)

# --- Procedures: MIMIC-III (ICD-9 only) ---
d_iii_icd_procedures = pd.read_csv('/mimic-iii-clinical-database-1.4/d_icd_procedures.csv.gz', dtype=str)
# Upper-case the header first (as done for the diagnosis table) so the rename below cannot silently miss.
d_iii_icd_procedures.columns = d_iii_icd_procedures.columns.str.upper()
d_iii_icd_procedures = d_iii_icd_procedures.rename(columns={'ICD9_CODE':'ICD_CODE', 'LONG_TITLE':'ICD_TEXT'})
d_iii_icd_procedures['ICD_VERSION'] = 9
# .assign on the selection returns a new frame instead of writing into a slice of the original.
d_iii_icd_procedures = d_iii_icd_procedures[['ICD_CODE', 'ICD_VERSION', 'ICD_TEXT']].assign(
    ICD_CODE=lambda frame: frame['ICD_CODE'].astype(str)
)

# --- Procedures: MIMIC-IV ---
d_iv_icd_procedures = pd.read_csv('C:/MIMIC IV/3.1/hosp/d_icd_procedures.csv.gz', dtype=str)
d_iv_icd_procedures.columns = ['ICD_CODE', 'ICD_VERSION', 'ICD_TEXT']
d_iv_icd_procedures['ICD_VERSION'] = d_iv_icd_procedures['ICD_VERSION'].astype('int64')
d_iv_icd_procedures['ICD_CODE'] = d_iv_icd_procedures['ICD_CODE'].astype(str)

P_ICD_procedures = pd.concat([d_iii_icd_procedures,d_iv_icd_procedures])
P_ICD_procedures = P_ICD_procedures.drop_duplicates(keep='first')

# Inspection table: every procedure title that appears under more than one code, with the
# longest code (`final_ICD_CODE`) and the full list of codes sharing that title.
dd = P_ICD_procedures[P_ICD_procedures.duplicated(subset=['ICD_TEXT'],keep=False)].sort_values(by='ICD_TEXT')
dd = (
    dd.groupby('ICD_TEXT')
    .agg(
        final_ICD_CODE=('ICD_CODE', lambda x: max(x, key=len)),
        all_ICD_CODEs=('ICD_CODE', lambda x: list(x))
    )
    .reset_index()
)

# Keep one row per title: after sorting by code length (descending) the first row of each
# title is the one with the longest code.
P_ICD_procedures = (
    P_ICD_procedures.assign(code_len=P_ICD_procedures['ICD_CODE'].str.len())
    .sort_values('code_len', ascending=False)
    .drop_duplicates(subset=['ICD_TEXT'], keep='first')
    .drop(columns='code_len')
)

print(P_ICD_procedures.ICD_VERSION.value_counts())

In [None]:
def sclar_coder(rundf,num_v,cate_v):
    """Min-max scale the numeric columns and integer-encode the categorical columns of ``rundf``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    rundf : pandas.DataFrame
        Frame holding every column named in ``num_v`` and ``cate_v``.
    num_v : list of str
        Columns rescaled together with a single ``MinMaxScaler`` fit.
    cate_v : list of str
        Columns replaced by ``LabelEncoder`` codes; the encoder is re-fitted for each column.
    """
    rundf[num_v] = MinMaxScaler().fit_transform(rundf[num_v])

    # One encoder object is reused; fit_transform re-learns the classes for every column.
    encoder = LabelEncoder()
    for column_name in cate_v:
        encoded = encoder.fit_transform(rundf[column_name])
        rundf[column_name] = encoded

    return rundf

def print_top_features(all_importances, D_ICD_DIAGNOSES, P_ICD_procedures, sort_by='xgboost', top_n=20):
    """Print the ``top_n`` features ranked by one importance column, with ICD titles where known.

    Parameters
    ----------
    all_importances : pandas.DataFrame
        Must contain a ``Feature`` column and the importance column selected by ``sort_by``.
    D_ICD_DIAGNOSES, P_ICD_procedures : pandas.DataFrame
        Lookup tables with ``ICD_CODE`` and ``ICD_TEXT`` columns. Codes are compared as
        stripped strings, so an integer-typed code column still matches.
    sort_by : str
        Column to rank by, descending. When no column has exactly this name, a column whose
        name differs only in letter case is used (so the default ``'xgboost'`` finds ``'XGboost'``).
    top_n : int
        Number of features to print and return.

    Returns
    -------
    numpy.ndarray
        The selected feature names in rank order.

    Raises
    ------
    KeyError
        If ``sort_by`` matches no column, or matches several columns case-insensitively.
    """

    def _first_title_by_code(table):
        # code -> title, keeping the first title seen for a repeated code
        titles = {}
        for code, text in zip(table['ICD_CODE'].astype(str).str.strip(), table['ICD_TEXT']):
            titles.setdefault(code, text)
        return titles

    # Print header
    print(f"---------- Top {top_n} Feature Importances (Sorted by {sort_by}) ----------")

    # Resolve the ranking column, tolerating a letter-case mismatch.
    sort_col = sort_by
    if sort_by not in all_importances.columns:
        wanted = str(sort_by).lower()
        candidates = [col for col in all_importances.columns if str(col).lower() == wanted]
        if len(candidates) != 1:
            raise KeyError(sort_by)
        sort_col = candidates[0]

    top_features = all_importances.sort_values(by=sort_col, ascending=False).Feature.head(top_n).values

    diag_titles = _first_title_by_code(D_ICD_DIAGNOSES)
    proc_titles = _first_title_by_code(P_ICD_procedures)

    for feature in top_features:
        if '_' not in feature:
            print(feature)
            continue

        parts = feature.split('_')
        prefix, icd_code = parts[0], parts[-1]
        if not icd_code:
            # Name ends with '_': there is nothing to look up.
            print(f"{feature}: No valid ICD code (malformed feature name)")
            continue

        # A 'Diag_*' feature is only looked up among diagnoses and a 'Pro_*' feature only among
        # procedures: the same digits can be a valid code in both tables with unrelated meanings.
        # Any other prefix is checked against both tables, diagnoses first.
        tables = []
        if prefix != 'Pro':
            tables.append(diag_titles)
        if prefix != 'Diag':
            tables.append(proc_titles)

        matches = [titles[icd_code] for titles in tables if icd_code in titles]
        for text in matches:
            print(f"{feature}: {text}")
        if not matches:
            print(f"{feature}: No matching ICD code found")
    return top_features

## downsampling

In [None]:
label = ['LOS_Hospital', 'DIEINHOSPITAL', 'Readmission_30', 'Multiple_ICUs', 'sepsis_all', 'FirstICU24_AKI_ALL', 'ICU_within_12hr_of_admit']

In [None]:
def get_minority_mask(df, labels):
    """Flag the rows that belong to the least frequent class of at least one label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``labels``.
    labels : iterable of str
        Label columns to inspect. A column with fewer than two distinct values
        is reported with a printed warning and ignored.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df.index``.
    """
    selected = pd.Series(False, index=df.index)

    for name in labels:
        counts = df[name].value_counts()
        if len(counts) >= 2:
            # idxmin() is the class with the smallest count in this column
            rarest = counts.idxmin()
            selected = selected | (df[name] == rarest)
        else:
            print(f"Warning: Column {name} has fewer than 2 classes, skipping")

    return selected

In [None]:
# Keep only the rows that fall in the minority class of at least one outcome listed in `label`.
# NOTE(review): `III` and `IV` are not assigned anywhere in the visible notebook, so this cell
# depends on kernel state; presumably they are the MIMIC-III / MIMIC-IV cohort frames — confirm
# and add the cell that loads them.
mask = get_minority_mask(III, label)
label_III = III[mask]
print(label_III.shape)

mask = get_minority_mask(IV, label)
label_IV = IV[mask]
print(label_IV.shape)

In [None]:
for i in label:
    print(i,Counter(III[i]))

In [None]:
for i in label:
    print(i,Counter(label_IV[i]), label_IV[i].mean())

In [None]:
for i in label:
    print(i,Counter(label_III[i]), label_III[i].mean())

In [None]:
# Full feature list: the concatenation of the five feature-name lists.
# NOTE(review): `basic`, `Diag`, `Proc`, `Med` and `TS` are only assigned further down, in the cell
# that reads New_IV_III_downsampled.csv, so running the notebook top to bottom on a fresh kernel
# fails here with NameError.
ft = basic + Diag + Proc + Med + TS
print(len(ft))

In [None]:
# Build the MIMIC-III downsample from `label_III` (all draws use random_state=42).
# NOTE(review): DataFrame.sample(n=...) raises ValueError when fewer than n rows are available.
this_down = label_III.copy()
# 1) 2000 rows with DIEINHOSPITAL == 1.
minority_downsampled = this_down[this_down.DIEINHOSPITAL==1].sample(n=2000, random_state=42)
# 2) 1000 rows with Readmission_30 == 1, taken only from ICU stays not selected in step 1.
minority_read = this_down[~(this_down.ICUSTAY_ID.isin(minority_downsampled.ICUSTAY_ID))]
minority_read = minority_read[minority_read.Readmission_30==1].sample(n=1000, random_state=42)
minority_downsampled = pd.concat([minority_downsampled,minority_read])
# 3) 1000 rows with Multiple_ICUs == 1, taken only from ICU stays not selected so far.
minority_icus = this_down[~(this_down.ICUSTAY_ID.isin(minority_downsampled.ICUSTAY_ID))]
minority_icus = minority_icus[minority_icus.Multiple_ICUs == 1].sample(n=1000, random_state=42)
minority_downsampled = pd.concat([minority_downsampled,minority_icus])

# 4) As many DIEINHOSPITAL == 0 rows as were selected in steps 1-3, again from unselected stays.
#    The filter only looks at DIEINHOSPITAL, so these rows may still be positive for other labels.
majority_downsampled = this_down[~(this_down.ICUSTAY_ID.isin(minority_downsampled.ICUSTAY_ID))]
majority_downsampled = majority_downsampled[majority_downsampled.DIEINHOSPITAL==0].sample(n=len(minority_downsampled), random_state=42)
III_resampled = pd.concat([minority_downsampled,majority_downsampled])
III_resampled.shape

In [None]:
III_resampled.head(2)

In [None]:
for i in label:
    print(i,Counter(label_III[i]), label_III[i].mean())

In [None]:
for i in label:
    print(i,Counter(III_resampled[i]), III_resampled[i].mean())

In [None]:
# Build the MIMIC-IV downsample from `label_IV` with the same recipe used for MIMIC-III
# (all draws use random_state=42).
# NOTE(review): DataFrame.sample(n=...) raises ValueError when fewer than n rows are available.
this_down = label_IV.copy()
# 1) 2000 rows with DIEINHOSPITAL == 1.
minority_downsampled = this_down[this_down.DIEINHOSPITAL==1].sample(n=2000, random_state=42)
# 2) 1000 rows with Readmission_30 == 1, taken only from ICU stays not selected in step 1.
minority_read = this_down[~(this_down.ICUSTAY_ID.isin(minority_downsampled.ICUSTAY_ID))]
minority_read = minority_read[minority_read.Readmission_30==1].sample(n=1000, random_state=42)
minority_downsampled = pd.concat([minority_downsampled,minority_read])
# 3) 1000 rows with Multiple_ICUs == 1, taken only from ICU stays not selected so far.
minority_icus = this_down[~(this_down.ICUSTAY_ID.isin(minority_downsampled.ICUSTAY_ID))]
minority_icus = minority_icus[minority_icus.Multiple_ICUs == 1].sample(n=1000, random_state=42)
minority_downsampled = pd.concat([minority_downsampled,minority_icus])

# 4) As many DIEINHOSPITAL == 0 rows as were selected in steps 1-3, again from unselected stays.
#    The filter only looks at DIEINHOSPITAL, so these rows may still be positive for other labels.
majority_downsampled = this_down[~(this_down.ICUSTAY_ID.isin(minority_downsampled.ICUSTAY_ID))]
majority_downsampled = majority_downsampled[majority_downsampled.DIEINHOSPITAL==0].sample(n=len(minority_downsampled), random_state=42)
IV_resampled = pd.concat([minority_downsampled,majority_downsampled])
IV_resampled.shape

In [None]:
IV_resampled.head(2)

In [None]:
for i in label:
    print(i,Counter(label_IV[i]), label_IV[i].mean())

In [None]:
for i in label:
    print(i,Counter(IV_resampled[i]), IV_resampled[i].mean())

In [None]:
New_IV_III_downsampled = pd.concat([III_resampled,IV_resampled])
New_IV_III_downsampled.shape

In [None]:
New_IV_III_downsampled[New_IV_III_downsampled['ICUSTAY_ID'].duplicated(keep=False)]

In [None]:
New_IV_III_downsampled['ICUSTAY_ID'].duplicated().any()

In [None]:
for i in label:
    print(i,Counter(New_IV_III_downsampled[New_IV_III_downsampled.MIMIC == 'IV'][i]))

In [None]:
New_IV_III_downsampled.head(2)

In [None]:
New_IV_III_downsampled.to_csv('/New_IV_III_downsampled.csv',index=False)

In [None]:
New_all_Test = pd.DataFrame()

In [None]:
# Split the MIMIC-III rows: 70% Train, then the remaining 30% is divided 75/25 into
# Valid (22.5% of the rows) and Test (7.5% of the rows). The split is not stratified.
Train, Test = train_test_split(New_IV_III_downsampled[New_IV_III_downsampled.MIMIC=='III'], test_size=0.3, random_state=42)
Valid, Test = train_test_split(Test, test_size=0.25, random_state=42)
print(Train.shape,Valid.shape,Test.shape)

In [None]:
New_all_Test = pd.concat([New_all_Test,Test])
New_all_Test.shape

In [None]:
# Split the MIMIC-IV rows: 70% Train, then the remaining 30% is divided 75/25 into
# Valid (22.5% of the rows) and Test (7.5% of the rows). The split is not stratified.
# This reuses (and overwrites) the Train / Valid / Test names of the MIMIC-III split.
Train, Test = train_test_split(New_IV_III_downsampled[New_IV_III_downsampled.MIMIC=='IV'], test_size=0.3, random_state=42)
Valid, Test = train_test_split(Test, test_size=0.25, random_state=42)
print(Train.shape,Valid.shape,Test.shape)

In [None]:
New_all_Test = pd.concat([New_all_Test,Test])
New_all_Test.shape

In [None]:
New_all_Test[New_all_Test.MIMIC=='III'].shape

In [None]:
New_all_Test[New_all_Test.MIMIC=='IV'].shape

In [None]:
New_all_Test.to_csv('/!New_all_Test.csv',index=False)

In [None]:
New_all_Test['ICUSTAY_ID'].duplicated().any()

In [None]:
for i in label:
    print(i,Counter(New_all_Test[i]), New_all_Test[i].mean())

## ML

In [3]:
class BaseModel:
    """Namespace for building the baseline classifiers and scoring them fold by fold.

    All methods are static; the class keeps no state.
    """

    @staticmethod
    def get_classifier(model_type='random_forest',label_column=None, params=None):
        """Return an unfitted classifier for ``model_type``.

        Parameters
        ----------
        model_type : str
            One of 'Random Forest', 'Decision Tree', 'Adaboost', 'XGboost', 'LR', 'SVM',
            'MLP' or 'ensemble'. Matching ignores letter case and treats '_' as a space,
            so the default 'random_forest' resolves to the random forest.
        label_column : object, optional
            Not used; kept so existing positional calls keep working.
        params : dict, optional
            Keyword arguments forwarded to the estimator constructor. Ignored for
            'ensemble', whose members are created with fixed settings.

        Raises
        ------
        ValueError
            If ``model_type`` is not one of the supported names.
        """
        if params is None:
            params = {}
        key = str(model_type).strip().lower().replace('_', ' ')
        if key == 'random forest':
            return RandomForestClassifier(**params)
        elif key == 'decision tree':
            return DecisionTreeClassifier(**params)
        elif key == 'adaboost':
            return AdaBoostClassifier(**params)
        elif key == 'xgboost':
            return XGBClassifier(**params)
        elif key == 'lr':
            return LogisticRegression(**params)
        elif key == 'svm':
            return SVC(**params)
        elif key == 'mlp':
            return MLPClassifier(**params)
        elif key == 'ensemble':
            base_models = [
                ('rf', RandomForestClassifier()),
                ('xgb', XGBClassifier(eval_metric='mlogloss'))
            ]
            # Rename to model_0, model_1, ... while keeping the estimator itself as the second
            # element; VotingClassifier expects (name, estimator) pairs.
            base_models = [(f'model_{i}', estimator) for i, (_, estimator) in enumerate(base_models)]
            return VotingClassifier(estimators=base_models, voting='soft', n_jobs=1)
        else:
            raise ValueError(f"Unknown model_type: {model_type}")

    @staticmethod
    def _feature_importance(est, n_features):
        """Return one importance value per feature for a fitted estimator.

        Uses ``feature_importances_`` when present, otherwise the absolute value of ``coef_``
        (averaged over rows when ``coef_`` is 2-D, as it is for LogisticRegression), otherwise zeros.
        """
        if hasattr(est, 'feature_importances_'):
            return np.asarray(est.feature_importances_)
        if hasattr(est, 'coef_'):
            coef = np.asarray(est.coef_)
            if coef.ndim == 1:
                return np.abs(coef)
            if coef.ndim == 2:
                return np.abs(coef).mean(axis=0)
        return np.zeros(n_features)

    @staticmethod
    def _ranking_scores(est, X, y_pred):
        """Return ``(scores, is_probability)`` used for the ROC / PR curves.

        Prefers the second column of ``predict_proba``; falls back to ``decision_function``
        and finally to an all-zero placeholder shaped like ``y_pred``.
        """
        try:
            return est.predict_proba(X)[:, 1], True
        except AttributeError:
            pass
        try:
            return np.asarray(est.decision_function(X)), False
        except AttributeError:
            return np.zeros_like(y_pred), False

    @staticmethod
    def bootstrap(train, valid, test, label_column, models, n=10, confidence=0.95):
        """Score each model over ``n`` stratified folds of ``train`` and summarise the spread.

        Despite the name, nothing is resampled with replacement: ``train`` is partitioned with
        ``StratifiedKFold(n_splits=n, shuffle=True, random_state=42)``. For every fold a fresh
        estimator is fitted on the in-fold rows and evaluated on ``test`` plus that fold's
        held-out rows. The reported intervals are percentiles of the per-fold metric values.

        The ROC / PR code takes the second ``predict_proba`` column, i.e. it assumes a binary label.

        Parameters
        ----------
        train, test : pandas.DataFrame
            Must contain ``label_column``; every other column of ``train`` is used as a feature
            and must also be present in ``test``.
        valid : pandas.DataFrame
            Not used by any supported model; kept for interface compatibility.
        label_column : str
            Name of the target column.
        models : list of (str, dict)
            ``(model_type, params)`` pairs understood by ``get_classifier``.
        n : int
            Number of folds.
        confidence : float
            Central mass of the reported percentile interval (0.95 gives the 2.5th-97.5th percentiles).

        Returns
        -------
        all_AUROC, all_AUPRC : list of dict
            One dict per model with keys 'model', 'mean', 'ci_lower', 'ci_upper' and 'curves'
            (the per-fold curve points and areas).
        all_scores_summary : pandas.DataFrame
            One row per model; each metric is the string "mean (lower-upper)".
            'Brier' is NaN for folds where the estimator exposes no ``predict_proba``.
        all_importances_df : pandas.DataFrame
            'Feature' plus one column per model holding the importance averaged over folds.

        Raises
        ------
        ValueError
            If ``label_column`` is missing from ``train`` or ``test``, or ``models`` is empty.
        """
        # Validate inputs
        if label_column not in train.columns or label_column not in test.columns:
            raise ValueError(f"Label column '{label_column}' not found in train or test DataFrame.")
        models = list(models)
        if not models:
            raise ValueError("`models` must contain at least one (model_type, params) pair.")

        # Interval bounds depend only on `confidence`, so compute them once up front.
        lower_bound = (1 - confidence) / 2
        upper_bound = 1 - lower_bound

        skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42)
        features = train.columns.difference([label_column])

        all_scores = []
        all_importances = []
        all_AUROC = []
        all_AUPRC = []

        for model_type, params in models:
            fold_scores = []
            fold_importances = []
            this_AUROC = []
            this_AUPRC = []

            folds = tqdm(skf.split(train, train[label_column]), total=n, desc=f'Model: {model_type}', ascii=True)
            for train_index, val_index in folds:
                fit_rows = train.iloc[train_index]
                held_out = train.iloc[val_index]
                X_train = fit_rows[features]
                y_train = fit_rows[label_column]
                # Evaluation set = fixed test rows + this fold's held-out training rows.
                X_test = pd.concat([test[features], held_out[features]])
                y_test = pd.concat([test[label_column], held_out[label_column]])

                # Initialize and train the model
                est = BaseModel.get_classifier(model_type, label_column, params)
                est.fit(X_train, y_train)

                # Hard predictions plus a continuous score for the curve-based metrics
                y_pred = est.predict(X_test)
                y_score, has_proba = BaseModel._ranking_scores(est, X_test, y_pred)

                # Evaluate AUROC
                fpr, tpr, _ = roc_curve(y_test, y_score)
                roc_auc = auc(fpr, tpr)
                this_AUROC.append({'roc_auc': roc_auc, 'fpr': fpr, 'tpr': tpr})

                # Evaluate AUPRC
                precision, recall, _ = precision_recall_curve(y_test, y_score)
                prc_auc = auc(recall, precision)
                this_AUPRC.append({'prc_auc': prc_auc, 'precision': precision, 'recall': recall})

                # Store evaluation metrics; the Brier score is only meaningful for real probabilities.
                fold_scores.append({
                    'ROC_AUC': roc_auc,
                    'PRC_AUC': prc_auc,
                    'Accuracy': accuracy_score(y_test, y_pred),
                    'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
                    'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
                    'F1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
                    'MCC': matthews_corrcoef(y_test, y_pred),
                    'Brier': brier_score_loss(y_test, y_score) if has_proba else np.nan
                })

                fold_importances.append(BaseModel._feature_importance(est, len(features)))

            # Aggregate scores
            all_scores.append(pd.DataFrame(fold_scores))

            # Aggregate feature importance: rows are folds, columns are features.
            mean_importance = pd.DataFrame(fold_importances).mean(axis=0).to_numpy()
            all_importances.append(pd.DataFrame({'Feature': features, f'{model_type}_mean': mean_importance}))

            # Percentile intervals for AUROC and AUPRC across folds
            auroc_vals = [x['roc_auc'] for x in this_AUROC]
            auprc_vals = [x['prc_auc'] for x in this_AUPRC]

            all_AUROC.append({
                'model': model_type,
                'mean': np.mean(auroc_vals),
                'ci_lower': np.percentile(auroc_vals, lower_bound * 100),
                'ci_upper': np.percentile(auroc_vals, upper_bound * 100),
                'curves': this_AUROC
            })
            all_AUPRC.append({
                'model': model_type,
                'mean': np.mean(auprc_vals),
                'ci_lower': np.percentile(auprc_vals, lower_bound * 100),
                'ci_upper': np.percentile(auprc_vals, upper_bound * 100),
                'curves': this_AUPRC
            })

        # Combine all scores
        all_scores_df = pd.concat([df.assign(Model=model_type) for (model_type, _), df in zip(models, all_scores)], ignore_index=True)
        all_scores_summary = all_scores_df.groupby('Model').agg(
            {col: lambda x: f"{np.mean(x):.4f} ({np.percentile(x, lower_bound * 100):.4f}-{np.percentile(x, upper_bound * 100):.4f})"
             for col in all_scores_df.columns if col != 'Model'}
        ).reset_index()

        # Combine all importances; every per-model frame carries the same 'Feature' column, keep the first.
        all_importances_df = pd.concat([df.rename(columns={f'{model_type}_mean': model_type}) for (model_type, _), df in zip(models, all_importances)], axis=1)
        all_importances_df = all_importances_df.loc[:, ~all_importances_df.columns.duplicated()]

        return all_AUROC, all_AUPRC, all_scores_summary, all_importances_df

In [None]:
# Reload the combined downsample written earlier with to_csv (same path).
III_IV = pd.read_csv('/New_IV_III_downsampled.csv')
print(III_IV.shape)

# Feature-name groups. `groups` is prepended to both diagnosis lists defined below.
basic = ['GENDER', 'ADMISSION_TYPE', 'FIRST_CAREUNIT', 'AGE']
groups = ['Group_Va_uti','Group_AKI', 'Group_CKD', 'Group_PCOS', 'Group_Neoplasm_ovary', 'Group_Endometriosis', 'Group_Leiomyoma']

full_Diag = groups + ['Diag_Acinetobacter spp.', 'Diag_Enterobacteriaceae','Diag_Enterococcus spp.', 'Diag_Pseudomonas aeruginosa', 'Diag_Staphylococcus aureus','Diag_MDRObeforeICU','Diag_ATE_filtered','Diag_Acute respiratory failure','Diag_Do not resuscitate status', 'Diag_Encounter for palliative care','Diag_Hyperosmolality and|or hypernatremia','Diag_MYOCARDIAL INFARCT', 'Diag_CONGESTIVE HEART FAILURE', 'Diag_PERIPHERAL VASCULAR DISEASE', 'Diag_CEREBROVASCULAR DISEASE', 'Diag_DEMENTIA', 'Diag_CHRONIC PULMONARY DISEASE', 'Diag_RHEUMATIC DISEASE', 'Diag_PEPTIC ULCER DISEASE', 'Diag_MILD LIVER DISEASE', 'Diag_DIABETES WITHOUT CC', 'Diag_DIABETES WITH CC', 'Diag_PARAPLEGIA', 'Diag_RENAL DISEASE', 'Diag_MALIGNANT CANCER', 'Diag_SEVERE LIVER DISEASE', 'Diag_METASTATIC SOLID TUMOR', 'Diag_AIDS', 'Diag_CHARLSON COMORBIDITY INDEX','Diag_9_2113', 'Diag_9_39891', 'Diag_9_43310', 'Diag_9_5570', 'Diag_9_64891', 'Diag_9_66411', 'Diag_9_6930', 'Diag_9_7464', 'Diag_9_7470', 'Diag_9_86121', 'Diag_9_99702', 'Diag_9_9971', 'Diag_9_9973', 'Diag_9_99731', 'Diag_9_9974', 'Diag_9_9975', 'Diag_10_A04', 'Diag_10_A40', 'Diag_10_B15', 'Diag_10_B20', 'Diag_10_B37', 'Diag_10_B95', 'Diag_10_C22', 'Diag_10_C33', 'Diag_10_C61', 'Diag_10_C78', 'Diag_10_C79', 'Diag_10_C82', 'Diag_10_D50', 'Diag_10_D61', 'Diag_10_D64', 'Diag_10_D66', 'Diag_10_D69', 'Diag_10_D70', 'Diag_10_E11', 'Diag_10_E22', 'Diag_10_E28', 'Diag_10_E43', 'Diag_10_E44', 'Diag_10_E66', 'Diag_10_E78', 'Diag_10_E83', 'Diag_9_E8490', 'Diag_9_E8497', 'Diag_9_E8498', 'Diag_9_E8499', 'Diag_10_E87', 'Diag_9_E8780', 'Diag_9_E8781', 'Diag_9_E8782', 'Diag_9_E8786', 'Diag_9_E8788', 'Diag_9_E8790', 'Diag_9_E8798', 'Diag_9_E8859', 'Diag_10_E89', 'Diag_9_E9320', 'Diag_9_E9331', 'Diag_9_E9342', 'Diag_9_E9478', 'Diag_10_F04', 'Diag_10_F05', 'Diag_10_F10', 'Diag_10_F19', 'Diag_10_F30', 'Diag_10_F32', 'Diag_10_F41', 'Diag_10_F43', 'Diag_10_G20', 'Diag_10_G30', 'Diag_10_G35', 'Diag_10_G40', 'Diag_10_G43', 'Diag_10_G44', 'Diag_10_G47', 'Diag_10_G60', 
'Diag_10_G61', 'Diag_10_G81', 'Diag_10_G89', 'Diag_10_G93', 'Diag_10_G97', 'Diag_10_H35', 'Diag_10_H40', 'Diag_10_I07', 'Diag_10_I08', 'Diag_10_I10', 'Diag_10_I12', 'Diag_10_I20', 'Diag_10_I21', 'Diag_10_I24', 'Diag_10_I25', 'Diag_10_I26', 'Diag_10_I27', 'Diag_10_I31', 'Diag_10_I33', 'Diag_10_I34', 'Diag_10_I42', 'Diag_10_I44', 'Diag_10_I47', 'Diag_10_I50', 'Diag_10_I60', 'Diag_10_I61', 'Diag_10_I62', 'Diag_10_I66', 'Diag_10_I67', 'Diag_10_I69', 'Diag_10_I70', 'Diag_10_I71', 'Diag_10_I73', 'Diag_10_I82', 'Diag_10_I85', 'Diag_10_I95', 'Diag_10_J15', 'Diag_10_J41', 'Diag_10_J43', 'Diag_10_J44', 'Diag_10_J45', 'Diag_10_J69', 'Diag_10_J84', 'Diag_10_J93', 'Diag_10_J95', 'Diag_10_J98', 'Diag_10_K22', 'Diag_10_K29', 'Diag_10_K31', 'Diag_10_K41', 'Diag_10_K56', 'Diag_10_K57', 'Diag_10_K59', 'Diag_10_K66', 'Diag_10_K70', 'Diag_10_K75', 'Diag_10_K76', 'Diag_10_K80', 'Diag_10_K85', 'Diag_10_K91', 'Diag_10_K92', 'Diag_10_L03', 'Diag_10_L89', 'Diag_10_M06', 'Diag_10_M10', 'Diag_10_M15', 'Diag_10_M32', 'Diag_10_M48', 'Diag_10_M60', 'Diag_10_M79', 'Diag_10_M81', 'Diag_10_N05', 'Diag_10_N13', 'Diag_10_N23', 'Diag_10_N28', 'Diag_10_N39', 'Diag_10_N72', 'Diag_10_N92', 'Diag_10_O75', 'Diag_10_Q20', 'Diag_10_R00', 'Diag_10_R06', 'Diag_10_R09', 'Diag_10_R10', 'Diag_10_R19', 'Diag_10_R45', 'Diag_10_R58', 'Diag_10_R63', 'Diag_10_R68', 'Diag_10_R69', 'Diag_10_R78', 'Diag_10_S01', 'Diag_10_S06', 'Diag_10_S12', 'Diag_10_S27', 'Diag_10_T78', 'Diag_10_T81', 'Diag_10_T82', 'Diag_9_V1251', 'Diag_9_V1259', 'Diag_9_V420', 'Diag_9_V422', 'Diag_9_V433', 'Diag_9_V4365', 'Diag_9_V440', 'Diag_9_V441', 'Diag_9_V462', 'Diag_9_V6284', 'Diag_10_Z00', 'Diag_10_Z16', 'Diag_10_Z21', 'Diag_10_Z23', 'Diag_10_Z51', 'Diag_10_Z80', 'Diag_10_Z81', 'Diag_10_Z85', 'Diag_10_Z87', 'Diag_10_Z91', 'Diag_10_Z95', 'Diag_Personal history of antineoplastic chemotherapy', 'Diag_Personal history of pulmonary embolism', 'Diag_Personal history of transient ischemic attack (TIA), and cerebral infarction without residual 
deficits', 'Diag_Pneumonia, unspecified organism', 'Diag_10_A419', 'Diag_10_R6521', 'Diag_10_Z794', 'Diag_10_D62']
full_Proc = ['Pro_0BH17EZ', 'Pro_02H633Z', 'Pro_0DJ08ZZ', 'Pro_3E0G76Z', 'Pro_5A1955Z', 'Pro_0W9G3ZX', 'Pro_B211YZZ','Pro_0014', 'Pro_0017', 'Pro_0040', 'Pro_0045', 'Pro_0066', 'Pro_0159', 'Pro_0331', 'Pro_0741', 'Pro_311', 'Pro_3322', 'Pro_3323', 'Pro_3324', 'Pro_3404', 'Pro_3491', 'Pro_3521', 'Pro_3522', 'Pro_3606', 'Pro_3607', 'Pro_3611', 'Pro_3612', 'Pro_3613', 'Pro_3615', 'Pro_3721', 'Pro_3722', 'Pro_3723', 'Pro_3761', 'Pro_3845', 'Pro_387', 'Pro_3891', 'Pro_3893', 'Pro_3895', 'Pro_3897', 'Pro_3899', 'Pro_3950', 'Pro_3961', 'Pro_3995', 'Pro_4311', 'Pro_4443', 'Pro_4513', 'Pro_4516', 'Pro_4523', 'Pro_5185', 'Pro_5459', 'Pro_5491', 'Pro_7309', 'Pro_734', 'Pro_7359', 'Pro_7569', 'Pro_8051', 'Pro_8162', 'Pro_8622', 'Pro_8659', 'Pro_8744', 'Pro_8841', 'Pro_8842', 'Pro_8847', 'Pro_8848', 'Pro_8852', 'Pro_8853', 'Pro_8855', 'Pro_8856', 'Pro_8872', 'Pro_8938', 'Pro_8964', 'Pro_9390', 'Pro_9604', 'Pro_9605', 'Pro_9607', 'Pro_966', 'Pro_9671', 'Pro_9672', 'Pro_9904', 'Pro_9905', 'Pro_9907', 'Pro_9910', 'Pro_9915', 'Pro_9920', 'Pro_9925', 'Pro_9955', 'Pro_9960', 'Pro_9962']

Diag = groups + ['Diag_MDRObeforeICU', 'Diag_ATE_filtered', 'Diag_Acute respiratory failure', 'Diag_Do not resuscitate status', 'Diag_Encounter for palliative care', 'Diag_Hyperosmolality and|or hypernatremia', 'Diag_CONGESTIVE HEART FAILURE', 'Diag_DEMENTIA', 'Diag_RHEUMATIC DISEASE', 'Diag_RENAL DISEASE', 'Diag_MALIGNANT CANCER', 'Diag_METASTATIC SOLID TUMOR', 'Diag_AIDS', 'Diag_CHARLSON COMORBIDITY INDEX', 'Diag_MYOCARDIAL INFARCT', 'Diag_DIABETES WITHOUT CC', 'Diag_DIABETES WITH CC', 'Diag_CHRONIC PULMONARY DISEASE', 'Diag_SEVERE LIVER DISEASE', 'Diag_10_B37', 'Diag_10_D62', 'Diag_10_R00', 'Diag_10_D66', 'Diag_10_T81', 'Diag_MILD LIVER DISEASE', 'Diag_10_G20', 'Diag_10_I10', 'Diag_10_K70', 'Diag_9_V440', 'Diag_10_R6521', 'Diag_10_D50', 'Diag_10_K75', 'Diag_10_A419', 'Diag_10_E43', 'Diag_10_J69', 'Diag_9_99731', 'Diag_10_A40', 'Diag_10_C79', 'Diag_10_K76', 'Diag_10_I82', 'Diag_10_E11', 'Diag_10_E87', 'Diag_10_F05', 'Diag_10_I50', 'Diag_9_E8780', 'Diag_10_J44', 'Diag_10_N28', 'Diag_10_D70', 'Diag_10_I21', 'Diag_10_Z95', 'Diag_10_J93', 'Diag_10_I26', 'Diag_10_G35', 'Diag_9_E9342', 'Diag_9_V422', 'Diag_10_J15', 'Diag_10_E89', 'Diag_10_G97']
Proc = ['Pro_9672', 'Pro_9671', 'Pro_9960', 'Pro_5491', 'Pro_3961', 'Pro_3897', 'Pro_02H633Z', 'Pro_0BH17EZ', 'Pro_3895', 'Pro_0DJ08ZZ', 'Pro_3491', 'Pro_966', 'Pro_5A1955Z', 'Pro_3E0G76Z', 'Pro_3995', 'Pro_8964', 'Pro_3522', 'Pro_3521', 'Pro_0066', 'Pro_3611', 'Pro_3607', 'Pro_3612', 'Pro_3613', 'Pro_3615', 'Pro_9904', 'Pro_9905', 'Pro_9910']

Med = ['Med_Acetaminophen', 'Med_Amiodarone', 'Med_Ampicillin-Sulbactam', 'Med_Aspirin', 'Med_Atorvastatin', 'Med_Azithromycin', 'Med_Calcium Gluconate', 'Med_Cefepime', 'Med_Ceftazidime', 'Med_Ceftriaxone', 'Med_Ciprofloxacin', 'Med_Clindamycin', 'Med_Dexamethasone', 'Med_Dexmedetomidine', 'Med_Diltiazem', 'Med_Famotidine', 'Med_Fentanyl Citrate', 'Med_Folic Acid', 'Med_Furosemide', 'Med_Gabapentin', 'Med_Glucagon', 'Med_Haloperidol', 'Med_Heparin', 'Med_Hydromorphone', 'Med_Insulin', 'Med_Lactulose', 'Med_Levetiracetam', 'Med_Levofloxacin', 'Med_Linezolid', 'Med_Lorazepam', 'Med_Magnesium Sulfate', 'Med_Meropenem', 'Med_Metoclopramide', 'Med_Metoprolol', 'Med_Metronidazole', 'Med_Midazolam', 'Med_Morphine Sulfate', 'Med_Nitroglycerin', 'Med_Norepinephrine', 'Med_Omeprazole', 'Med_Ondansetron', 'Med_Oxycodone', 'Med_Pantoprazole', 'Med_Phenylephrine', 'Med_Piperacillin-Tazobactam', 'Med_Potassium Chloride', 'Med_Prednisone', 'Med_Propofol', 'Med_Ranitidine', 'Med_Simvastatin', 'Med_Sodium Bicarbonate', 'Med_Tacrolimus', 'Med_Thiamine', 'Med_Vancomycin', 'Med_Warfarin']
TS = ['TS_Heart Rate', 'TS_NSBP', 'TS_NDBP', 'TS_NMBP', 'TS_Respiratory_Rate', 'TS_Temperature_C', 'TS_O2 Flow', 'TS_SpO2 Desat Limit', 'TS_Admission_Weight', 'TS_SpO2', 'TS_Pain Level Response', 'TS_Pain', 'TS_Respiratory_Rate_Total', 'TS_Total_PEEP', 'TS_History of falling (within 3 mnths)', 'TS_Calculated Total CO2', 'TS_Base Excess', 'TS_pCO2', 'TS_pH', 'TS_Lactate', 'TS_PaO2', 'TS_Lactic Acid', 'TS_Self ADL', 'TS_ETOH', 'TS_ASBP', 'TS_ADBP', 'TS_MAP', 'TS_Arterial_PaCO2', 'TS_PH (Arterial)', 'TS_Arterial Base Excess', 'TS_Ionized_Calcium', 'TS_Arterial_TCO2', 'TS_Potassium (whole blood)', 'TS_Glucose_WholeBlood', 'TS_PEEP_Set', 'TS_Daily Weight', 'TS_Resp_Spontaneous', 'TS_Peak Insp. Pressure', 'TS_Mean Airway Pressure', 'TS_Visual / hearing deficit', 'TS_Mental status', 'TS_CAM-ICU MS Change', 'TS_Skin Care', 'TS_Back Care', 'TS_Plateau Pressure', 'TS_SaO2', 'TS_FiO2', 'TS_Hemoglobin', 'TS_Hematocrit (serum)', 'TS_WBC', 'TS_Chloride (serum)', 'TS_Creatinine', 'TS_Glucose (serum)', 'TS_Magnesium', 'TS_Sodium (serum)', 'TS_BUN', 'TS_Calcium non-ionized', 'TS_Phosphorous', 'TS_Anion_Gap', 'TS_Potassium (serum)', 'TS_HCO3 (serum)', 'TS_PLT', 'TS_PT', 'TS_PTT', 'TS_INR', 'TS_Hematocrit, Calculated', 'TS_Hematocrit', 'TS_HCO3', 'TS_Creatine Kinase, MB Isoenzyme', 'TS_Phosphate', 'TS_Calcium', 'TS_Chloride', 'TS_Glucose', 'TS_Potassium', 'TS_Sodium', 'TS_Alkaline Phosphatase', 'TS_AST', 'TS_LDH', 'TS_ALT', 'TS_Alkaline Phosphate', 'TS_Creatine_Kinase', 'TS_Bilirubin_total', 'TS_Troponin-T', 'TS_Albumin', 'TS_RDW', 'TS_RBC', 'TS_INR(PT)', 'TS_MCV', 'TS_Basophils', 'TS_Eosinophils', 'TS_Monocytes', 'TS_Neutrophils', 'TS_MCHC', 'TS_MCH', 'TS_Lymphocytes', 'TS_Strength L Arm', 'TS_Strength L Leg', 'TS_Strength R Leg', 'TS_Strength R Arm', 'TS_Differential-Lymphs', 'TS_Potassium_WholeBlood', 'TS_Sodium_WholeBlood', 'TS_Pain_Presence', 'TS_Tidal Volume (observed)', 'TS_Inspiratory Time', 'TS_Specific Gravity', 'TS_Difficulty swallowing', 'TS_Differential-Neuts', 
'TS_Fingerstick_Glucose', 'TS_Unintentional weight loss >10 lbs.', 'TS_Oxygen', 'TS_Bed Bath', 'TS_CAM-ICU Inattention', 'TS_PH (Venous)', 'TS_GCS_Total']

label = ['LOS_Hospital','DIEINHOSPITAL','Readmission_30','Multiple_ICUs','sepsis_all', 'FirstICU24_AKI_ALL','ICU_within_12hr_of_admit']

print('Diag:',len(Diag), 'full_Diag:', len(full_Diag),'Proc:',len(Proc),'full_Proc:',len(full_Proc),'Med:',len(Med),'TS:',len(TS),'label:',len(label))


# Columns treated as numeric (min-max scaled): all medication and time-series features plus AGE.
num_v = Med + TS + ['AGE']

# Columns treated as categorical (label-encoded); note that the outcome columns in `label` are included.
cate_v = ['GENDER','ADMISSION_TYPE','FIRST_CAREUNIT'] + Diag + Proc + label


# Missing diagnosis / procedure / medication values become 0.
III_IV[Diag + Proc + Med] = III_IV[Diag + Proc + Med].fillna(0)

# Missing time-series values become the column median. The median is taken over every row of
# III_IV, i.e. before runbase() splits the data, so held-out rows contribute to the fill value.
III_IV[TS] = III_IV[TS].fillna(III_IV[TS].median())

def sclar_coder(rundf,num_v,cate_v):
    """Min-max scale the numeric columns and integer-encode the categorical columns of ``rundf``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    rundf : pandas.DataFrame
        Frame holding every column named in ``num_v`` and ``cate_v``.
    num_v : list of str
        Columns rescaled together with a single ``MinMaxScaler`` fit.
    cate_v : list of str
        Columns replaced by ``LabelEncoder`` codes; the encoder is re-fitted for each column.
    """
    # NOTE(review): this repeats the `sclar_coder` definition near the top of the notebook;
    # whichever cell runs last is the one in effect.
    rundf[num_v] = MinMaxScaler().fit_transform(rundf[num_v])

    # One encoder object is reused; fit_transform re-learns the classes for every column.
    encoder = LabelEncoder()
    for column_name in cate_v:
        encoded = encoder.fit_transform(rundf[column_name])
        rundf[column_name] = encoded

    return rundf

# sclar_coder writes into the frame it receives and returns it, so `rundf` and `III_IV`
# are the same (now scaled and encoded) object after this call.
rundf = sclar_coder(III_IV,num_v,cate_v)
rundf.shape

In [None]:
def runbase(df,thislabel,ft,topf,n=1000,confidence=0.95):
    """Split ``df``, evaluate every model in the global ``models`` list and print the top features.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to model; must contain ``thislabel`` and every column in ``ft``.
    thislabel : str
        Target column.
    ft : list of str
        Feature columns (the list passed in is not modified).
    topf : int
        Number of top-ranked features to print per ranking.
    n : int, optional
        Number of stratified folds passed to ``BaseModel.bootstrap`` (default 1000, the value
        that used to be hard-coded). Every model is refitted once per fold, so lower this for
        a quick run.
    confidence : float, optional
        Interval level passed to ``BaseModel.bootstrap``.

    Returns
    -------
    tuple
        ``(all_AUROC, all_AUPRC, all_scores, all_importances)`` as returned by
        ``BaseModel.bootstrap``; ``all_importances`` gains an 'XGB_RF' column (sum of the
        'XGboost' and 'Random Forest' columns) when both are present.
    """
    ft = ft + [thislabel]
    print(thislabel,Counter(df[thislabel]),'len(ft):',len(ft))

    # 70% Train; the remaining 30% is divided 75/25 into Valid (22.5%) and Test (7.5%).
    Train, Test = train_test_split(df, test_size=0.3, random_state=42)
    Valid, Test = train_test_split(Test, test_size=0.25, random_state=42)
    print(Train.shape,Valid.shape,Test.shape)
    print('Train:',Counter(Train[thislabel]))
    print('Test:',Counter(Test[thislabel]))

    all_AUROC, all_AUPRC, all_scores, all_importances = BaseModel.bootstrap(
       Train[ft], Valid[ft], Test[ft], thislabel, models, n=n, confidence=confidence
    )

    # The combined ranking needs both tree models; skip it when `models` lacks either one.
    if 'XGboost' in all_importances.columns and 'Random Forest' in all_importances.columns:
        all_importances['XGB_RF'] = all_importances['XGboost'] + all_importances['Random Forest']

    # Print each available ranking, then the collected name lists (same order as before).
    rankings = {}
    for column, tag in (('XGboost', 'top_xgboost'), ('Random Forest', 'top_rf'), ('XGB_RF', 'top_XGB_RF')):
        if column in all_importances.columns:
            rankings[tag] = print_top_features(all_importances, D_ICD_DIAGNOSES, P_ICD_procedures, sort_by=column, top_n=topf)

    print('label',thislabel)
    for tag, names in rankings.items():
        print(tag,list(names))

    return all_AUROC, all_AUPRC, all_scores, all_importances

In [None]:
# Feature columns handed to runbase(): all five feature-name lists concatenated.
ft = basic + Diag + Proc + Med + TS
print(len(ft))

# (model_type, constructor kwargs) pairs; runbase() reads this global list.
# The names must match the branches of BaseModel.get_classifier, and runbase() looks up the
# 'XGboost' and 'Random Forest' importance columns by these exact names.
models = [
    ('SVM', {'probability': True}),  # probability=True so that predict_proba is available
    ('LR', {}),
    ('Decision Tree',{}),
    ('Random Forest', {}),
    ('Adaboost',{}),
    ('XGboost', {}),
         ]

In [None]:
print(label)

In [None]:
# Evaluate all models on the DIEINHOSPITAL label, printing the top 5 features per ranking.
# NOTE(review): `mic` is not assigned anywhere in the visible notebook; it must already exist in
# the kernel (presumably 'III' or 'IV', the values of the MIMIC column — confirm). `t_f` is never read.
# NOTE(review): the summary cells are strings of the form "mean (lower-upper)", so highlight_max
# compares text, and for Brier the largest value is not the best one.
t_label = 'DIEINHOSPITAL'
t_f = 'ft'
all_AUROC, all_AUPRC, all_scores, all_importances = runbase(rundf[rundf.MIMIC == mic],t_label,ft,5)
all_scores.style.highlight_max(color='lightgreen', axis=0)

In [None]:
# Evaluate all models on the Readmission_30 label, printing the top 20 features per ranking.
# NOTE(review): `mic` must already exist in the kernel; `t_label` and `t_f` are assigned but the
# call passes the label as a literal and never reads them.
t_label = 'Readmission_30'
t_f = 'ft'
all_AUROC, all_AUPRC, all_scores, all_importances = runbase(rundf[rundf.MIMIC == mic],'Readmission_30',ft,20)
all_scores.style.highlight_max(color='lightgreen', axis=0)

In [None]:
# Evaluate all models on the Multiple_ICUs label, printing the top 5 features per ranking.
# NOTE(review): `mic` must already exist in the kernel; `t_label` and `t_f` are assigned but the
# call passes the label as a literal and never reads them.
t_label = 'Multiple_ICUs'
t_f = 'ft'
all_AUROC, all_AUPRC, all_scores, all_importances = runbase(rundf[rundf.MIMIC == mic],'Multiple_ICUs',ft,5)
all_scores.style.highlight_max(color='lightgreen', axis=0)

In [None]:
# Evaluate all models on the sepsis_all label, printing the top 5 features per ranking.
# NOTE(review): `mic` must already exist in the kernel; `t_label` and `t_f` are assigned but the
# call passes the label as a literal and never reads them.
t_label = 'sepsis_all'
t_f = 'ft'
all_AUROC, all_AUPRC, all_scores, all_importances = runbase(rundf[rundf.MIMIC == mic],'sepsis_all',ft,5)
all_scores.style.highlight_max(color='lightgreen', axis=0)

In [None]:
# Evaluate all models on the FirstICU24_AKI_ALL label, printing the top 5 features per ranking.
# NOTE(review): `mic` must already exist in the kernel; `t_label` and `t_f` are assigned but the
# call passes the label as a literal and never reads them.
t_label = 'FirstICU24_AKI_ALL'
t_f = 'ft'
all_AUROC, all_AUPRC, all_scores, all_importances = runbase(rundf[rundf.MIMIC == mic],'FirstICU24_AKI_ALL',ft,5)
all_scores.style.highlight_max(color='lightgreen', axis=0)

In [None]:
# Evaluate all models on the LOS_Hospital label, printing the top 5 features per ranking.
# NOTE(review): `mic` must already exist in the kernel; `t_label` and `t_f` are assigned but the
# call passes the label as a literal and never reads them.
# NOTE(review): the evaluation code scores the second predict_proba column, which assumes
# LOS_Hospital is a two-class label — confirm.
t_label = 'LOS_Hospital'
t_f = 'ft'
all_AUROC, all_AUPRC, all_scores, all_importances = runbase(rundf[rundf.MIMIC == mic],'LOS_Hospital',ft,5)
all_scores.style.highlight_max(color='lightgreen', axis=0)

In [None]:
# Evaluate all models on the ICU_within_12hr_of_admit label using only the `basic` and `Diag`
# feature lists (no procedure, medication or time-series columns); prints the top 15 features.
# NOTE(review): `mic` must already exist in the kernel; `t_label` and `t_f` are assigned but the
# call passes the label as a literal and never reads them.
t_label = 'ICU_within_12hr_of_admit'
t_f = 'ft'
ft_icu12 = basic + Diag
print(len(ft_icu12))
all_AUROC, all_AUPRC, all_scores, all_importances = runbase(rundf[rundf.MIMIC == mic],'ICU_within_12hr_of_admit',ft_icu12,15)
all_scores.style.highlight_max(color='lightgreen', axis=0)