# Generate Folds for Nested CV on the MIMIC Dataset

In [1]:
from sklearn.model_selection import StratifiedKFold
import mimic_pipeline.utils as utils
import pandas as pd

# Call the project's seeding helper before any data is split.
# NOTE(review): presumably this fixes the global RNG seeds for reproducibility;
# the helper lives in mimic_pipeline.utils and is not visible here — confirm.
utils.seed_everything()

In [2]:
from tqdm import tqdm
import os

# Build stratified 5-fold train/test splits for every disease cohort and write
# each fold to its own pair of CSV files (TRAIN-*/TEST-*).

# Create the output folder up front so the to_csv calls below have a target.
os.makedirs('data/mimic-disease/k-fold-id-no-cmo', exist_ok=True)

for disease in ('akf', 'ami', 'heart_failure', 'sepsis'):
    banner = '*' * 50
    print(f"{banner} {disease} {banner}")

    # Load the cohort and separate the features from the mortality label.
    entire = pd.read_csv(f"data/mimic-disease/{disease}-union-features-id-excluded-cmo.csv")
    X = entire.drop('hospital_expire_flag', axis=1)
    y = entire['hospital_expire_flag']

    # Stratify on the label so each fold keeps roughly the same positive rate.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=utils.SEED)
    for counter, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Re-attach the label as the last column of each split.
        tmp_train = pd.concat([X_train, y_train], axis=1)
        tmp_test = pd.concat([X_test, y_test], axis=1)

        # Sanity checks: the two splits cover every row, and both have 53 columns.
        assert len(tmp_train) + len(tmp_test) == len(entire)
        assert tmp_train.shape[1] == 53 and tmp_test.shape[1] == 53

        # Report the share of positive labels in train, then in test.
        train_pct = 100 * int((tmp_train['hospital_expire_flag'] == 1).sum()) / len(tmp_train)
        print(f"{train_pct:.2f}%")
        test_pct = 100 * int((tmp_test['hospital_expire_flag'] == 1).sum()) / len(tmp_test)
        print(f"{test_pct:.2f}%\n")

        tmp_train.to_csv(
            f"data/mimic-disease/k-fold-id-no-cmo/TRAIN-{disease}-fold{counter}.csv",
            index=False,
        )
        tmp_test.to_csv(
            f"data/mimic-disease/k-fold-id-no-cmo/TEST-{disease}-fold{counter}.csv",
            index=False,
        )

************************************************** akf **************************************************
17.73%
17.67%

17.71%
17.75%

17.71%
17.75%

17.71%
17.75%

17.73%
17.68%

************************************************** ami **************************************************
8.98%
9.01%

8.98%
9.01%

8.98%
9.01%

8.98%
9.01%

9.01%
8.88%

************************************************** heart_failure **************************************************
10.40%
10.36%

10.38%
10.43%

10.38%
10.43%

10.40%
10.37%

10.40%
10.37%

************************************************** sepsis **************************************************
22.91%
23.00%

22.91%
23.00%

22.94%
22.88%

22.94%
22.88%

22.94%
22.88%

