# Generate Folds for Nested CV on MIMIC dataset

In [None]:
from sklearn.model_selection import StratifiedKFold
import mimic_pipeline.utils as utils
import pandas as pd

# Project helper; presumably fixes the global RNG seeds -- confirm in
# mimic_pipeline.utils.
utils.seed_everything()

# Merge the existing train/test files back into one table so that it can be
# re-partitioned into folds.
train = pd.read_csv("data/TRAIN-union-features.csv")
test = pd.read_csv("data/TEST-union-features.csv")
# ignore_index=True gives every row a unique label. Without it the row labels
# of `test` repeat those of `train`, which makes any later label-based
# alignment ambiguous. Positional (iloc) access is unaffected.
entire = pd.concat([train, test], axis=0, ignore_index=True)

# Separate the label column from the remaining columns.
X, y = entire.drop('hospital_expire_flag', axis=1), entire['hospital_expire_flag']
X.head()

In [None]:
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists before the first fold is written.
os.makedirs("data/k-fold", exist_ok=True)

# Five shuffled, stratified folds; the seed is taken from the project utils
# module.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=utils.SEED)
for counter, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Re-attach the label so it ends up as the last column of each partition.
    tmp_train = pd.concat([X_train, y_train], axis=1)
    tmp_test = pd.concat([X_test, y_test], axis=1)

    # Sanity checks: the two partitions together cover every row, and both
    # have the expected width of 50 columns.
    assert tmp_train.shape[0] + tmp_test.shape[0] == entire.shape[0]
    assert tmp_train.shape[1] == tmp_test.shape[1] == 50

    # Percentage of rows with hospital_expire_flag == 1 in each partition.
    train_positives = int((tmp_train['hospital_expire_flag'] == 1).sum())
    test_positives = int((tmp_test['hospital_expire_flag'] == 1).sum())
    print(f"{100*train_positives / len(tmp_train):.2f}%")
    print(f"{100*test_positives / len(tmp_test):.2f}%\n")

    # Fold numbering in the file names starts at 1.
    tmp_train.to_csv(f"data/k-fold/TRAIN-union-features-fold{counter}.csv", index=False)
    tmp_test.to_csv(f"data/k-fold/TEST-union-features-fold{counter}.csv", index=False)

## Another version with the ID columns included

In [None]:
from sklearn.model_selection import StratifiedKFold
import mimic_pipeline.utils as utils
import pandas as pd

import os

# Project helper; presumably fixes the global RNG seeds -- confirm in
# mimic_pipeline.utils.
utils.seed_everything()

# Same procedure as for the plain feature files, but on the "-id" variant of
# the input files.
train = pd.read_csv("data/TRAIN-union-features-id.csv")
test = pd.read_csv("data/TEST-union-features-id.csv")
# ignore_index=True gives every row a unique label. Without it the row labels
# of `test` repeat those of `train`, which makes any later label-based
# alignment ambiguous. Positional (iloc) access is unaffected.
entire = pd.concat([train, test], axis=0, ignore_index=True)

# Separate the label column from the remaining columns.
X, y = entire.drop('hospital_expire_flag', axis=1), entire['hospital_expire_flag']

# to_csv does not create missing parent directories, so make sure the output
# folder exists before the first fold is written.
os.makedirs("data/k-fold-id", exist_ok=True)

# Five shuffled, stratified folds; the seed is taken from the project utils
# module.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=utils.SEED)
for counter, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Re-attach the label so it ends up as the last column of each partition.
    tmp_train = pd.concat([X_train, y_train], axis=1)
    tmp_test = pd.concat([X_test, y_test], axis=1)

    # Sanity checks: the two partitions together cover every row, and both
    # have the expected width of 53 columns for this variant.
    assert tmp_train.shape[0] + tmp_test.shape[0] == entire.shape[0]
    assert tmp_train.shape[1] == tmp_test.shape[1] == 53

    # Percentage of rows with hospital_expire_flag == 1 in each partition.
    train_positives = int((tmp_train['hospital_expire_flag'] == 1).sum())
    test_positives = int((tmp_test['hospital_expire_flag'] == 1).sum())
    print(f"{100*train_positives / len(tmp_train):.2f}%")
    print(f"{100*test_positives / len(tmp_test):.2f}%\n")

    # Fold numbering in the file names starts at 1.
    tmp_train.to_csv(f"data/k-fold-id/TRAIN-union-features-fold{counter}.csv", index=False)
    tmp_test.to_csv(f"data/k-fold-id/TEST-union-features-fold{counter}.csv", index=False)

Check that the folds with and without the ID columns are identical

In [None]:
# Columns that exist only in the files under data/k-fold-id.
id_cols = ['subject_id', 'hadm_id', 'icustay_id']

for i in range(1, 6):
    # Load fold i from both output folders.
    train_id = pd.read_csv(f"data/k-fold-id/TRAIN-union-features-fold{i}.csv")
    test_id = pd.read_csv(f"data/k-fold-id/TEST-union-features-fold{i}.csv")
    train = pd.read_csv(f"data/k-fold/TRAIN-union-features-fold{i}.csv")
    test = pd.read_csv(f"data/k-fold/TEST-union-features-fold{i}.csv")

    # With the id columns removed, each "-id" partition must match its plain
    # counterpart exactly; assert_frame_equal raises on any difference.
    train_without_ids = train_id.drop(columns=id_cols)
    test_without_ids = test_id.drop(columns=id_cols)
    pd.testing.assert_frame_equal(train_without_ids, train)
    pd.testing.assert_frame_equal(test_without_ids, test)