In [9]:
import pandas as pd
import numpy as np
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the pipe-delimited train / validation / test CSV files, in that order.
split_paths = (
    '../dataset/train_cleaned.csv',
    '../dataset/val_cleaned.csv',
    '../dataset/test_cleaned.csv',
)
train_df, val_df, test_df = (
    pd.read_csv(csv_path, sep='|') for csv_path in split_paths
)

In [3]:
# Show the column labels of the training frame.
train_df.columns

Index(['patient_id', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'BUN', 'Chloride',
       'Glucose', 'Magnesium', 'Potassium', 'Hct', 'Hgb', 'WBC', 'Age',
       'Gender', 'HospAdmTime', 'ICULOS', 'SepsisLabel'],
      dtype='object')

In [7]:
# Split each frame into features and target. `SepsisLabel` is the target;
# `patient_id` is dropped as well (presumably a row identifier — confirm).
X_train = train_df.drop(columns=['patient_id', 'SepsisLabel'])
y_train = train_df.SepsisLabel
X_val = val_df.drop(columns=['patient_id', 'SepsisLabel'])
y_val = val_df.SepsisLabel

# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer imbalanced-learn releases, and the
# positional form is accepted by both. A fixed `random_state` makes the
# resampling and the fitted trees repeatable across kernel restarts.
cfr = BalancedBaggingClassifier(DecisionTreeClassifier(), random_state=42)
cfr.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on (not a held-out estimate).
cfr.score(X_train, y_train)

0.9081423906394365

In [5]:
# Predict labels for the validation features, then report the F1 score of
# those predictions against the true validation labels.
y_pred = cfr.predict(X_val)
f1_score(y_true=y_val, y_pred=y_pred)

0.13344220226291845

In [18]:
# Repeated stratified 5-fold CV (3 repeats -> 15 ROC AUC scores). The fixed
# `random_state` keeps the fold assignment identical between runs.
# NOTE(review): `cross_val_score` fits fresh clones of `cfr` on folds of the
# validation split, so these scores do not evaluate the already-fitted `cfr`
# — confirm that cross-validating on `X_val` / `y_val` is intended.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
scores = cross_val_score(cfr, X_val, y_val, scoring='roc_auc',
                         cv=cv, n_jobs=-1)

In [19]:
# Average the per-fold ROC AUC values and print the mean to three decimals.
mean_auc = np.mean(scores)
print('ROC AUC: %.3f' % mean_auc)

ROC AUC: 0.919
