In [41]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from cache_em_all import Cachable

import os

In [4]:
DATA_DIR = "../data/raw/training" # Path to the data

# Names of all columns in the data that contain physiological data
physiological_cols = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets']

# Names of all columns in the data that contain demographic data
demographic_cols = ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS']

# The combination of physiological and demographic data is what we will use as features in our model
feature_cols = physiological_cols + demographic_cols

# The name of the column that contains the value we are trying to predic
label_col = "SepsisLabel"

# Pre-calculated means and standard deviation of all physiological and demographic columns. We will use this to normalize
# data using their z-score. This isn't as important for simpler models such as random forests and decision trees,
# but can result in significant improvements when using neural networks
physiological_mean = np.array([
        83.8996, 97.0520,  36.8055,  126.2240, 86.2907,
        66.2070, 18.7280,  33.7373,  -3.1923,  22.5352,
        0.4597,  7.3889,   39.5049,  96.8883,  103.4265,
        22.4952, 87.5214,  7.7210,   106.1982, 1.5961,
        0.6943,  131.5327, 2.0262,   2.0509,   3.5130,
        4.0541,  1.3423,   5.2734,   32.1134,  10.5383,
        38.9974, 10.5585,  286.5404, 198.6777])
physiological_std = np.array([
        17.6494, 3.0163,  0.6895,   24.2988, 16.6459,
        14.0771, 4.7035,  11.0158,  3.7845,  3.1567,
        6.2684,  0.0710,  9.1087,   3.3971,  430.3638,
        19.0690, 81.7152, 2.3992,   4.9761,  2.0648,
        1.9926,  45.4816, 1.6008,   0.3793,  1.3092,
        0.5844,  2.5511,  20.4142,  6.4362,  2.2302,
        29.8928, 7.0606,  137.3886, 96.8997])
demographic_mean = np.array([60.8711, 0.5435, 0.0615, 0.0727, -59.6769, 28.4551])
demographic_std = np.array([16.1887, 0.4981, 0.7968, 0.8029, 160.8846, 29.5367])

@Cachable("flattened.csv")
def flatten(in_df, hours=4):
    res = []

    new_cols = []
    for i in range(hours):
        new_cols.append([c + "_" + str(i) for c in feature_cols])


    df = in_df.sort_values("hours")
    for patient, _df in df.groupby("patient"):
        n = int(len(_df) / hours)

        for i in range(n):
            window = _df.iloc[i*hours:(i+1)*hours]
            window_dict = {}

            for j in range(hours):
                for c in physiological_cols:
                    window_dict[c + "_" + str(j)] = window[c].iloc[j]

            for c in demographic_cols:
                window_dict[c] = window[c].iloc[0]

            window_dict[label_col] = window[label_col].mean()
            window_dict["patient"] = patient

            res.append(window_dict)

    res = pd.DataFrame(res)

    res = res[res[label_col] <= 1 / hours]
    res[label_col] = res[label_col].apply(lambda x: 1 if x else 0)

    return res

In [5]:
def load_single_file(file_path):
    df = pd.read_csv(file_path, sep='|')
    df['hours'] = df.index
    df['patient'] = file_path[22:-4]
    return df
    
    

In [6]:
def get_data_files():
    return [os.path.join(DATA_DIR, x) for x in sorted(os.listdir(DATA_DIR)) if int(x[1:-4]) % 5 > 0]

def clean_data(data):
    data.reset_index(inplace=True, drop=True)

    # Normalizes physiological and demographic data using z-score.
    data[physiological_cols] = (data[physiological_cols] - physiological_mean) / physiological_std
    data[demographic_cols] = (data[demographic_cols] - demographic_mean) / demographic_std

    # Maps invalid numbers (NaN, inf, -inf) to numbers (0, really large number, really small number)
    data[feature_cols] = np.nan_to_num(data[feature_cols])

    return data


@Cachable("data.csv")
def load_data():
    data = get_data_files()
    data_frames = [clean_data(load_single_file(d)) for d in data]
    merged = pd.concat(data_frames)
    return merged
        
    


In [7]:
def evaluate(actual, predicted, prefix=""):
    precision = precision_score(actual, predicted)
    recall = recall_score(actual, predicted)
    accuracy = accuracy_score(actual, predicted)

    print("%s Precision: %.3f%%, Recall: %.3f%%, Accuracy: %.3f%%" % (prefix, precision * 100, recall * 100, accuracy * 100))

def train_simple(data, feature_cols, label_col):
    train_df, test_df = train_test_split(data, test_size=0.2)
    train_X = train_df[feature_cols]
    train_y = train_df[label_col]
    test_X = test_df[feature_cols]
    test_y = test_df[label_col]
    clf = RandomForestClassifier()
    clf.fit(train_X,train_y)
    y_pred_train = clf.predict(train_X)
    y_pred_test = clf.predict(test_X)
    evaluate(train_y,y_pred_train,'train')
    evaluate(test_y,y_pred_test,'test')
    

In [8]:
# def train_stratified(data, feature_cols, label_col, stratify_col):
#     X = data[feature_cols]
#     y = data[label_col]
#     group = data[stratify_col]

#     train_pred = []
#     train_actual = []

#     test_pred = []
#     test_actual = []

#     kf = GroupKFold(n_splits=5)
#     for train_idx, test_idx in kf.split(X, y, group):
#         X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
#         X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

#         clf = MLPClassifier()
#         clf.fit(X_train, y_train)

#         train_pred.extend(clf.predict(X_train))
#         train_actual.extend(y_train)

#         test_pred.extend(clf.predict(X_test))
#         test_actual.extend(y_test)



#     evaluate(train_actual, train_pred, "Train")
#     evaluate(test_actual, test_pred, "Test")

In [22]:
def train_stratified(data, feature_cols, label_col, stratify_col):
    X = data[feature_cols]
    y = data[label_col]
    group = data[stratify_col]

    train_pred = []
    train_actual = []

    test_pred = []
    test_actual = []

    kf = StratifiedKFold(n_splits=2)
    for train_idx, test_idx in kf.split(X, y, group):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
        return X_train, X_test, y_train, y_test
#         clf = RandomForestClassfier()
#         clf.fit(X_train, y_train)
#         train_pred.extend(clf.predict(X_train))
#         train_actual.extend(y_train)

#         test_pred.extend(clf.predict(X_test))
#         test_actual.extend(y_test)



#     evaluate(train_actual, train_pred, "Train")
#     evaluate(test_actual, test_pred, "Test")

In [7]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=2)
groups = np.array([0, 0, 2, 2])
skf.get_n_splits(X, y,groups)

print(skf)

for train_index, test_index in skf.split(X, y):
   print("TRAIN:", train_index, "TEST:", test_index)
   X_train, X_test = X[train_index], X[test_index]
   y_train, y_test = y[train_index], y[test_index]

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]


In [8]:
df = load_data()
df.columns

Index(['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel', 'hours', 'patient'],
      dtype='object')

# Decision Tree

In [61]:
train_simple(df,feature_cols,label_col)

train Precision: 99.982%, Recall: 100.000%, Accuracy: 100.000%
test Precision: 34.469%, Recall: 32.565%, Accuracy: 97.200%


# Random Forest n_estimators=10

In [63]:
train_simple(df,feature_cols,label_col)



train Precision: 99.903%, Recall: 83.526%, Accuracy: 99.637%
test Precision: 87.500%, Recall: 4.345%, Accuracy: 97.911%


# Random Forest n_estimators=100

In [65]:
train_simple(df,feature_cols,label_col)

train Precision: 99.973%, Recall: 99.936%, Accuracy: 99.998%
test Precision: 90.977%, Recall: 4.357%, Accuracy: 97.887%


# MLPClassifier 

In [71]:
train_simple(df,feature_cols,label_col)

train Precision: 67.600%, Recall: 5.011%, Accuracy: 97.864%
test Precision: 33.862%, Recall: 2.350%, Accuracy: 97.795%


In [None]:
train_simple(df,feature_cols,label_col)

# Stratified train test split

In [51]:
train_stratified(df, feature_cols, label_col, 'patient')

Train Precision: 78.186%, Recall: 5.702%, Accuracy: 97.904%
Test Precision: 11.822%, Recall: 1.848%, Accuracy: 97.553%


# Flattened CSV

In [30]:
flat = flatten(df)

In [31]:
flattened_feat_cols = [x for x in flat.columns if x not in [label_col,'patient']]

In [38]:
def train_stratified_balanced(data, feature_cols, label_col):
    
    labels = data.groupby(['patient']).agg({'SepsisLabel':'sum'})['SepsisLabel']
    positive_index = labels.loc[(labels == 1)].index
    size = int(len(positive_index)/2)
    half_positive_index = np.random.choice(positive_index, size, replace=False)
    X_train_full = data[~data['patient'].isin(half_positive_index)]
    train_labels = X_train_full.groupby('patient').agg({'SepsisLabel':'sum'})['SepsisLabel']
    negative_index = train_labels.loc[(train_labels == 0)].index
    size = int(len(negative_index)*0.2)
    small_negative_index = np.random.choice(negative_index, size, replace=False)
    train_df = X_train_full[~X_train_full['patient'].isin(small_negative_index)]
    test_df = data.loc[(data['patient'].isin(half_positive_index)) | (data['patient'].isin(small_negative_index))]
    
    
    train_X = train_df[feature_cols]
    train_y = train_df[label_col]
    test_X = test_df[feature_cols]
    test_y = test_df[label_col]
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(train_X,train_y)
    y_pred_train = clf.predict(train_X)
    y_pred_test = clf.predict(test_X)
    evaluate(train_y,y_pred_train,'train')
    evaluate(test_y,y_pred_test,'test')

In [39]:
train_stratified_balanced(df,feature_cols,label_col)

train Precision: 99.985%, Recall: 99.942%, Accuracy: 99.998%
test Precision: 0.000%, Recall: 0.000%, Accuracy: 99.993%


  'recall', 'true', average, warn_for)


In [27]:
test_y.value_counts()

0    108884
Name: SepsisLabel, dtype: int64

In [24]:
y_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [40]:
train_stratified_balanced(flat,flattened_feat_cols,label_col)

train Precision: 100.000%, Recall: 100.000%, Accuracy: 100.000%
test Precision: 0.000%, Recall: 0.000%, Accuracy: 99.449%


  'precision', 'predicted', average, warn_for)
