In [51]:
import os
import tempfile
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import sklearn_relief as relief

Load Data

In [52]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing.
# Only load this file if it comes from a trusted source.
with open('data/train_valid_test_data.pickle', 'rb') as f:
    train_df, valid_df, test_df = pickle.load(f)

# Features dropped because they have a high percentage of missing values and
# no noticeable distribution differences.
drop_cols = ["FiO2", "Bilirubin_direct", "SBP", "DBP", "Hct", "HospAdmTime", "ICULOS"]


def _drop_unused_columns(df, feature_cols):
    """Return a copy of ``df`` without 'unnamed' columns and without ``feature_cols``.

    Columns whose name contains 'unnamed' (case-insensitive) are removed first,
    then every column listed in ``feature_cols``. The input frame is left
    untouched, so re-running the cell never operates on a half-mutated frame.

    Raises:
        KeyError: if any name in ``feature_cols`` is not a column of ``df``.
    """
    unnamed_cols = df.columns[df.columns.str.contains('unnamed', case=False)]
    return df.drop(columns=unnamed_cols).drop(columns=feature_cols)


# Apply the identical clean-up to every split so their column layouts stay aligned.
train_df, valid_df, test_df = (
    _drop_unused_columns(split_df, drop_cols)
    for split_df in (train_df, valid_df, test_df)
)

Split Data

In [53]:
def split_features_and_labels(df):
    """Split a DataFrame into a feature matrix and a label vector.

    The last column of ``df`` is treated as the label; every column before it
    is treated as a feature. Selection is positional, so frames with repeated
    column names are split correctly (label-based selection would duplicate
    the repeated feature columns).

    Args:
        df: DataFrame whose last column holds the labels.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a new 2-D NumPy array built from all
        columns except the last, and ``y`` holds the values of the last column.

    Raises:
        IndexError: if ``df`` has no columns.
    """
    # np.array(...) makes an explicit copy, so later changes to X never write
    # back into the source frame.
    features = np.array(df.iloc[:, :-1].values)
    labels = df.iloc[:, -1].values
    return features, labels

In [54]:
# Split each dataset (train, validation, test) into its feature matrix and label vector.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = map(
    split_features_and_labels, (train_df, valid_df, test_df)
)

Scale data

In [55]:
# Fit the scaler on the training split only, then reuse it for all three splits.
scaler = StandardScaler().fit(X_train)

# Standardize each split and limit every scaled value to the range [-5, 5].
X_train, X_valid, X_test = (
    np.clip(scaler.transform(split), -5, 5)
    for split in (X_train, X_valid, X_test)
)

Perform Relief feature selection

In [56]:
# Build a Relief selector configured with n_features=15, fit it on the training
# features and labels, and keep the transformed training matrix.
# NOTE(review): presumably n_features is the number of top-weighted features that
# transform keeps -- confirm against the sklearn_relief documentation.
# NOTE(review): no random seed is passed here; if Relief samples instances
# randomly, the weights may differ between runs -- confirm and seed if supported.
r = relief.Relief( n_features = 15) 
X_train_relief = r.fit_transform(X_train, y_train)

In [57]:
print("Feature weights")

# Pair every Relief weight in r.w_ with the train_df column at the same position.
# Each weight is stored in its string form, so convert it with float() before
# doing any numeric comparison on it.
feature_weights = [
    (train_df.columns[position], str(weight))
    for position, weight in enumerate(r.w_)
]

print(feature_weights)

Feature weights
[('HR', '0.618823691869244'), ('O2Sat', '0.34247804020675554'), ('Temp', '0.20328382554129562'), ('MAP', '0.4974743001602473'), ('Resp', '0.5096737690542261'), ('BaseExcess', '0.37214005904284553'), ('HCO3', '0.546790981076548'), ('pH', '0.03171453972884198'), ('PaCO2', '0.19217735742662018'), ('SaO2', '0.11231232313758487'), ('AST', '0.08766250567498025'), ('BUN', '0.4713366953716567'), ('Alkalinephos', '0.26603499617196874'), ('Calcium', '0.21674258891738896'), ('Chloride', '0.34138575520552805'), ('Creatinine', '0.2697789459547937'), ('Glucose', '0.5673901203032414'), ('Lactate', '0.36436933782277026'), ('Magnesium', '0.4341617166529501'), ('Phosphate', '0.3884699622663345'), ('Potassium', '0.5065779543958009'), ('Bilirubin_total', '0.12293902115160801'), ('TroponinI', '0.0239126886965358'), ('Hgb', '0.5391547673585205'), ('PTT', '0.5710638227489825'), ('WBC', '0.5446897769016461'), ('Fibrinogen', '0.1768557560986996'), ('Platelets', '0.6084466747542652'), ('Age', '0

In [65]:
# Number of top-ranked features to keep.
FEATURE_NUMBER = 15

# Rank features from highest to lowest Relief weight (in place, because later
# cells slice feature_weights directly).
# The weights are stored as strings, so they must be compared as numbers:
# plain string comparison is lexicographic and misorders values such as
# '1e-05' (which would sort above '0.6') and negative weights.
feature_weights.sort(key=lambda entry: float(entry[1]), reverse=True)

print("Selected Features")
print(feature_weights[:FEATURE_NUMBER])

Selected Features
[('HR', '0.618823691869244'), ('Platelets', '0.6084466747542652'), ('PTT', '0.5710638227489825'), ('Glucose', '0.5673901203032414'), ('HCO3', '0.546790981076548'), ('WBC', '0.5446897769016461'), ('Hgb', '0.5391547673585205'), ('Age', '0.5262768211538635'), ('Resp', '0.5096737690542261'), ('Potassium', '0.5065779543958009'), ('MAP', '0.4974743001602473'), ('BUN', '0.4713366953716567'), ('Magnesium', '0.4341617166529501'), ('Phosphate', '0.3884699622663345'), ('BaseExcess', '0.37214005904284553')]


In [64]:
# Names of the first FEATURE_NUMBER entries of feature_weights, followed by the
# label column name so the list can be used to select columns from the frames.
top_entries = feature_weights[:FEATURE_NUMBER]
selected_features = [name for name, _weight in top_entries] + ["SepsisLabel"]

print("Selected Features")
print(selected_features)

Selected Features
['HR', 'Platelets', 'PTT', 'Glucose', 'HCO3', 'WBC', 'Hgb', 'Age', 'Resp', 'Potassium', 'MAP', 'BUN', 'Magnesium', 'Phosphate', 'BaseExcess', 'SepsisLabel']
