In [1]:
import numpy as np
import pandas as pd
import os
import pickle 
import logging
from datetime import datetime

from featuretools.selection import remove_low_information_features
from cardea.benchmark import benchmark, aggregate_results_by_pipeline, aggregate_results_by_problem, CLASSIFICATION_METRICS

Path to the directory that contains the feature-matrix files. This local path should be removed once the S3 data source is available.

In [2]:
# Root directory of the pre-computed feature-matrix CSVs; the loaders below read
# from its `fm_ft/` and `fm_me/` sub-folders. Set the CARDEA_FM_DIR environment
# variable to point at a real location without editing the notebook; otherwise
# the placeholder path is used.
fm_dir = os.environ.get('CARDEA_FM_DIR', 'path/to/FeatureMatrices')

## Pipelines

In [3]:
# pipelines

# Candidate pipelines, keyed by the display name that appears in the "Pipeline"
# column of the benchmark results. Each value is the ordered list of primitives:
# a MinMaxScaler followed by one classifier. Commented-out entries are disabled.
pipelines = {
#     'Logistic Regression': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.linear_model.LogisticRegression'],
    'K-Nearest Neighbors': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.neighbors.KNeighborsClassifier'],
    'Random Forest': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.ensemble.RandomForestClassifier'],
#     'Gaussian Naive Bayes': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.naive_bayes.GaussianNB'],
    'Multinomial Naive Bayes': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.naive_bayes.MultinomialNB'],
    'XGB': ['sklearn.preprocessing.MinMaxScaler', 'xgboost.XGBClassifier'],
#     'Stochastic Gradient Descent': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.linear_model.SGDClassifier'],
    'Gradient Boosting': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.ensemble.GradientBoostingClassifier']
}

In [4]:
# datasets

def _is_one_hot_candidate(dtype):
    """Return True for dtypes that ``pd.get_dummies`` expands by default.

    ``pd.get_dummies`` encodes object, string and category columns when no
    explicit ``columns`` argument is given.
    """
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or getattr(dtype, 'name', None) == 'category'
    )


def load_feature_matrix(path, max_categories=None):
    """Load a feature-matrix CSV and return encoded features plus the label.

    The CSV must contain a ``label`` column. The remaining columns are passed
    through ``remove_low_information_features``, missing values are replaced
    with 0, and the result is one-hot encoded with ``pd.get_dummies``.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path or file-like object).
        max_categories: Optional cap on the number of distinct non-missing
            values an object/string/category column may have. Columns above
            the cap are dropped before encoding, which keeps high-cardinality
            columns from producing one dummy column per distinct value.
            ``None`` (the default) keeps every column.

    Returns:
        pandas.DataFrame: the encoded feature columns with the ``label``
        column joined back on the row index.

    Raises:
        KeyError: if the CSV has no ``label`` column.
    """
    df = pd.read_csv(path)
    y = df.pop('label')
    X = remove_low_information_features(df)

    if max_categories is not None:
        too_wide = [
            col for col in X.columns
            if _is_one_hot_candidate(X[col].dtype) and X[col].nunique() > max_categories
        ]
        if too_wide:
            print("#high-cardinality features dropped before one-hot-encoding: {}".format(len(too_wide)))
            X = X.drop(columns=too_wide)

    # Counted after the optional cap, so it is the number of columns actually encoded.
    col_num = len(X.columns)
    X = X.fillna(0)
    X = pd.get_dummies(X)
    print("#features before one-hot-encoding: {}, #features after one-hot-encoding: {}".format(col_num, len(X.columns)))
    return X.join(y)


def load_feature_tool_feature_matrix(problem, max_categories=None):
    """Load ``<fm_dir>/fm_ft/<problem>.csv`` through ``load_feature_matrix``.

    ``max_categories`` is forwarded unchanged; see ``load_feature_matrix``.
    """
    path = os.path.join(fm_dir, "fm_ft", "{}.csv".format(problem))
    return load_feature_matrix(path, max_categories=max_categories)


def load_mimix_extract_feature_matrix(problem, max_categories=None):
    """Load ``<fm_dir>/fm_me/<problem>.csv`` through ``load_feature_matrix``.

    ``max_categories`` is forwarded unchanged; see ``load_feature_matrix``.
    """
    path = os.path.join(fm_dir, "fm_me", "{}.csv".format(problem))
    return load_feature_matrix(path, max_categories=max_categories)

## MIMIC-Extract Datasets + Cardea AutoML

In [5]:
# The prediction problems; each one is read from the CSV of the same name under fm_me/.
problems = ['los', 'mortality', 'readmission']
datasets = {p: load_mimix_extract_feature_matrix(p) for p in problems}

# Sample small datasets for quick testing. The sample size is capped at the
# number of available rows because DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement).
sample_datasets = {
    k: v.sample(n=min(1000, len(v)), random_state=1) for k, v in datasets.items()
}

#features before one-hot-encoding: 317, #features after one-hot-encoding: 362
#features before one-hot-encoding: 317, #features after one-hot-encoding: 362
#features before one-hot-encoding: 317, #features after one-hot-encoding: 362


### Details of Each Pipeline Execution

In [6]:
# Benchmark the pipelines on the sampled datasets, one problem per dataset key
# (target column: 'label'), then display the per-run results table.
results = benchmark(
    pipelines,
    sample_datasets,
    sample_datasets.keys(),
    target_name='label',
    optimize=False,
    runs=1,
    from_fm=True,
)
results

Using TensorFlow backend.
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Accuracy,Confusion Matrix,Elapsed Time(s),F1 Macro,Pipeline,Precision,Problem,Recall,Status,Tuned
0,0.928,"[[92.8, 0.3], [6.9, 0.0]]",5.904974,0.481043,K-Nearest Neightbors,0.465429,los,0.498426,OK,False
1,0.921,"[[92.1, 1.0], [6.9, 0.0]]",9.42863,0.479352,Random Forest,0.465184,los,0.494668,OK,False
2,0.878,"[[86.6, 6.5], [5.7, 1.2]]",0.470823,0.548238,Multinomial Naive Bayes,0.551549,los,0.55115,OK,False
3,0.928,"[[92.8, 0.3], [6.9, 0.0]]",13.666718,0.481241,XGB,0.465409,los,0.498403,OK,False
4,0.93,"[[93.0, 0.1], [6.9, 0.0]]",2.915265,0.481779,Gradient Boosting,0.465465,los,0.499462,OK,False
5,0.897,"[[89.1, 0.8], [9.5, 0.6]]",1.616121,0.515495,K-Nearest Neightbors,0.668504,mortality,0.520214,OK,False
6,0.888,"[[88.2, 1.7], [9.5, 0.6]]",9.026014,0.514588,Random Forest,0.582243,mortality,0.519717,OK,False
7,0.881,"[[84.9, 5.0], [6.9, 3.2]]",0.47643,0.640843,Multinomial Naive Bayes,0.659222,mortality,0.633122,OK,False
8,0.898,"[[88.8, 1.1], [9.1, 1.0]]",13.931663,0.558977,XGB,0.711858,mortality,0.552298,OK,False
9,0.896,"[[89.4, 0.5], [9.9, 0.2]]",2.905289,0.490248,Gradient Boosting,0.500194,mortality,0.508246,OK,False


### Summary of Each Pipeline on Each Problem

In [7]:
# Aggregate the benchmark results per pipeline using the 'F1 Macro' metric. The
# displayed table has one row per problem and, for each pipeline, the average
# elapsed time plus the average and best F1 Macro.
pipeline_summary = aggregate_results_by_pipeline(results, 'F1 Macro')
pipeline_summary

Unnamed: 0_level_0,Gradient Boosting,Gradient Boosting,Gradient Boosting,K-Nearest Neightbors,K-Nearest Neightbors,K-Nearest Neightbors,Multinomial Naive Bayes,Multinomial Naive Bayes,Multinomial Naive Bayes,Random Forest,Random Forest,Random Forest,XGB,XGB,XGB
Unnamed: 0_level_1,Average Elapsed Time(s),Average F1 Macro,Best F1 Macro,Average Elapsed Time(s),Average F1 Macro,Best F1 Macro,Average Elapsed Time(s),Average F1 Macro,Best F1 Macro,Average Elapsed Time(s),Average F1 Macro,Best F1 Macro,Average Elapsed Time(s),Average F1 Macro,Best F1 Macro
los,2.915265,0.481779,0.481779,5.904974,0.481043,0.481043,0.470823,0.548238,0.548238,9.42863,0.479352,0.479352,13.666718,0.481241,0.481241
mortality,2.905289,0.490248,0.490248,1.616121,0.515495,0.515495,0.47643,0.640843,0.640843,9.026014,0.514588,0.514588,13.931663,0.558977,0.558977
readmission,2.929179,0.510225,0.510225,1.599109,0.485286,0.485286,0.465431,0.551713,0.551713,9.90133,0.48365,0.48365,13.543952,0.484737,0.484737


### Summary of Each Problem

In [8]:
# Aggregate the benchmark results per problem using the 'F1 Macro' metric. The
# displayed table has one row per problem with the average and best F1 Macro and
# the name of the best-scoring pipeline.
problem_summary = aggregate_results_by_problem(results, 'F1 Macro')
problem_summary

Unnamed: 0,Average F1 Macro,Best F1 Macro,Best Pipeline
los,0.494331,0.548238,Multinomial Naive Bayes
mortality,0.54403,0.640843,Multinomial Naive Bayes
readmission,0.503122,0.551713,Multinomial Naive Bayes


## Featuretools + Cardea AutoML

In [None]:
# The prediction problems; each one is read from the CSV of the same name under fm_ft/.
problems = ['los', 'mortality', 'readmission']
datasets = {p: load_feature_tool_feature_matrix(p) for p in problems}

# Sample small datasets for quick testing. The sample size is capped at the
# number of available rows because DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement).
sample_datasets = {
    k: v.sample(n=min(1000, len(v)), random_state=1) for k, v in datasets.items()
}

  


#features before one-hot-encoding: 196, #features after one-hot-encoding: 36721


  


#features before one-hot-encoding: 470, #features after one-hot-encoding: 32015


  


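The number of features after one-hot encoding is too large (tens of thousands of columns), presumably because high-cardinality categorical columns are expanded into one column per distinct value.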