In [None]:
import numpy as np
import pandas as pd
import os
import pickle 
import logging
from datetime import datetime

from featuretools.selection import remove_low_information_features
from cardea.benchmark import benchmark, aggregate_results_by_pipeline, aggregate_results_by_problem, CLASSIFICATION_METRICS

Path to the directory containing the feature-matrix files. This local path should be removed once the S3 data source is available.

In [None]:
# Root directory of the pre-computed feature-matrix CSVs; the loader functions
# in this notebook read from its "fm_ft" and "fm_me" sub-folders.
# NOTE(review): the value looks like a placeholder — point it at a real directory before running.
fm_dir = 'path/to/FeatureMatrices'

## Pipelines

In [None]:
# pipelines
#
# Maps a pipeline label to the ordered list of dotted step names it is built
# from: a MinMaxScaler followed by a single classifier. Entries that are
# commented out are not part of the mapping and are therefore not benchmarked.

pipelines = {
#     'Logistic Regression': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.linear_model.LogisticRegression'],
    'K-Nearest Neighbors': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.neighbors.KNeighborsClassifier'],
    'Random Forest': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.ensemble.RandomForestClassifier'],
#     'Gaussian Naive Bayes': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.naive_bayes.GaussianNB'],
    'Multinomial Naive Bayes': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.naive_bayes.MultinomialNB'],
    'XGB': ['sklearn.preprocessing.MinMaxScaler', 'xgboost.XGBClassifier'],
#     'Stochastic Gradient Descent': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.linear_model.SGDClassifier'],
    'Gradient Boosting': ['sklearn.preprocessing.MinMaxScaler', 'sklearn.ensemble.GradientBoostingClassifier']
}

In [None]:
# datasets

def load_feature_matrix(path, target_name='label'):
    """Load a feature-matrix CSV and prepare it for benchmarking.

    The target column is split off, the remaining columns are passed through
    featuretools' ``remove_low_information_features``, missing values are
    replaced by 0 and the result is one-hot encoded with ``pd.get_dummies``.
    The number of feature columns before and after encoding is printed.

    Args:
        path: Path of the CSV file to read.
        target_name: Name of the target column in the CSV. Defaults to
            ``'label'``, so existing single-argument calls behave as before.

    Returns:
        pandas.DataFrame: The encoded feature columns with the target column
        joined back on as the last column.

    Raises:
        KeyError: If the CSV does not contain a ``target_name`` column.
    """
    df = pd.read_csv(path)
    if target_name not in df.columns:
        # Same exception type ``DataFrame.pop`` would raise, but naming the file.
        raise KeyError(
            "target column {!r} not found in {}".format(target_name, path)
        )
    y = df.pop(target_name)
    X = remove_low_information_features(df)

    col_num = len(X.columns)
    # NOTE(review): NaNs are filled before encoding, so in non-numeric columns
    # the fill value 0 is presumably encoded by get_dummies as a category of
    # its own — confirm this is intended.
    X = X.fillna(0)
    X = pd.get_dummies(X)
    print("#features before one-hot-encoding: {}, #features after one-hot-encoding: {}".format(col_num, len(X.columns)))
    return X.join(y)

def load_feature_tool_feature_matrix(problem):
    """Load the prepared feature matrix stored at ``<fm_dir>/fm_ft/<problem>.csv``.

    The file is read and post-processed by ``load_feature_matrix``.
    """
    csv_name = f"{problem}.csv"
    return load_feature_matrix(os.path.join(fm_dir, "fm_ft", csv_name))

def load_mimix_extract_feature_matrix(problem):
    """Load the prepared feature matrix stored at ``<fm_dir>/fm_me/<problem>.csv``.

    The file is read and post-processed by ``load_feature_matrix``.
    """
    csv_name = f"{problem}.csv"
    return load_feature_matrix(os.path.join(fm_dir, "fm_me", csv_name))

## MIMIC-Extract Datasets + Cardea AutoML

In [None]:
# Load one prepared feature matrix per prediction problem from the "fm_me" folder.
problems = ['los', 'mortality', 'readmission']
datasets = {p: load_mimix_extract_feature_matrix(p) for p in problems}

# sample small datasets for quick testing; the sample size is capped at the
# number of rows because DataFrame.sample raises when asked for more rows than
# the frame holds (sampling without replacement)
sample_datasets = {k: v.sample(n=min(1000, len(v)), random_state=1) for k, v in datasets.items()}

### Details of Each Pipeline Execution

In [None]:
# Hand the pipeline mapping and the sampled datasets (keyed by problem name) to
# cardea's benchmark, with 'label' as the target column; the bare `results` on
# the last line displays the returned object as the cell output.
results = benchmark(
    pipelines,
    sample_datasets,
    sample_datasets.keys(),
    target_name='label',
    optimize=False,
    runs=1,
    from_fm=True,
)
results

### Summary of Each Pipeline on Each Problem

In [None]:
# Pass the benchmark results to cardea's aggregate_results_by_pipeline with the
# 'F1 Macro' metric; the bare name on the last line displays the summary as the
# cell output.
pipeline_summary = aggregate_results_by_pipeline(results, 'F1 Macro')
pipeline_summary

### Summary of Each Problem

In [None]:
# Pass the benchmark results to cardea's aggregate_results_by_problem with the
# 'F1 Macro' metric; the bare name on the last line displays the summary as the
# cell output.
problem_summary = aggregate_results_by_problem(results, 'F1 Macro')
problem_summary

## Featuretools + Cardea AutoML

In [None]:
# Load one prepared feature matrix per prediction problem from the "fm_ft" folder.
problems = ['los', 'mortality', 'readmission']
datasets = {p: load_feature_tool_feature_matrix(p) for p in problems}

# sample small datasets for quick testing; the sample size is capped at the
# number of rows because DataFrame.sample raises when asked for more rows than
# the frame holds (sampling without replacement)
sample_datasets = {k: v.sample(n=min(1000, len(v)), random_state=1) for k, v in datasets.items()}

The number of features after one-hot encoding is too large!