This notebook contains the pipeline runs for the analysis performed with Cardea version 0.0.2.

In [None]:
import numpy as np
import pandas as pd

from cardea import Cardea
from cardea.modeling.modeler import Modeler
from cardea.featurization import Featurization
from cardea.data_loader.load_mimic import load_mimic_data

from featuretools.selection import remove_low_information_features
from model_audit import ModelAuditor

## Use Case 1: Kaggle data

In [None]:
# Use Case 1: load an entity set with Cardea's defaults, build the cutoff times
# for the missed-appointment problem, featurize, and run every pipeline.
cd = Cardea()

cd.load_data_entityset()
cutoff = cd.select_problem('MissedAppointmentProblemDefinition')

feature_matrix = cd.generate_features(cutoff)
# frac=1 keeps every row but in random order. No random_state is given, so the
# order (and anything downstream that depends on it) differs between runs.
feature_matrix = feature_matrix.sample(frac=1)

# pop() removes the label column, so the remaining columns are the features.
y = list(feature_matrix.pop('label'))
X = feature_matrix.values

# Each entry wraps a single [scaler, estimator] primitive chain.
pipelines = [
    [['sklearn.preprocessing.MinMaxScaler', 'sklearn.naive_bayes.MultinomialNB']],
    [['sklearn.preprocessing.MinMaxScaler', 'sklearn.ensemble.RandomForestClassifier']],
    [['sklearn.preprocessing.MinMaxScaler', 'xgboost.XGBClassifier']],
    [['sklearn.preprocessing.MinMaxScaler', 'sklearn.neighbors.KNeighborsClassifier']],
    [['sklearn.preprocessing.MinMaxScaler', 'sklearn.linear_model.LogisticRegression']],
    [['sklearn.preprocessing.MinMaxScaler', 'sklearn.linear_model.SGDClassifier']],
    [['sklearn.preprocessing.MinMaxScaler', 'sklearn.ensemble.GradientBoostingClassifier']],
    [['sklearn.preprocessing.MinMaxScaler', 'sklearn.naive_bayes.GaussianNB']],
]

problem_type = 'classification'

# Keep every pipeline's result; previously each iteration overwrote the last
# one and nothing but the final result survived the loop.
pipeline_results = []
for pipeline in pipelines:
    # NOTE(review): the other cells call Modeler.execute_pipeline; confirm that
    # the installed Cardea release exposes execute_pipeline on Cardea itself.
    pipeline_res = cd.execute_pipeline(np.array(X), np.array(y), pipeline, problem_type)
    pipeline_results.append(pipeline_res)

## Use Case 3: Adaptivity 

In [None]:
# pipelines
# Every pipeline is one [scaler, estimator] chain wrapped in an outer list;
# all of them share the same MinMaxScaler front end.
pipelines = [
    [['sklearn.preprocessing.MinMaxScaler', estimator]]
    for estimator in (
        'sklearn.naive_bayes.MultinomialNB',
        'sklearn.ensemble.RandomForestClassifier',
        'xgboost.XGBClassifier',
        'sklearn.neighbors.KNeighborsClassifier',
        'sklearn.linear_model.LogisticRegression',
        'sklearn.linear_model.SGDClassifier',
        'sklearn.ensemble.GradientBoostingClassifier',
        'sklearn.naive_bayes.GaussianNB',
    )
]

In [None]:
class Feature(Featurization):
    """Featurization subclass that supplies fixed primitive name lists.

    NOTE(review): presumably the parent class reads ``agg_prim`` and
    ``trans_prim`` when building the feature matrix -- confirm against
    ``cardea.featurization.Featurization``.
    """

    @staticmethod
    def agg_prim():
        """Return the aggregation primitive names as a new list."""
        aggregations = ["sum", "std", "mode", "mean", "count"]
        return aggregations

    @staticmethod
    def trans_prim():
        """Return the transform primitive names as a new list."""
        transforms = ["day", "month", "year", "weekday", "is_weekend"]
        return transforms

    
def extract_metric(folds, metric_name):
    """Collect ``fold[metric_name]`` from every fold, preserving fold order."""
    return [fold[metric_name] for fold in folds]


def _performance_row(tag, primitive, report, problem_type):
    """Build one ``[tag, primitive, metrics]`` row from an audit report.

    Regression reports are reduced to the mean ``r2_score`` and
    ``mean_squared_error`` across folds; classification reports keep the
    per-fold ``f1_macro`` and ``accuracy`` lists. Any other problem type
    yields ``None`` so the caller can skip it.
    """
    if problem_type == 'regression':
        folds = report['output_result']
        metrics = {
            'r2_score': np.mean(extract_metric(folds, 'r2_score')),
            'mean_squared_error': np.mean(extract_metric(folds, 'mean_squared_error')),
        }
    elif problem_type == 'classification':
        folds = report['output_result']
        metrics = {
            'f1_macro': extract_metric(folds, 'f1_macro'),
            'accuracy': extract_metric(folds, 'accuracy'),
        }
    else:
        return None
    return [tag, primitive, metrics]


# Evaluation settings are the same for every problem, so set them once.
problem_type = 'classification'
scoring_function = 'f1'
minimize_cost = False

problems = ['los', 'mortality', 'readmission']
for problem in problems:
    # Reload for each problem because the entity set is modified below.
    es = load_mimic_data(path='MIMIC/')  # data location

    # Select the target entity, its index/time columns and the label column.
    # ``secondary_time_index`` maps a later timestamp column to the columns
    # that are tied to it; None means the entity is used unchanged.
    if problem == 'mortality':
        label_column = 'hospital_expire_flag'
        time_column = 'admittime'
        column_id = 'hadm_id'
        entity = 'admissions'
        secondary_time_index = {
            'dischtime': ['deathtime', 'discharge_location', 'hospital_expire_flag'],
        }

    elif problem == 'readmission':
        label_column = 'readmission'
        time_column = 'dischtime'
        column_id = 'hadm_id'
        entity = 'admissions'
        secondary_time_index = None

    elif problem == 'los':
        label_column = 'los'
        time_column = 'intime'
        column_id = 'icustay_id'
        entity = 'icustays'
        secondary_time_index = {
            'outtime': ['last_wardid', 'last_careunit', 'los'],
        }

    else:
        raise ValueError("problem not found.")

    if secondary_time_index is not None:
        # Re-register the entity with the secondary time index attached.
        es = es.entity_from_dataframe(entity_id=entity,
                                      dataframe=es[entity].df,
                                      time_index=time_column,
                                      index=column_id,
                                      secondary_time_index=secondary_time_index)

    # Cutoff times: one row per instance with its timestamp and label.
    cutoff_times = es[entity].df[[column_id, time_column, label_column]]
    cutoff_times.columns = ['instance_id', 'time', 'label']
    cutoff_times = cutoff_times.sort_values('time')

    feat = Feature()
    fm_encoded, features_encoded = feat.generate_feature_matrix(es, entity, cutoff_times)

    df = fm_encoded.drop_duplicates()

    # Drop a column named after the problem if one exists; a missing column is
    # fine, which is what the former bare try/except was covering.
    df = df.drop(columns=[problem], errors='ignore')

    y = df.pop('label')
    X = remove_low_information_features(df)

    X = X.fillna(0)
    X = pd.get_dummies(X)

    if problem == 'los':
        # Bin into two classes: values below 7 versus 7 and above.
        y = np.digitize(y, [y.min(), 7, y.max() + 1])

    # Re-encode the labels as integer category codes.
    y = pd.Categorical(y).codes

    for pipeline in pipelines:
        modeler = Modeler()

        pipeline_res = modeler.execute_pipeline(np.array(X), np.array(y), pipeline, problem_type,
                                                optimize=False, minimize_cost=minimize_cost,
                                                scoring=scoring_function, max_evals=10)

        hyperparameters = [pipeline_res[key]['hyperparameter'] for key in pipeline_res.keys()]

        auditor = ModelAuditor()

        performance_result = []
        for primitive, hyper in zip(pipeline, hyperparameters):
            report_with_hyper = auditor.generate_pipeline_report(primitive, X, y,
                                                                 problem_type, hyperparameters=hyper)
            print('completed tuning performance for {}'.format((primitive[-1])))

            report_no_hyper = auditor.generate_pipeline_report(primitive, X, y, problem_type)
            print('completed non-tuned performance for {}'.format(primitive[-1]))
            print('===============================')

            # Same order as before: the non-tuned row first, then the tuned row.
            for tag, report in (('no_hyper', report_no_hyper), ('with_hyper', report_with_hyper)):
                row = _performance_row(tag, primitive, report, problem_type)
                if row is not None:
                    performance_result.append(row)

        print(performance_result)

## Using Extract

In [None]:
# load mimic_extract data

import pickle

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only open files that come from a trusted source.
with open('all_featurized_data.pkl', 'rb') as f:
    all_datasets = pickle.load(f)


def extract_metric(folds, metric_name):
    """Collect ``fold[metric_name]`` from every fold, preserving fold order."""
    return [fold[metric_name] for fold in folds]


def _performance_row(tag, primitive, report, problem_type):
    """Build one ``[tag, primitive, metrics]`` row from an audit report.

    Regression reports are reduced to the mean ``r2_score`` and
    ``mean_squared_error`` across folds; classification reports keep the
    per-fold ``f1_macro`` and ``accuracy`` lists. Any other problem type
    yields ``None`` so the caller can skip it.
    """
    if problem_type == 'regression':
        folds = report['output_result']
        metrics = {
            'r2_score': np.mean(extract_metric(folds, 'r2_score')),
            'mean_squared_error': np.mean(extract_metric(folds, 'mean_squared_error')),
        }
    elif problem_type == 'classification':
        folds = report['output_result']
        metrics = {
            'f1_macro': extract_metric(folds, 'f1_macro'),
            'accuracy': extract_metric(folds, 'accuracy'),
        }
    else:
        return None
    return [tag, primitive, metrics]


# Evaluation settings, restated with the same values the MIMIC cell uses so
# this cell does not depend on variables left behind by an earlier cell.
# ``pipelines`` is still taken from the cell that defines the pipeline list.
problem_type = 'classification'
scoring_function = 'f1'
minimize_cost = False

for problem, df in all_datasets.items():
    df = df.drop_duplicates()

    # pop() removes the target column, leaving only the features in df.
    y = np.array(df.pop('TARGET'))
    X = df

    for pipeline in pipelines:
        modeler = Modeler()

        pipeline_res = modeler.execute_pipeline(np.array(X), np.array(y), pipeline, problem_type,
                                                optimize=False, minimize_cost=minimize_cost,
                                                scoring=scoring_function, max_evals=10)

        hyperparameters = [pipeline_res[key]['hyperparameter'] for key in pipeline_res.keys()]

        auditor = ModelAuditor()

        performance_result = []
        for primitive, hyper in zip(pipeline, hyperparameters):
            report_with_hyper = auditor.generate_pipeline_report(primitive, X, y,
                                                                 problem_type, hyperparameters=hyper)
            print('completed tuning performance for {}'.format((primitive[-1])))

            report_no_hyper = auditor.generate_pipeline_report(primitive, X, y, problem_type)
            print('completed non-tuned performance for {}'.format(primitive[-1]))
            print('===============================')

            # Same order as before: the non-tuned row first, then the tuned row.
            for tag, report in (('no_hyper', report_no_hyper), ('with_hyper', report_with_hyper)):
                row = _performance_row(tag, primitive, report, problem_type)
                if row is not None:
                    performance_result.append(row)

        print(performance_result)

## Correlation Test

In [None]:
# pearsonr is imported from the public scipy.stats namespace; the
# scipy.stats.stats path is a deprecated private module.
from scipy.stats import pearsonr
from tqdm import tqdm_notebook as tqdm

# los example correlation
extract = pd.read_csv('features_merge_with_rowid.csv')
cardea = [pd.read_csv('fm/los/fm%s.csv' % i) for i in range(0, 50)]
cardea = pd.concat(cardea).set_index('hadm_id', drop=True)

# match cohort: keep only rows whose row id appears in both tables
mimic_extract = extract[extract['ROW_ID'].isin(cardea['row_id'])].drop_duplicates(['ROW_ID']).fillna(0)
mimic_cardea = cardea[cardea['row_id'].isin(extract['ROW_ID'])].fillna(0)

# sort both tables by row id so rows are compared position by position
mimic_extract.sort_values('ROW_ID', inplace=True)
mimic_cardea.sort_values('row_id', inplace=True)

# show consistency
print('Extract', mimic_extract.shape)
print('Cardea', mimic_cardea.shape)

# convert to binary
mimic_extract = pd.get_dummies(mimic_extract)
mimic_cardea = pd.get_dummies(mimic_cardea)

# For every extract column, record the strongest (by absolute value) Pearson
# correlation against any cardea column, and which cardea column produced it.
all_corr = []
all_labels = []

for col in tqdm(mimic_extract.columns):
    x = np.array(mimic_extract[col])  # invariant across the inner loop
    corr = []
    labels = []
    for car in mimic_cardea.columns:
        y = np.array(mimic_cardea[car])

        try:
            pear = pearsonr(x, y)[0]
        except (ValueError, TypeError):
            # Best effort: skip pairs pearsonr rejects (e.g. mismatched
            # lengths or non-numeric data) instead of aborting the scan.
            continue

        # np.isnan replaces math.isnan: ``math`` was never imported, so the
        # NameError was swallowed and no correlation was ever recorded.
        if not np.isnan(pear):
            corr.append(pear)
            labels.append(car)

    if len(corr) > 0:
        idx = np.nanargmax(np.absolute(corr))
        all_corr.append(corr[idx])
        all_labels.append(labels[idx])

np.median(np.absolute(all_corr))