## AutoML Process
###  1) Import Libraries and Read data
###  2) Define target and determine numeric, categorical, and text
###  3) Define partitioning method and CV method
###  4) Run Competition
###  5) Save best model as .pkl file


## Let's begin!

###  1) Import Libraries and Read data

In [1]:
from copy import copy

import numpy as np
from numpy import inf
import pandas as pd
from datetime import datetime
import functools
import matplotlib.pyplot as plt  

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  #RF and GBM algorithm
from sklearn.linear_model import ElasticNet, SGDClassifier
from sklearn.model_selection import GridSearchCV   #Perforing grid search
from sklearn import preprocessing, neighbors, metrics
import sklearn
if sklearn.__version__<'0.20':
    from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold, PredefinedSplit
else:
    from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, PredefinedSplit

import scipy.stats as st
from sklearn.metrics import mean_absolute_error, accuracy_score, log_loss, make_scorer, auc, roc_auc_score


%matplotlib inline

In [2]:
sklearn.__version__

'1.0.1'

In [3]:
import automl
from automl import automl_utils
from automl import StackLayer, TextElasticNetBinary, MissingDataHandler

In [4]:
from pipelines import gbm_classifier_pipeline, rf_classifier_pipeline, all_tree_classifier_pipeline

In [5]:
# import importlib
# importlib.reload(automl)

In [6]:
input_data=pd.read_csv("diabetes_data.csv")
input_data.sample(5)

Unnamed: 0,rowID,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,diag_1_desc,diag_2_desc,diag_3_desc
9247,9248,Caucasian,Female,[50-60),?,Urgent,Discharged/transferred to home with home healt...,Transfer from a hospital,1,?,...,No,No,No,No,No,Yes,True,Cerebral thrombosis without mention of cerebra...,Basilar artery syndrome,"Diabetes with neurological manifestations, typ..."
293,294,Caucasian,Male,[60-70),?,Emergency,Discharged to home,Physician Referral,3,CP,...,No,No,No,No,No,Yes,False,"Pneumonia, organism unspecified",Hyperosmolality and/or hypernatremia,Diabetes mellitus without mention of complicat...
2905,2906,Caucasian,Female,[50-60),?,Urgent,Discharged to home,Emergency Room,1,UN,...,No,No,No,No,No,No,True,Hyperosmolality and/or hypernatremia,Secondary malignant neoplasm of kidney,Alteration of consciousness
4328,4329,Caucasian,Male,[70-80),?,Elective,,Transfer from a hospital,6,?,...,No,No,No,No,No,Yes,False,Coronary atherosclerosis of unspecified type o...,Postmyocardial infarction syndrome,Coronary atherosclerosis of unspecified type o...
368,369,Caucasian,Female,[70-80),?,Emergency,,Emergency Room,11,?,...,No,No,No,No,Ch,Yes,True,Acute myocardial infarction of anterolateral w...,"Congestive heart failure, unspecified",Nonspecific findings on examination of blood


###  2) Define target and determine numeric, categorical, and text

In [7]:
target="readmitted"

In [8]:
input_info=automl_utils.create_feature_metadata(input_data)
input_info

Unnamed: 0,feature_name,count,unique,top,freq,mean,std,min,25%,50%,75%,max,num_unique,feature_type,missing_count
0,rowID,10000.0,,,,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0,10000,Numeric,0
1,race,10000.0,6.0,Caucasian,7359.0,,,,,,,,6,Categorical,0
2,gender,10000.0,2.0,Female,5398.0,,,,,,,,2,Categorical,0
3,age,10000.0,10.0,[70-80),2595.0,,,,,,,,10,Categorical,0
4,weight,10000.0,8.0,?,9592.0,,,,,,,,8,Categorical,0
5,admission_type_id,9279.0,6.0,Emergency,4905.0,,,,,,,,6,Categorical,721
6,discharge_disposition_id,9531.0,21.0,Discharged to home,6056.0,,,,,,,,21,Categorical,469
7,admission_source_id,9064.0,10.0,Emergency Room,4940.0,,,,,,,,10,Categorical,936
8,time_in_hospital,10000.0,,,,4.4347,3.021597,1.0,2.0,4.0,6.0,14.0,14,,0
9,payer_code,10000.0,16.0,?,5341.0,,,,,,,,16,Categorical,0


In [9]:
input_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
time_in_hospital,10000.0,4.4347,3.021597,1.0,2.0,4.0,6.0,14.0
num_lab_procedures,10000.0,43.0786,19.453315,1.0,32.0,44.0,57.0,120.0
num_procedures,10000.0,1.3992,1.706438,0.0,0.0,1.0,2.0,6.0
num_medications,10000.0,15.5638,8.391613,1.0,10.0,14.0,19.0,81.0
number_outpatient,10000.0,0.2817,1.119406,0.0,0.0,0.0,0.0,36.0
number_emergency,10000.0,0.115,0.649475,0.0,0.0,0.0,0.0,42.0
number_inpatient,10000.0,0.3873,0.854267,0.0,0.0,0.0,0.0,10.0
number_diagnoses,10000.0,7.0253,2.020957,1.0,5.0,7.0,9.0,9.0


In [10]:
target='readmitted'
feature_list=['number_inpatient','num_medications','time_in_hospital','num_procedures','number_outpatient','number_emergency',
              'discharge_disposition_id','medical_specialty',"number_diagnoses","num_lab_procedures","admission_source_id",
              'age', 'diag_1_desc','diag_2_desc',"diag_3_desc",'admission_type_id']#'diag_1',"diag_2","diag_3"
num_features=['number_inpatient', 'number_diagnoses','num_lab_procedures','num_medications','time_in_hospital','num_procedures','number_outpatient','number_emergency']
text_features=list(set(feature_list)^set(num_features))
text_features

['diag_1_desc',
 'discharge_disposition_id',
 'admission_source_id',
 'diag_3_desc',
 'diag_2_desc',
 'age',
 'admission_type_id',
 'medical_specialty']

In [11]:
sklearn.__version__
# from keras.wrappers.scikit_learn import KerasRegressor

'1.0.1'

In [12]:
X=input_data[feature_list]
y=input_data[target].astype(int)

###  3) Define partitioning method and CV method

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=1234)

In [14]:
#split the data into 5 time cross validation folds
ps = PredefinedSplit(test_fold=np.floor(5*np.arange(len(X_train))/len(X_train)))
ps.unique_folds

array([0, 1, 2, 3, 4])

###  4) Run Competition

In [15]:
# gbm_model = gbm_classifier_pipeline(X_train,y_train,num_features,text_features,cv=ps,verbose=1)

In [16]:
# rf_model = rf_classifier_pipeline(X_train,y_train,num_features,text_features,cv=ps,verbose=1)

In [17]:
tree_model_list = all_tree_classifier_pipeline(X_train,y_train,num_features,text_features,scoring='neg_log_loss',cv=ps,verbose=1)

diag_1_desc
discharge_disposition_id
admission_source_id
diag_3_desc
diag_2_desc
age
admission_type_id
medical_specialty
Fitting 5 folds for each of 4 candidates, totalling 20 fits




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.8s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.2s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.4s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.2s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.7s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.0s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.8s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.3s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.9s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.7s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.7s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.7s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.1s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.7s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.7s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.7s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   8.6s
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.3s
Best parameter (CV score=-0.632):
{'gbm__max_depth': 3, 'gbm__n_estimators': 100}
Fitting 5 folds for each of 12 candidates, totalling 60 fits




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.9s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.6s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.0s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.0s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.6s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.9s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.5s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.7s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.0s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.6s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.9s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.0s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.8s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.9s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.7s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.0s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.6s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.8s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.1s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.7s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   5.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   6.7s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s
[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total=   8.5s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.4s
Best parameter (CV score=-0.634):
{'rf__max_leaf_nodes': 64, 'rf__n_estimators': 300}


In [18]:
gbm_model=tree_model_list[0]
rf_model=tree_model_list[1]

In [19]:
rf_model.feature_names_in_

array(['number_inpatient', 'number_diagnoses', 'num_lab_procedures',
       'num_medications', 'time_in_hospital', 'num_procedures',
       'number_outpatient', 'number_emergency', 'diag_1_desc',
       'discharge_disposition_id', 'admission_source_id', 'diag_3_desc',
       'diag_2_desc', 'age', 'admission_type_id', 'medical_specialty'],
      dtype=object)

In [24]:
best_model=gbm_model

In [25]:
final_layer=best_model[-1]

In [26]:
#get test set text predictions
test_pred=best_model.predict_proba(X_test[list(best_model.feature_names_in_)])

In [27]:
roc_auc_score(y_test,test_pred[:,1])

0.6704439970383691

In [40]:
accuracy_score(y_test,np.round(test_pred[:,1]))

0.64925

In [41]:
log_loss(y_test, test_pred[:,1])

0.6264990652423544

In [42]:
for c in range(len(final_layer.feature_importances_)):
    print(best_model[-2].get_feature_names_out()[c]+": "+str(final_layer.feature_importances_[c]))

MissingNumeric__number_inpatient: 0.11766388601853096
MissingNumeric__number_diagnoses: 0.05651995466801387
MissingNumeric__num_lab_procedures: 0.07958848827475952
MissingNumeric__num_medications: 0.058886398535599155
MissingNumeric__time_in_hospital: 0.03389843139842822
MissingNumeric__num_procedures: 0.02524360817455378
MissingNumeric__number_outpatient: 0.035270762533715826
MissingNumeric__number_emergency: 0.01053680648917083
TextStacking_#diag_1_desc__diag_1_desc: 0.09152523272573775
TextStacking_#diag_3_desc__diag_3_desc: 0.07429542335215299
TextStacking_#admission_source_id__admission_source_id: 0.0583298065011025
TextStacking_#discharge_disposition_id__discharge_disposition_id: 0.08161282282238973
TextStacking_#age__age: 0.05187644422617041
TextStacking_#admission_type_id__admission_type_id: 0.05029022539131483
TextStacking_#medical_specialty__medical_specialty: 0.07003866366694102
TextStacking_#diag_2_desc__diag_2_desc: 0.10442304522141868


###  5) Save best model as .pkl file

In [43]:
# import sklearn.external.joblib as extjoblib
import joblib

joblib.dump(best_model, 'pipeline.pkl')

['pipeline.pkl']