## AutoML Process
###  1) Read data
###  2) Determine numeric, categorical, and text
###  3) Define partitioning method and CV method
###  4) Build out the pipeline on training data
### &nbsp;&nbsp;  4a) Transform Categorical Data using ordinal (frequency or alphabetical) other encoding such as binary one-hot
### &nbsp;&nbsp;  4b) Model Stacking on Text Data using tf-idf+ElasticNet - replace empty with nan
### &nbsp;&nbsp;  4c) Missing numerical: for tree set as large number outside of range (maybe pick side based on correlation) 
###  5) Definte gridsearch 
###  6) Estimate model and test
###  7) Save as .pkl file

In [1]:
from copy import copy

import numpy as np
from numpy import inf
import pandas as pd
from datetime import datetime
import functools
import matplotlib.pyplot as plt  

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  #RF and GBM algorithm
from sklearn.linear_model import ElasticNet, SGDClassifier
from sklearn.model_selection import GridSearchCV   #Perforing grid search
from sklearn import preprocessing, neighbors, metrics
import sklearn
if sklearn.__version__<'0.20':
    from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold, PredefinedSplit
else:
    from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, PredefinedSplit

import scipy.stats as st
from sklearn.metrics import mean_absolute_error, accuracy_score, log_loss, make_scorer, auc, roc_auc_score


%matplotlib inline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
import automl
from automl import StackLayer, TextElasticNetBinary, MissingDataHandler

In [4]:
# import importlib
# importlib.reload(automl)

In [5]:
#https://www.hindawi.com/journals/bmri/2014/781670/
input_data=pd.read_csv("diabetes_data.csv")
input_data.sample(5)

Unnamed: 0,rowID,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,diag_1_desc,diag_2_desc,diag_3_desc
2661,2662,Caucasian,Female,[50-60),?,Emergency,Discharged to home,Emergency Room,7,?,...,No,No,No,No,No,No,True,Acute vascular insufficiency of intestine,"Congestive heart failure, unspecified","Pneumonia, organism unspecified"
3695,3696,Caucasian,Female,[80-90),[75-100),Emergency,Hospice / home,Emergency Room,6,MC,...,No,No,No,No,No,No,False,Acute myocardial infarction of anterolateral w...,"Hypertensive chronic kidney disease, malignant...",Chronic kidney disease (ckd)
5104,5105,Caucasian,Male,[50-60),?,Emergency,Left AMA,Emergency Room,2,MD,...,No,No,No,No,Ch,Yes,True,"Congestive heart failure, unspecified",Acute myocardial infarction of anterolateral w...,"Hypertensive chronic kidney disease, malignant..."
4188,4189,Caucasian,Female,[70-80),?,Urgent,Discharged to home,Physician Referral,3,?,...,No,No,No,No,No,Yes,True,"Pneumonia, organism unspecified",Pleurisy without mention of effusion or curren...,Mitral valve stenosis and aortic valve stenosis
8010,8011,AfricanAmerican,Female,[80-90),?,Emergency,Discharged/transferred to home with home healt...,Emergency Room,5,?,...,No,No,No,No,No,Yes,False,Hyperosmolality and/or hypernatremia,"Congestive heart failure, unspecified",Paroxysmal supraventricular tachycardia


In [6]:
print(input_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 52 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   rowID                     10000 non-null  int64 
 1   race                      10000 non-null  object
 2   gender                    10000 non-null  object
 3   age                       10000 non-null  object
 4   weight                    10000 non-null  object
 5   admission_type_id         9279 non-null   object
 6   discharge_disposition_id  9531 non-null   object
 7   admission_source_id       9064 non-null   object
 8   time_in_hospital          10000 non-null  int64 
 9   payer_code                10000 non-null  object
 10  medical_specialty         10000 non-null  object
 11  num_lab_procedures        10000 non-null  int64 
 12  num_procedures            10000 non-null  int64 
 13  num_medications           10000 non-null  int64 
 14  number_outpatient      

In [7]:
input_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
time_in_hospital,10000.0,4.4347,3.021597,1.0,2.0,4.0,6.0,14.0
num_lab_procedures,10000.0,43.0786,19.453315,1.0,32.0,44.0,57.0,120.0
num_procedures,10000.0,1.3992,1.706438,0.0,0.0,1.0,2.0,6.0
num_medications,10000.0,15.5638,8.391613,1.0,10.0,14.0,19.0,81.0
number_outpatient,10000.0,0.2817,1.119406,0.0,0.0,0.0,0.0,36.0
number_emergency,10000.0,0.115,0.649475,0.0,0.0,0.0,0.0,42.0
number_inpatient,10000.0,0.3873,0.854267,0.0,0.0,0.0,0.0,10.0
number_diagnoses,10000.0,7.0253,2.020957,1.0,5.0,7.0,9.0,9.0


In [8]:
target='readmitted'
feature_list=['number_inpatient','num_medications','time_in_hospital','num_procedures','number_outpatient','number_emergency',
              'discharge_disposition_id','medical_specialty',"number_diagnoses","num_lab_procedures","admission_source_id",
              'age', 'diag_1_desc','diag_2_desc',"diag_3_desc",'admission_type_id']#'diag_1',"diag_2","diag_3"
num_features=['number_inpatient', 'number_diagnoses','num_lab_procedures','num_medications','time_in_hospital','num_procedures','number_outpatient','number_emergency']
text_features=list(set(feature_list)^set(num_features))
text_features

['medical_specialty',
 'admission_source_id',
 'diag_1_desc',
 'admission_type_id',
 'diag_2_desc',
 'discharge_disposition_id',
 'age',
 'diag_3_desc']

In [9]:
X=input_data[feature_list]
y=input_data[target].astype(int)

# Define partioning method and CV method

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [11]:
#split the data into 5 time cross validation folds
ps = PredefinedSplit(test_fold=np.floor(5*np.arange(len(X_train))/len(X_train)))
ps.unique_folds

array([0, 1, 2, 3, 4])

# Define preprocessing based on train

In [12]:
feature_lists_2=[]
drop_text_list=[]
sss=[]

for feat in list(text_features):    
    print(feat)
    if X_train[feat].nunique()>1:
        feature_lists_2.append(feat)
        sss_temp=StackLayer(models=[TextElasticNetBinary(token_pattern=r'(?u)\b\w+\b')],feature_lists=[feat], gridsearch_lists=[{'penalty_C':list(np.arange(1,10.1,4)),'ngram_range':list([(1, 1),(1,2)])}],regression=False,verbose=0)
        sss_temp.tune_hyperparameters(X_train, y_train, metric=None, n_jobs=1, cv=ps)
        sss.append(sss_temp)
    else:
        print("Not unique values for "+feat)
        drop_text_list.append([feat])

medical_specialty




admission_source_id




diag_1_desc




admission_type_id




diag_2_desc




discharge_disposition_id




age




diag_3_desc




In [13]:
feature_lists_2

['medical_specialty',
 'admission_source_id',
 'diag_1_desc',
 'admission_type_id',
 'diag_2_desc',
 'discharge_disposition_id',
 'age',
 'diag_3_desc']

In [14]:
mmm=MissingDataHandler(method='median',create_dummy=True)
gbm=GradientBoostingClassifier(learning_rate=.05,random_state=1234)

In [15]:
pipe = Pipeline(
    [   ("Preprocessing",ColumnTransformer([( "MissingNumeric", mmm, num_features)]+[("TextStacking_#"+feature_lists_2[fff], sss[fff], [feature_lists_2[fff]]) for fff in range(len(feature_lists_2))],remainder='passthrough')),
        # Use a SVC classifier on the combined features
        ("gbm", gbm),
    ],
    verbose=True,
)

In [16]:
# # Optimize entire pipeline over and over - INEFFICEINT
# models_2=[TextElasticNetBinary(token_pattern=r'(?u)\b\w+\b')]
# gridsearch_lists_2=[{'penalty_C':list(np.arange(1,10.1,4)),'ngram_range':list([(1, 1),(1,2)])}]
# sss=StackLayer(models=models_2,gridsearch_lists=gridsearch_lists_2,regression=False,verbose=1)
# pipe = Pipeline(
#     [   ("Preprocessing",ColumnTransformer([( "MissingNumeric", mmm, num_features)]+[("TextStacking_#"+fff, copy(sss), [fff]) for fff in feature_lists_2],remainder='passthrough')),
#         # Use a SVC classifier on the combined features
#         ("gbm", gbm),
#     ],
#     verbose=True,
# )

In [17]:
# pipe = Pipeline(
#     [   ("Preprocessing",ColumnTransformer([( "MissingNumeric", mmm, num_features),("Text",sss,['diag_1_desc'])])),
#         ("gbm", gbm),
#     ],
#     verbose=True,
# )

In [18]:
len(feature_lists_2)

8

In [19]:
[sss1.autotune for sss1 in sss]

[False, False, False, False, False, False, False, False]

In [20]:
# pipe.fit(X_train[num_features+[fff[0] for fff in feature_lists_2]], y_train)

In [21]:
param_grid = {
    "gbm__n_estimators": list(range(100,201,100)),
    'gbm__max_depth':list(range(3,6,2))
}
search = GridSearchCV(pipe, param_grid, n_jobs=None,verbose=1)
search.fit(X_train[num_features+[fff for fff in feature_lists_2]], y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.1min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.8s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.8min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.8s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.0min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.8s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.2min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.8s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.1min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   0.8s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.0min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.5s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.8min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.5s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.0min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.6s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.2min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.6s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.0min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.6s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.0min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.9min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.1min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.2min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.3s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.1min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.4s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.0min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   2.6s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.9min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   2.6s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.1min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   2.6s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.2min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   2.5s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.0min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   2.5s




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 2.8min
[Pipeline] ............... (step 2 of 2) Processing gbm, total=   1.6s
Best parameter (CV score=0.657):
{'gbm__max_depth': 5, 'gbm__n_estimators': 100}


In [22]:
search.cv_results_['mean_test_score'], search.best_params_, search.best_score_, search.best_estimator_

(array([0.65425 , 0.656   , 0.657125, 0.653125]),
 {'gbm__max_depth': 5, 'gbm__n_estimators': 100},
 0.6571250000000001,
 Pipeline(steps=[('Preprocessing',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('MissingNumeric',
                                                   MissingDataHandler(columns=[],
                                                                      create_dummy=True,
                                                                      imputation_funcs=[],
                                                                      imputation_values=[],
                                                                      method='median'),
                                                   ['number_inpatient',
                                                    'number_diagnoses',
                                                    'num_lab_procedures',
                                                    'n

In [23]:
final_layer=search.best_estimator_[-1]
final_layer

In [24]:
for c in range(len(final_layer.feature_importances_)):
    print(X_train[num_features+[fff for fff in feature_lists_2]].columns[c]+": "+str(final_layer.feature_importances_[c]))

number_inpatient: 0.1745032239770585
number_diagnoses: 0.0695767621912206
num_lab_procedures: 0.07101988209868798
num_medications: 0.040068094096367184
time_in_hospital: 0.025038865204598998
num_procedures: 0.0115267736914375
number_outpatient: 0.036028471897355495
number_emergency: 0.018557602001610187
medical_specialty: 0.07759779306783127
admission_source_id: 0.04211609448213232
diag_1_desc: 0.07751141399755976
admission_type_id: 0.038723585294654334
diag_2_desc: 0.053856869023289795
discharge_disposition_id: 0.14352104912550104
age: 0.05542311167376319
diag_3_desc: 0.06493040817693188


In [25]:
X_test[num_features+[fff for fff in feature_lists_2]]

Unnamed: 0,number_inpatient,number_diagnoses,num_lab_procedures,num_medications,time_in_hospital,num_procedures,number_outpatient,number_emergency,medical_specialty,admission_source_id,diag_1_desc,admission_type_id,diag_2_desc,discharge_disposition_id,age,diag_3_desc
2374,0,9,62,40,14,6,0,0,InternalMedicine,Emergency Room,"Diabetes with ketoacidosis, type II or unspeci...",Emergency,Paroxysmal supraventricular tachycardia,Discharged to home,[40-50),Pulmonary collapse
1784,0,5,1,16,3,3,6,0,?,,"Obesity, unspecified",Not Available,Pure hypercholesterolemia,Discharged to home,[50-60),Malignant essential hypertension
6301,0,9,54,14,7,0,0,0,InternalMedicine,Emergency Room,Simple chronic bronchitis,Emergency,Coronary atherosclerosis of unspecified type o...,Discharged/transferred to another type of inp...,[80-90),Coronary atherosclerosis of unspecified type o...
1600,0,5,45,36,4,5,0,0,?,Physician Referral,Coronary atherosclerosis of unspecified type o...,Elective,Postmyocardial infarction syndrome,,[50-60),Diabetes mellitus without mention of complicat...
7920,1,9,53,14,12,0,0,0,?,Transfer from another health care facility,Care involving breathing exercises,Elective,Cerebral thrombosis without mention of cerebra...,Discharged/transferred to home with home healt...,[60-70),Cerebral atherosclerosis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8623,0,9,4,13,1,0,0,0,?,Emergency Room,"Respiratory abnormality, unspecified",Emergency,"Urinary tract infection, site not specified",Discharged/transferred to ICF,[70-80),Headache
5928,0,9,50,15,3,0,0,0,?,Emergency Room,Streptococcal septicemia,Urgent,Acute kidney failure,Discharged/transferred to another short term h...,[60-70),"Hypertensive chronic kidney disease, malignant..."
6714,0,6,64,13,3,0,0,0,?,Emergency Room,Acute myocardial infarction of anterolateral w...,Emergency,"Congestive heart failure, unspecified",Discharged/transferred to SNF,[80-90),Alteration of consciousness
5885,0,7,25,7,3,0,0,0,Emergency/Trauma,Emergency Room,Paroxysmal supraventricular tachycardia,Elective,Acute laryngopharyngitis,Discharged to home,[50-60),Chronic maxillary sinusitis


In [26]:
#get test set text predictions
test_pred=search.predict_proba(X_test[num_features+[fff for fff in feature_lists_2]])


In [27]:
test_pred=search.best_estimator_.predict_proba(X_test[num_features+[fff for fff in feature_lists_2]])


In [28]:
# X_test_final=pd.concat([X_test[num_features],pd.DataFrame(test_pred,columns=text_features,index=X_test.index)],axis=1)

In [29]:
roc_auc_score(y_test,test_pred[:,1])

0.6983620102322787

In [30]:
log_loss(y_test, test_pred[:,1])

0.6078153539836869

In [31]:
#model much better than text aalone
# print(log_loss(y_test, test_pred[:,0]))
# print(log_loss(y_test, test_pred[:,1]))
# log_loss(y_test, test_pred[:,2])

In [32]:
type(search.best_estimator_)

sklearn.pipeline.Pipeline

In [33]:
# import sklearn.external.joblib as extjoblib
import joblib

joblib.dump(search.best_estimator_, 'pipeline.pkl')

['pipeline.pkl']