## AutoML Process
###  1) Read data
###  2) Determine numeric, categorical, and text
###  3) Define partitioning method and CV method
###  4) Build out the pipeline on training data
### &nbsp;&nbsp;  4a) Transform Categorical Data using ordinal (frequency or alphabetical) other encoding such as binary one-hot
### &nbsp;&nbsp;  4b) Model Stacking on Text Data using tf-idf+ElasticNet - replace empty with nan
### &nbsp;&nbsp;  4c) Missing numerical: for tree set as large number outside of range (maybe pick side based on correlation) 
###  5) Definte gridsearch 
###  6) Estimate model and test
###  7) Save as .pkl file

In [3]:
from copy import copy

import numpy as np
from numpy import inf
import pandas as pd
from datetime import datetime
import functools
import matplotlib.pyplot as plt  

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  #RF and GBM algorithm
from sklearn.linear_model import ElasticNet, SGDClassifier
from sklearn.model_selection import GridSearchCV   #Perforing grid search
from sklearn import preprocessing, neighbors, metrics
import sklearn
if sklearn.__version__<'0.20':
    from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold, PredefinedSplit
else:
    from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, PredefinedSplit

import scipy.stats as st
from sklearn.metrics import mean_absolute_error, accuracy_score, log_loss, make_scorer, auc, roc_auc_score

import joblib
import yaml

import xgboost as xgb

%matplotlib inline

In [4]:
import automl
from automl import StackLayer, TextElasticNetBinary, MissingDataHandler

In [5]:
# import importlib
# importlib.reload(automl)

In [6]:
#https://www.hindawi.com/journals/bmri/2014/781670/
input_data=pd.read_csv("diabetes_data.csv")
input_data.sample(5)

Unnamed: 0,rowID,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,diag_1_desc,diag_2_desc,diag_3_desc
6982,6983,Caucasian,Male,[80-90),?,Emergency,Discharged to home,Emergency Room,4,?,...,No,No,No,No,Ch,Yes,True,"Congestive heart failure, unspecified","Hypertensive chronic kidney disease, malignant...","Chronic airway obstruction, not elsewhere clas..."
7195,7196,Caucasian,Male,[60-70),?,Urgent,Discharged to home,Emergency Room,2,MC,...,No,No,No,No,Ch,Yes,True,Diabetes mellitus without mention of complicat...,Pure hypercholesterolemia,Malignant essential hypertension
9453,9454,Caucasian,Male,[30-40),?,Urgent,Discharged to home,Physician Referral,1,?,...,No,No,No,No,No,No,True,Mechanical complication of unspecified cardiac...,Chronic kidney disease (ckd),Diabetes mellitus without mention of complicat...
1200,1201,Caucasian,Female,[60-70),?,Elective,Discharged/transferred to home with home healt...,Physician Referral,5,?,...,No,No,No,No,Ch,Yes,False,Coronary atherosclerosis of unspecified type o...,Sideroblastic anemia,"Osteoarthrosis, generalized, site unspecified"
7670,7671,Caucasian,Male,[90-100),?,Elective,,Physician Referral,4,?,...,No,No,No,No,No,No,False,"Congestive heart failure, unspecified","Chronic airway obstruction, not elsewhere clas...",Diabetes mellitus without mention of complicat...


In [7]:
print(input_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 52 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   rowID                     10000 non-null  int64 
 1   race                      10000 non-null  object
 2   gender                    10000 non-null  object
 3   age                       10000 non-null  object
 4   weight                    10000 non-null  object
 5   admission_type_id         9279 non-null   object
 6   discharge_disposition_id  9531 non-null   object
 7   admission_source_id       9064 non-null   object
 8   time_in_hospital          10000 non-null  int64 
 9   payer_code                10000 non-null  object
 10  medical_specialty         10000 non-null  object
 11  num_lab_procedures        10000 non-null  int64 
 12  num_procedures            10000 non-null  int64 
 13  num_medications           10000 non-null  int64 
 14  number_outpatient      

In [8]:
input_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
time_in_hospital,10000.0,4.4347,3.021597,1.0,2.0,4.0,6.0,14.0
num_lab_procedures,10000.0,43.0786,19.453315,1.0,32.0,44.0,57.0,120.0
num_procedures,10000.0,1.3992,1.706438,0.0,0.0,1.0,2.0,6.0
num_medications,10000.0,15.5638,8.391613,1.0,10.0,14.0,19.0,81.0
number_outpatient,10000.0,0.2817,1.119406,0.0,0.0,0.0,0.0,36.0
number_emergency,10000.0,0.115,0.649475,0.0,0.0,0.0,0.0,42.0
number_inpatient,10000.0,0.3873,0.854267,0.0,0.0,0.0,0.0,10.0
number_diagnoses,10000.0,7.0253,2.020957,1.0,5.0,7.0,9.0,9.0


In [9]:
target='readmitted'
feature_list=['number_inpatient','num_medications','time_in_hospital','num_procedures','number_outpatient','number_emergency',
              'discharge_disposition_id','medical_specialty',"number_diagnoses","num_lab_procedures","admission_source_id",
              'age', 'diag_1_desc','diag_2_desc',"diag_3_desc",'admission_type_id']#'diag_1',"diag_2","diag_3"
num_features=['number_inpatient', 'number_diagnoses','num_lab_procedures','num_medications','time_in_hospital','num_procedures','number_outpatient','number_emergency']
text_features=list(set(feature_list)^set(num_features))
text_features

['medical_specialty',
 'admission_source_id',
 'diag_3_desc',
 'diag_1_desc',
 'admission_type_id',
 'diag_2_desc',
 'age',
 'discharge_disposition_id']

In [10]:
X=input_data[feature_list]
y=input_data[target].astype(int)

# Define partioning method and CV method

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [12]:
#split the data into 5 time cross validation folds
ps = PredefinedSplit(test_fold=np.floor(5*np.arange(len(X_train))/len(X_train)))
ps.unique_folds

array([0, 1, 2, 3, 4])

# Define preprocessing based on train

In [13]:
feature_lists_2=[]
drop_text_list=[]
sss=[]

for feat in list(text_features):    
    print(feat)
    if X_train[feat].nunique()>1:
        feature_lists_2.append(feat)
        sss_temp=StackLayer(models=[TextElasticNetBinary(token_pattern=r'(?u)\b\w+\b')],feature_lists=[feat], gridsearch_lists=[{'penalty_C':list(np.arange(1,10.1,4)),'ngram_range':list([(1, 1),(1,2)])}],regression=False,verbose=0)
        sss_temp.tune_hyperparameters(X_train, y_train, metric=None, n_jobs=1, cv=ps)
        sss.append(sss_temp)
    else:
        print("Not unique values for "+feat)
        drop_text_list.append([feat])

medical_specialty




admission_source_id




diag_3_desc




diag_1_desc




admission_type_id




diag_2_desc




age




discharge_disposition_id




In [14]:
feature_lists_2

['medical_specialty',
 'admission_source_id',
 'diag_3_desc',
 'diag_1_desc',
 'admission_type_id',
 'diag_2_desc',
 'age',
 'discharge_disposition_id']

In [15]:
mmm=MissingDataHandler(method='median',create_dummy=True)
gbm=GradientBoostingClassifier(learning_rate=.05,random_state=1234)

#Fitting XGB regressor 
#model = xgb.XGBRegressor()
gbm = xgb.XGBClassifier(colsample_bylevel=1, colsample_bynode=1, colsample_bytree=.3, gamma=0,#0.005,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=-99999, n_estimators=300, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [16]:
pipe = Pipeline(
    [   ("Preprocessing",ColumnTransformer([( "MissingNumeric", mmm, num_features)]+[("TextStacking_#"+feature_lists_2[fff], sss[fff], [feature_lists_2[fff]]) for fff in range(len(feature_lists_2))])),#,remainder='passthrough'
        # Use a SVC classifier on the combined features
        ("gbm", gbm),
    ],
    verbose=True,
)

In [17]:
# history = model.fit(train_x, train_y, eval_set=[(test_x, test_y)],early_stopping_rounds=25) 
# print("--- %s seconds ---" % (time.time() - start_time))
# print(history)
# print(model)

In [18]:
len(feature_lists_2)

8

In [19]:
pipe_trf = Pipeline(pipe.steps[:-1])
params_fit={}
params_fit['gbm__eval_set'] = [(pipe_trf.fit_transform(X_train[num_features+[fff for fff in feature_lists_2]], y_train), y_train),
                                (pipe_trf.transform(pd.DataFrame(X_test[num_features+[fff for fff in feature_lists_2]])),
                                 y_test)]
params_fit['gbm__early_stopping_rounds'] = 10

# pipe = pipe.fit(X_train[num_features+[fff for fff in feature_lists_2]],y_train, **params_fit)



In [20]:
pipe_trf.steps==pipe.steps[:-1]

True

In [21]:
param_grid = {
    'gbm__max_depth':list(range(3,5,1))
}
# params_fit['gbm__max_depth']=list(range(3,6,2))

In [22]:
search = GridSearchCV(pipe,param_grid=param_grid, n_jobs=2,verbose=1)
search.fit(X_train[num_features+[fff for fff in feature_lists_2]], y_train, **params_fit)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.2min
[0]	validation_0-logloss:0.67181	validation_1-logloss:0.66489
[1]	validation_0-logloss:0.66986	validation_1-logloss:0.66319
[2]	validation_0-logloss:0.66859	validation_1-logloss:0.66202
[3]	validation_0-logloss:0.66740	validation_1-logloss:0.66113
[4]	validation_0-logloss:0.66586	validation_1-logloss:0.65962
[5]	validation_0-logloss:0.66277	validation_1-logloss:0.65656
[6]	validation_0-logloss:0.66163	validation_1-logloss:0.65532
[7]	validation_0-logloss:0.66040	validation_1-logloss:0.65474
[8]	validation_0-logloss:0.65899	validation_1-logloss:0.65373
[9]	validation_0-logloss:0.65780	validation_1-logloss:0.65293
[10]	validation_0-logloss:0.65596	validation_1-logloss:0.65131
[11]	validation_0-logloss:0.65496	validation_1-logloss:0.65062
[12]	validation_0-logloss:0.65421	validation_1-logloss:0.64990
[13]	validation_0-logloss:0.65323	validation_1-logloss:0.64904
[14]	validation_0-logloss:0.65225	validation_1-logloss:0.



[89]	validation_0-logloss:0.61071	validation_1-logloss:0.61399
[90]	validation_0-logloss:0.61059	validation_1-logloss:0.61393
[91]	validation_0-logloss:0.61050	validation_1-logloss:0.61406
[92]	validation_0-logloss:0.61037	validation_1-logloss:0.61397
[93]	validation_0-logloss:0.61028	validation_1-logloss:0.61399
[94]	validation_0-logloss:0.61012	validation_1-logloss:0.61383
[95]	validation_0-logloss:0.61000	validation_1-logloss:0.61368
[96]	validation_0-logloss:0.60985	validation_1-logloss:0.61359
[97]	validation_0-logloss:0.60976	validation_1-logloss:0.61355
[98]	validation_0-logloss:0.60947	validation_1-logloss:0.61346
[99]	validation_0-logloss:0.60940	validation_1-logloss:0.61335
[100]	validation_0-logloss:0.60917	validation_1-logloss:0.61312
[101]	validation_0-logloss:0.60890	validation_1-logloss:0.61305
[102]	validation_0-logloss:0.60874	validation_1-logloss:0.61300
[103]	validation_0-logloss:0.60853	validation_1-logloss:0.61293
[104]	validation_0-logloss:0.60847	validation_1-log



[93]	validation_0-logloss:0.61058	validation_1-logloss:0.61191
[94]	validation_0-logloss:0.61053	validation_1-logloss:0.61181
[95]	validation_0-logloss:0.61045	validation_1-logloss:0.61163
[96]	validation_0-logloss:0.61023	validation_1-logloss:0.61145
[97]	validation_0-logloss:0.61004	validation_1-logloss:0.61126
[98]	validation_0-logloss:0.60976	validation_1-logloss:0.61118
[99]	validation_0-logloss:0.60955	validation_1-logloss:0.61086
[100]	validation_0-logloss:0.60932	validation_1-logloss:0.61055
[101]	validation_0-logloss:0.60905	validation_1-logloss:0.61051
[102]	validation_0-logloss:0.60902	validation_1-logloss:0.61041
[103]	validation_0-logloss:0.60877	validation_1-logloss:0.61044
[104]	validation_0-logloss:0.60863	validation_1-logloss:0.61027
[105]	validation_0-logloss:0.60840	validation_1-logloss:0.61012
[106]	validation_0-logloss:0.60842	validation_1-logloss:0.61017
[107]	validation_0-logloss:0.60822	validation_1-logloss:0.61000
[108]	validation_0-logloss:0.60808	validation_1



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.2min
[0]	validation_0-logloss:0.67180	validation_1-logloss:0.66510
[1]	validation_0-logloss:0.66993	validation_1-logloss:0.66333
[2]	validation_0-logloss:0.66889	validation_1-logloss:0.66194
[3]	validation_0-logloss:0.66762	validation_1-logloss:0.66099
[4]	validation_0-logloss:0.66597	validation_1-logloss:0.65974
[5]	validation_0-logloss:0.66296	validation_1-logloss:0.65678
[6]	validation_0-logloss:0.66198	validation_1-logloss:0.65572
[7]	validation_0-logloss:0.66074	validation_1-logloss:0.65510
[8]	validation_0-logloss:0.65929	validation_1-logloss:0.65385
[9]	validation_0-logloss:0.65801	validation_1-logloss:0.65287
[10]	validation_0-logloss:0.65605	validation_1-logloss:0.65118
[11]	validation_0-logloss:0.65491	validation_1-logloss:0.65008
[12]	validation_0-logloss:0.65429	validation_1-logloss:0.64907
[13]	validation_0-logloss:0.65335	validation_1-logloss:0.64817
[14]	validation_0-logloss:0.65236	validation_1-logloss:0.



[153]	validation_0-logloss:0.60585	validation_1-logloss:0.60904
[154]	validation_0-logloss:0.60573	validation_1-logloss:0.60898
[155]	validation_0-logloss:0.60557	validation_1-logloss:0.60893
[156]	validation_0-logloss:0.60548	validation_1-logloss:0.60892
[157]	validation_0-logloss:0.60546	validation_1-logloss:0.60886
[158]	validation_0-logloss:0.60543	validation_1-logloss:0.60891
[159]	validation_0-logloss:0.60534	validation_1-logloss:0.60881
[160]	validation_0-logloss:0.60519	validation_1-logloss:0.60869
[161]	validation_0-logloss:0.60508	validation_1-logloss:0.60853
[162]	validation_0-logloss:0.60506	validation_1-logloss:0.60856
[163]	validation_0-logloss:0.60500	validation_1-logloss:0.60852
[164]	validation_0-logloss:0.60498	validation_1-logloss:0.60850
[165]	validation_0-logloss:0.60494	validation_1-logloss:0.60852
[166]	validation_0-logloss:0.60489	validation_1-logloss:0.60852
[167]	validation_0-logloss:0.60477	validation_1-logloss:0.60844
[168]	validation_0-logloss:0.60478	valid



[124]	validation_0-logloss:0.60573	validation_1-logloss:0.61151
[125]	validation_0-logloss:0.60564	validation_1-logloss:0.61138
[126]	validation_0-logloss:0.60561	validation_1-logloss:0.61140
[127]	validation_0-logloss:0.60558	validation_1-logloss:0.61148
[128]	validation_0-logloss:0.60554	validation_1-logloss:0.61148
[129]	validation_0-logloss:0.60533	validation_1-logloss:0.61148
[130]	validation_0-logloss:0.60528	validation_1-logloss:0.61144
[131]	validation_0-logloss:0.60514	validation_1-logloss:0.61132
[132]	validation_0-logloss:0.60503	validation_1-logloss:0.61128
[133]	validation_0-logloss:0.60491	validation_1-logloss:0.61121
[134]	validation_0-logloss:0.60477	validation_1-logloss:0.61126
[135]	validation_0-logloss:0.60472	validation_1-logloss:0.61137
[136]	validation_0-logloss:0.60451	validation_1-logloss:0.61123
[137]	validation_0-logloss:0.60448	validation_1-logloss:0.61134
[138]	validation_0-logloss:0.60447	validation_1-logloss:0.61129
[139]	validation_0-logloss:0.60437	valid



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.2min
[0]	validation_0-logloss:0.67173	validation_1-logloss:0.66490
[1]	validation_0-logloss:0.66992	validation_1-logloss:0.66318
[2]	validation_0-logloss:0.66876	validation_1-logloss:0.66197
[3]	validation_0-logloss:0.66745	validation_1-logloss:0.66090
[4]	validation_0-logloss:0.66583	validation_1-logloss:0.65983
[5]	validation_0-logloss:0.66288	validation_1-logloss:0.65684
[6]	validation_0-logloss:0.66187	validation_1-logloss:0.65571
[7]	validation_0-logloss:0.66061	validation_1-logloss:0.65508
[8]	validation_0-logloss:0.65913	validation_1-logloss:0.65395
[9]	validation_0-logloss:0.65800	validation_1-logloss:0.65308
[10]	validation_0-logloss:0.65626	validation_1-logloss:0.65146
[11]	validation_0-logloss:0.65528	validation_1-logloss:0.65037
[12]	validation_0-logloss:0.65448	validation_1-logloss:0.64962
[13]	validation_0-logloss:0.65347	validation_1-logloss:0.64868
[14]	validation_0-logloss:0.65251	validation_1-logloss:0.



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.2min
[0]	validation_0-logloss:0.67159	validation_1-logloss:0.66475
[1]	validation_0-logloss:0.66945	validation_1-logloss:0.66284
[2]	validation_0-logloss:0.66796	validation_1-logloss:0.66123
[3]	validation_0-logloss:0.66665	validation_1-logloss:0.66012
[4]	validation_0-logloss:0.66473	validation_1-logloss:0.65857
[5]	validation_0-logloss:0.66157	validation_1-logloss:0.65539
[6]	validation_0-logloss:0.66024	validation_1-logloss:0.65400
[7]	validation_0-logloss:0.65881	validation_1-logloss:0.65344
[8]	validation_0-logloss:0.65715	validation_1-logloss:0.65220
[9]	validation_0-logloss:0.65572	validation_1-logloss:0.65116
[10]	validation_0-logloss:0.65383	validation_1-logloss:0.64969
[11]	validation_0-logloss:0.65266	validation_1-logloss:0.64862
[12]	validation_0-logloss:0.65186	validation_1-logloss:0.64770
[13]	validation_0-logloss:0.65080	validation_1-logloss:0.64685
[14]	validation_0-logloss:0.64977	validation_1-logloss:0.



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.2min
[0]	validation_0-logloss:0.67158	validation_1-logloss:0.66483
[1]	validation_0-logloss:0.66936	validation_1-logloss:0.66285
[2]	validation_0-logloss:0.66800	validation_1-logloss:0.66143
[3]	validation_0-logloss:0.66666	validation_1-logloss:0.66040
[4]	validation_0-logloss:0.66495	validation_1-logloss:0.65892
[5]	validation_0-logloss:0.66168	validation_1-logloss:0.65562
[6]	validation_0-logloss:0.66036	validation_1-logloss:0.65430
[7]	validation_0-logloss:0.65877	validation_1-logloss:0.65363
[8]	validation_0-logloss:0.65715	validation_1-logloss:0.65255
[9]	validation_0-logloss:0.65582	validation_1-logloss:0.65156
[10]	validation_0-logloss:0.65387	validation_1-logloss:0.64997
[11]	validation_0-logloss:0.65275	validation_1-logloss:0.64923
[12]	validation_0-logloss:0.65190	validation_1-logloss:0.64823
[13]	validation_0-logloss:0.65085	validation_1-logloss:0.64729
[14]	validation_0-logloss:0.64975	validation_1-logloss:0.



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.3min
[0]	validation_0-logloss:0.67157	validation_1-logloss:0.66497
[1]	validation_0-logloss:0.66961	validation_1-logloss:0.66330
[2]	validation_0-logloss:0.66802	validation_1-logloss:0.66209
[3]	validation_0-logloss:0.66673	validation_1-logloss:0.66105
[4]	validation_0-logloss:0.66472	validation_1-logloss:0.65952
[5]	validation_0-logloss:0.66165	validation_1-logloss:0.65662
[6]	validation_0-logloss:0.66032	validation_1-logloss:0.65530
[7]	validation_0-logloss:0.65890	validation_1-logloss:0.65475
[8]	validation_0-logloss:0.65732	validation_1-logloss:0.65372
[9]	validation_0-logloss:0.65596	validation_1-logloss:0.65282
[10]	validation_0-logloss:0.65411	validation_1-logloss:0.65119
[11]	validation_0-logloss:0.65293	validation_1-logloss:0.65022
[12]	validation_0-logloss:0.65207	validation_1-logloss:0.64934
[13]	validation_0-logloss:0.65098	validation_1-logloss:0.64868
[14]	validation_0-logloss:0.64988	validation_1-logloss:0.



[146]	validation_0-logloss:0.59599	validation_1-logloss:0.61203
[147]	validation_0-logloss:0.59585	validation_1-logloss:0.61193
[148]	validation_0-logloss:0.59577	validation_1-logloss:0.61201
[149]	validation_0-logloss:0.59556	validation_1-logloss:0.61204
[150]	validation_0-logloss:0.59544	validation_1-logloss:0.61187
[151]	validation_0-logloss:0.59527	validation_1-logloss:0.61180
[152]	validation_0-logloss:0.59512	validation_1-logloss:0.61179
[153]	validation_0-logloss:0.59500	validation_1-logloss:0.61154
[154]	validation_0-logloss:0.59490	validation_1-logloss:0.61158
[155]	validation_0-logloss:0.59488	validation_1-logloss:0.61147
[156]	validation_0-logloss:0.59476	validation_1-logloss:0.61159
[157]	validation_0-logloss:0.59473	validation_1-logloss:0.61166
[158]	validation_0-logloss:0.59450	validation_1-logloss:0.61185
[159]	validation_0-logloss:0.59443	validation_1-logloss:0.61178
[160]	validation_0-logloss:0.59433	validation_1-logloss:0.61177
[161]	validation_0-logloss:0.59436	valid



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.2min
[0]	validation_0-logloss:0.67161	validation_1-logloss:0.66495
[1]	validation_0-logloss:0.66953	validation_1-logloss:0.66293
[2]	validation_0-logloss:0.66826	validation_1-logloss:0.66128
[3]	validation_0-logloss:0.66695	validation_1-logloss:0.66022
[4]	validation_0-logloss:0.66493	validation_1-logloss:0.65876
[5]	validation_0-logloss:0.66177	validation_1-logloss:0.65576
[6]	validation_0-logloss:0.66067	validation_1-logloss:0.65449
[7]	validation_0-logloss:0.65917	validation_1-logloss:0.65366
[8]	validation_0-logloss:0.65759	validation_1-logloss:0.65229
[9]	validation_0-logloss:0.65611	validation_1-logloss:0.65126
[10]	validation_0-logloss:0.65403	validation_1-logloss:0.64958
[11]	validation_0-logloss:0.65260	validation_1-logloss:0.64852
[12]	validation_0-logloss:0.65202	validation_1-logloss:0.64735
[13]	validation_0-logloss:0.65094	validation_1-logloss:0.64650
[14]	validation_0-logloss:0.64989	validation_1-logloss:0.



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.2min
[0]	validation_0-logloss:0.67154	validation_1-logloss:0.66470
[1]	validation_0-logloss:0.66954	validation_1-logloss:0.66276
[2]	validation_0-logloss:0.66801	validation_1-logloss:0.66118
[3]	validation_0-logloss:0.66662	validation_1-logloss:0.66007
[4]	validation_0-logloss:0.66478	validation_1-logloss:0.65881
[5]	validation_0-logloss:0.66169	validation_1-logloss:0.65571
[6]	validation_0-logloss:0.66044	validation_1-logloss:0.65421
[7]	validation_0-logloss:0.65888	validation_1-logloss:0.65352
[8]	validation_0-logloss:0.65725	validation_1-logloss:0.65248
[9]	validation_0-logloss:0.65586	validation_1-logloss:0.65143
[10]	validation_0-logloss:0.65422	validation_1-logloss:0.64993
[11]	validation_0-logloss:0.65303	validation_1-logloss:0.64837
[12]	validation_0-logloss:0.65225	validation_1-logloss:0.64770
[13]	validation_0-logloss:0.65121	validation_1-logloss:0.64677
[14]	validation_0-logloss:0.65020	validation_1-logloss:0.



[Pipeline] ..... (step 1 of 2) Processing Preprocessing, total= 1.8min
[0]	validation_0-logloss:0.67134	validation_1-logloss:0.66470
[1]	validation_0-logloss:0.66910	validation_1-logloss:0.66276
[2]	validation_0-logloss:0.66728	validation_1-logloss:0.66138
[3]	validation_0-logloss:0.66577	validation_1-logloss:0.66024
[4]	validation_0-logloss:0.66371	validation_1-logloss:0.65875
[5]	validation_0-logloss:0.66042	validation_1-logloss:0.65590
[6]	validation_0-logloss:0.65900	validation_1-logloss:0.65449
[7]	validation_0-logloss:0.65748	validation_1-logloss:0.65383
[8]	validation_0-logloss:0.65586	validation_1-logloss:0.65269
[9]	validation_0-logloss:0.65417	validation_1-logloss:0.65128




[10]	validation_0-logloss:0.65199	validation_1-logloss:0.64978
[11]	validation_0-logloss:0.65065	validation_1-logloss:0.64869
[12]	validation_0-logloss:0.64955	validation_1-logloss:0.64776
[13]	validation_0-logloss:0.64842	validation_1-logloss:0.64701
[14]	validation_0-logloss:0.64713	validation_1-logloss:0.64588
[15]	validation_0-logloss:0.64544	validation_1-logloss:0.64437
[16]	validation_0-logloss:0.64454	validation_1-logloss:0.64380
[17]	validation_0-logloss:0.64340	validation_1-logloss:0.64277
[18]	validation_0-logloss:0.64226	validation_1-logloss:0.64180
[19]	validation_0-logloss:0.64140	validation_1-logloss:0.64116
[20]	validation_0-logloss:0.63877	validation_1-logloss:0.63879
[21]	validation_0-logloss:0.63770	validation_1-logloss:0.63795
[22]	validation_0-logloss:0.63597	validation_1-logloss:0.63641
[23]	validation_0-logloss:0.63475	validation_1-logloss:0.63527
[24]	validation_0-logloss:0.63382	validation_1-logloss:0.63446
[25]	validation_0-logloss:0.63270	validation_1-logloss:

In [23]:
search.cv_results_

{'mean_fit_time': array([74.55978432, 74.19890904]),
 'std_fit_time': array([2.98052242, 1.57871439]),
 'mean_score_time': array([0.09182014, 0.07949972]),
 'std_score_time': array([0.01702511, 0.01414406]),
 'param_gbm__max_depth': masked_array(data=[3, 4],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'gbm__max_depth': 3}, {'gbm__max_depth': 4}],
 'split0_test_score': array([0.660625, 0.65875 ]),
 'split1_test_score': array([0.645   , 0.646875]),
 'split2_test_score': array([0.6675, 0.67  ]),
 'split3_test_score': array([0.664375, 0.665625]),
 'split4_test_score': array([0.64875, 0.65   ]),
 'mean_test_score': array([0.65725, 0.65825]),
 'std_test_score': array([0.00882645, 0.00883353]),
 'rank_test_score': array([2, 1], dtype=int32)}

In [24]:
roc_auc_score(y_test,search.best_estimator_.predict_proba(X_test[num_features+[fff for fff in feature_lists_2]])[:,1])

0.7022059912572838

In [26]:
joblib.dump(search.best_estimator_, 'pipeline_xgb.pkl')

['pipeline_xgb.pkl']

In [27]:
type(search.best_estimator_)

sklearn.pipeline.Pipeline

In [28]:
dict_file=[{'feature_list':num_features+[fff for fff in feature_lists_2]},{'model_file':"pipeline_xgb.pkl"}]
with open(r'model_file.yaml', 'w') as file:
    documents = yaml.dump(dict_file, file)

In [29]:
with open(r'model_file.yaml') as file:
    documents = yaml.full_load(file)
    print(documents)

    for ddd in documents:
        print(ddd.keys())

[{'feature_list': ['number_inpatient', 'number_diagnoses', 'num_lab_procedures', 'num_medications', 'time_in_hospital', 'num_procedures', 'number_outpatient', 'number_emergency', 'medical_specialty', 'admission_source_id', 'diag_3_desc', 'diag_1_desc', 'admission_type_id', 'diag_2_desc', 'age', 'discharge_disposition_id']}, {'model_file': 'pipeline_xgb.pkl'}]
dict_keys(['feature_list'])
dict_keys(['model_file'])


In [30]:
mmm=search.best_estimator_.steps[-1][1]
mmm.best_iteration

161

In [31]:
final_layer=search.best_estimator_[-1]
final_layer

In [32]:
for c in range(len(final_layer.feature_importances_)):
    print(X_train[num_features+[fff for fff in feature_lists_2]].columns[c]+": "+str(final_layer.feature_importances_[c]))

number_inpatient: 0.19275641
number_diagnoses: 0.11179899
num_lab_procedures: 0.04194721
num_medications: 0.028055694
time_in_hospital: 0.028323285
num_procedures: 0.026455224
number_outpatient: 0.10191794
number_emergency: 0.06134672
medical_specialty: 0.06294054
admission_source_id: 0.042034157
diag_3_desc: 0.03731555
diag_1_desc: 0.041612193
admission_type_id: 0.03793671
diag_2_desc: 0.039701574
age: 0.03883462
discharge_disposition_id: 0.10702311


In [33]:
X_test[num_features+[fff for fff in feature_lists_2]]

Unnamed: 0,number_inpatient,number_diagnoses,num_lab_procedures,num_medications,time_in_hospital,num_procedures,number_outpatient,number_emergency,medical_specialty,admission_source_id,diag_3_desc,diag_1_desc,admission_type_id,diag_2_desc,age,discharge_disposition_id
2374,0,9,62,40,14,6,0,0,InternalMedicine,Emergency Room,Pulmonary collapse,"Diabetes with ketoacidosis, type II or unspeci...",Emergency,Paroxysmal supraventricular tachycardia,[40-50),Discharged to home
1784,0,5,1,16,3,3,6,0,?,,Malignant essential hypertension,"Obesity, unspecified",Not Available,Pure hypercholesterolemia,[50-60),Discharged to home
6301,0,9,54,14,7,0,0,0,InternalMedicine,Emergency Room,Coronary atherosclerosis of unspecified type o...,Simple chronic bronchitis,Emergency,Coronary atherosclerosis of unspecified type o...,[80-90),Discharged/transferred to another type of inp...
1600,0,5,45,36,4,5,0,0,?,Physician Referral,Diabetes mellitus without mention of complicat...,Coronary atherosclerosis of unspecified type o...,Elective,Postmyocardial infarction syndrome,[50-60),
7920,1,9,53,14,12,0,0,0,?,Transfer from another health care facility,Cerebral atherosclerosis,Care involving breathing exercises,Elective,Cerebral thrombosis without mention of cerebra...,[60-70),Discharged/transferred to home with home healt...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8623,0,9,4,13,1,0,0,0,?,Emergency Room,Headache,"Respiratory abnormality, unspecified",Emergency,"Urinary tract infection, site not specified",[70-80),Discharged/transferred to ICF
5928,0,9,50,15,3,0,0,0,?,Emergency Room,"Hypertensive chronic kidney disease, malignant...",Streptococcal septicemia,Urgent,Acute kidney failure,[60-70),Discharged/transferred to another short term h...
6714,0,6,64,13,3,0,0,0,?,Emergency Room,Alteration of consciousness,Acute myocardial infarction of anterolateral w...,Emergency,"Congestive heart failure, unspecified",[80-90),Discharged/transferred to SNF
5885,0,7,25,7,3,0,0,0,Emergency/Trauma,Emergency Room,Chronic maxillary sinusitis,Paroxysmal supraventricular tachycardia,Elective,Acute laryngopharyngitis,[50-60),Discharged to home


In [34]:
#get test set text predictions
test_pred=search.predict_proba(X_test[num_features+[fff for fff in feature_lists_2]])


In [35]:
test_pred=search.best_estimator_.predict_proba(X_test[num_features+[fff for fff in feature_lists_2]])


In [36]:
roc_auc_score(y_test,test_pred[:,1])

0.7022059912572838

In [37]:
log_loss(y_test, test_pred[:,1])

0.6045116858714573