## AutoML Process
###  1) Import Libraries and Read data
###  2) Define target and determine numeric, categorical, and text
###  3) Define partitioning method and CV method
###  4) Run Competition
###  5) Save best model as .pkl file


## Let's begin!

###  1) Import Libraries and Read data

In [8]:
from copy import copy
import joblib

import numpy as np
from numpy import inf
import pandas as pd
from datetime import datetime
import functools
import matplotlib.pyplot as plt  

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  #RF and GBM algorithm
from sklearn.linear_model import ElasticNet, SGDClassifier
from sklearn.model_selection import GridSearchCV   #Performing grid search
from sklearn import preprocessing, neighbors, metrics
import sklearn
# Choose the module by what is importable instead of comparing version strings.
# A string comparison such as `sklearn.__version__ < '0.20'` is lexicographic, so
# it misorders versions (e.g. '0.9' sorts after '0.20') and it also routes
# 0.18/0.19 to the deprecated `cross_validation` module even though
# `model_selection` is already available there.
try:
    from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, PredefinedSplit
except ImportError:
    # Fallback for old scikit-learn releases that predate `sklearn.model_selection`.
    from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold, PredefinedSplit

import scipy.stats as st
from sklearn.metrics import mean_absolute_error, accuracy_score, log_loss, make_scorer, auc, roc_auc_score


%matplotlib inline

In [9]:
import automl
from automl import automl_utils
from automl import StackLayer, TextElasticNetBinary, MissingDataHandler

In [10]:
# from pipelines import gbm_classifier_pipeline, rf_classifier_pipeline, all_tree_classifier_pipeline

In [11]:
# import importlib
# importlib.reload(automl)

In [12]:
# Reference (presumably the paper describing this dataset - confirm):
# https://www.hindawi.com/journals/bmri/2014/781670/
input_data = pd.read_csv("diabetes_data.csv")

# Preview a few random rows; the fixed seed keeps the displayed sample identical
# across "Restart Kernel -> Run All" executions.
input_data.sample(5, random_state=1234)

Unnamed: 0,rowID,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,diag_1_desc,diag_2_desc,diag_3_desc
1338,1339,Caucasian,Female,[40-50),?,Elective,,Transfer from a hospital,10,?,...,No,No,No,No,Ch,Yes,False,Coronary atherosclerosis of unspecified type o...,Postmyocardial infarction syndrome,Pure hypercholesterolemia
9848,9849,Caucasian,Female,[60-70),?,Emergency,Discharged to home,Emergency Room,2,CM,...,No,No,No,No,No,Yes,False,Malignant hypertensive heart disease without h...,Hyperosmolality and/or hypernatremia,"Obesity, unspecified"
2151,2152,Caucasian,Female,[50-60),?,Urgent,Discharged to home,Transfer from a hospital,3,?,...,No,No,No,No,No,No,False,"Respiratory abnormality, unspecified",Achalasia and cardiospasm,Femoral hernia without mention of obstruction ...
298,299,Caucasian,Female,[80-90),?,Emergency,Discharged/transferred to ICF,Transfer from another health care facility,2,HM,...,No,No,No,No,Ch,Yes,False,Amnestic disorder in conditions classified els...,Diabetes mellitus without mention of complicat...,Malignant essential hypertension
181,182,Caucasian,Male,[30-40),?,Emergency,Discharged/transferred to another short term h...,Emergency Room,5,?,...,No,No,No,No,No,Yes,False,"Congestive heart failure, unspecified",Endomyocardial fibrosis,Hyperosmolality and/or hypernatremia


In [13]:
# Location of the serialized pipeline and name of the label column.
model_file = 'pipeline.pkl'
target = 'readmitted'

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
best_model = joblib.load(model_file)

# Input column names as recorded on the loaded model (`feature_names_in_`),
# converted to a plain list so it can be used to index the DataFrame.
feature_list = list(best_model.feature_names_in_)

In [14]:
# Separate the label column from the columns the model expects as input.
y = input_data[target]
X = input_data[feature_list]

###  2) Make predictions

In [15]:
# Seeded split so the evaluation rows are the same on every run.
# NOTE(review): test_size=0.8 places 80% of the rows in the test set - confirm
# this matches the split used when the saved model was trained, otherwise the
# metrics below may be computed on rows the model has already seen.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=1234,
)

In [17]:
# Class-probability predictions for the test rows, one column per class.
# `feature_list` holds the model's own `feature_names_in_`, so the columns are
# passed in the order the model reports.
test_features = X_test[feature_list]
test_pred = best_model.predict_proba(test_features)

In [18]:
# ROC AUC on the test set, scored with the second predict_proba column
# (presumably the probability of the positive class, i.e. best_model.classes_[1] - confirm).
roc_auc_score(y_true=y_test, y_score=test_pred[:, 1])

0.6737188143206274

In [19]:
# Accuracy after turning probabilities into 0/1 labels. np.round acts as a 0.5
# threshold here; it rounds half to even, so a probability of exactly 0.5 maps to 0.
accuracy_score(y_true=y_test, y_pred=np.round(test_pred[:, 1]))

0.64925

In [20]:
# Log loss (cross-entropy) of the second-column probabilities against the true labels.
log_loss(y_true=y_test, y_pred=test_pred[:, 1])

0.6264990652423544