In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

from processing.preprocessors import (BinaryEncoder, CleanStrings,
                                      ColumnDropperTransformer)


In [2]:
# Load the raw stroke dataset from a CSV file addressed relative to the working directory.
data = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")

In [3]:
# Normalise the column labels: lower-case them and replace spaces with underscores.
columns = [label.lower().replace(" ", "_") for label in data.columns]
data.columns = columns

In [4]:
# Column roles, stored as module-level names so they can be fed to the pipelines.
# NOTE: `id` shadows the Python builtin of the same name. It is kept as-is because
# the pipeline definition cell passes it to ColumnDropperTransformer.
id = 'id'
target = 'stroke'

# Numeric features, and the subset of them listed as containing missing values.
numerical_columns = ['avg_glucose_level', 'bmi']
missing_data_num_columns = ['bmi']

# Features treated as categorical.
categorical_columns = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'residence_type',
    'smoking_status',
]

# Single column handed to the BinaryEncoder step.
binary_encoder_column = "ever_married"

categorical_binary_classifier_column = ['heart_disease']

In [5]:
# Hold out 10% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on the target - consider `stratify=` if
# the class balance of the held-out set matters. TODO confirm before changing, since
# it would alter every downstream score.
train, test = train_test_split(data, random_state=1, test_size=0.1)

# Show the shape of both partitions as the cell output.
train.shape, test.shape




((4599, 12), (511, 12))

In [6]:
# Defining the Pipeline Objects
#
# preprocess_pipeline: steps built from the project's own transformers
# (imported from processing.preprocessors). It drops the id column, encodes the
# `binary_encoder_column`, and cleans the categorical columns.
preprocess_pipeline = Pipeline(
    steps=[
        ("dropping_id_column", ColumnDropperTransformer(column_list=[id])),
        ("binary_encoder", BinaryEncoder(column_name=binary_encoder_column)),
        ("cleaning_strings", CleanStrings(column_list=categorical_columns)),
    ]
)

# transform_pipeline: turns a list of record dicts into a dense numeric matrix,
# rescales every feature with MinMaxScaler, then fills missing values with a KNN
# imputer (add_indicator=True appends missing-value indicator columns).
transform_pipeline = Pipeline(
    steps=[
        ("dict_vectorizer", DictVectorizer(sparse=False)),
        ("scaling_data", MinMaxScaler()),
        ("multiple_numeric_values_imputation", KNNImputer(add_indicator=True)),
    ]
)

In [7]:
# Preprocessed data
# The target column is removed before preprocessing so that only features reach the
# transformers. `columns=` is used instead of a positional axis argument: passing
# `axis` positionally to DataFrame.drop is deprecated and rejected by pandas 2.x.
# The pipeline is fitted on the training frame only and then reused on the test frame.
preprocessed_train = preprocess_pipeline.fit_transform(train.drop(columns=[target]))
preprocessed_test = preprocess_pipeline.transform(test.drop(columns=[target]))

  preprocessed_train = preprocess_pipeline.fit_transform(train.drop([target],1))
  preprocessed_test = preprocess_pipeline.transform(test.drop([target],1))


In [8]:
# Convert each preprocessed frame into a list of per-row dicts, the input format
# consumed by the DictVectorizer step of transform_pipeline.
train_dict, test_dict = (
    frame.to_dict(orient='records')
    for frame in (preprocessed_train, preprocessed_test)
)

In [9]:
# Build the model inputs: fit the vectorise/scale/impute pipeline on the training
# records only, then apply the already-fitted pipeline to the test records.
X_train = transform_pipeline.fit_transform(train_dict)
X_test = transform_pipeline.transform(test_dict)

# Targets are taken from the original (un-preprocessed) splits.
y_train, y_test = train[target], test[target]

In [10]:
# Defining the models

# Class-imbalance ratio used as XGBoost's scale_pos_weight.
# Assumes the target is coded 0/1 so that sum() counts the positive rows - TODO confirm.
minority_class_instances = y_train.sum()
majority_class_instances = len(y_train) - minority_class_instances
weight_adjust_pos = majority_class_instances / minority_class_instances

# Single-valued grid entries (random_state, class_weight, objective, ...) are not
# searched over; listing them in the grid makes them part of `best_params_`, which
# the later cell uses to rebuild each estimator.

# Decision Tree
d_param_grid = dict(
    max_features=[None, 'sqrt', 'log2'],
    max_depth=[4, 5, 6, 7, 8, 10, 20],
    min_samples_leaf=[1, 3, 5, 10, 20],
    criterion=['gini', 'entropy'],
    random_state=[1],
    class_weight=['balanced'],
)
d_clf = DecisionTreeClassifier(class_weight='balanced', random_state=1)

# Logistic Regression: C spans 1e-3 .. 1e3 in seven log-spaced steps.
lr_param_grid = dict(
    C=np.logspace(start=-3, stop=3, num=7),
    max_iter=[500, 1000, 2000, 5000],
    class_weight=['balanced'],
    random_state=[1],
)
lr_clf = LogisticRegression()

# Xgboost
xgb_params = dict(
    eta=[0.05, 0.1, 0.2],
    max_depth=[4, 5, 6, 7, 8, 10, 20],
    min_child_weight=[1, 3, 5, 10, 20],
    n_estimators=[5, 10, 20, 50],
    scale_pos_weight=[weight_adjust_pos],
    objective=['binary:logistic'],
    seed=[1],
    verbosity=[1],
)
xgb_clf = xgb.XGBClassifier()


In [11]:
# Training the models
%time

d_clf_cv = GridSearchCV(estimator=d_clf, param_grid=d_param_grid, cv=5, scoring='roc_auc')
d_clf_cv.fit(X_train, y_train)

print("Decision tree optimised")

lr_clf_cv = GridSearchCV(estimator=lr_clf, param_grid=lr_param_grid, cv=5, scoring='roc_auc')
lr_clf_cv.fit(X_train, y_train)

print("Logistic regression optimised")


xgb_clf_cv = GridSearchCV(estimator=xgb_clf, param_grid=xgb_params, cv=5, scoring='roc_auc')
xgb_clf_cv.fit(X_train, y_train)

print("xgboost classifier optimised")

lr_best_params = lr_clf_cv.best_params_
d_best_params = d_clf_cv.best_params_
xgb_best_params = xgb_clf_cv.best_params_

CPU times: user 7 µs, sys: 18 µs, total: 25 µs
Wall time: 6.91 µs
Decision tree optimised
Logistic regression optimised
xgboost classifier optimised


In [12]:
# Rebuild one estimator per model family from its best grid-search parameters,
# then fit each of them on the full training set.
lr_best_clf, d_best_clf, xgb_best_clf = (
    LogisticRegression(**lr_best_params),
    DecisionTreeClassifier(**d_best_params),
    xgb.XGBClassifier(**xgb_best_params),
)

lr_best_clf.fit(X_train, y_train)
d_best_clf.fit(X_train, y_train)
# Left as the bare last expression so the fitted estimator is shown as the cell output.
xgb_best_clf.fit(X_train, y_train)

In [13]:
# Evaluating all the models
def evaluate(model, X_val, y_val):
    """Return the ROC AUC of ``model`` on a validation set.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``.
    X_val
        Feature matrix passed to ``model.predict_proba``.
    y_val
        True labels passed to ``roc_auc_score``.

    Returns
    -------
    The score computed by ``roc_auc_score`` from ``y_val`` and the second column
    of ``predict_proba`` (the positive class when labels are coded 0/1).
    """

    positive_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_scores)


# Score every refitted model on the held-out test set (decision tree, logistic
# regression, xgboost - in that order) and display the three ROC AUC values.
d_roc_auc, lr_roc_auc, xgb_roc_auc = (
    evaluate(fitted, X_val=X_test, y_val=y_test)
    for fitted in (d_best_clf, lr_best_clf, xgb_best_clf)
)
d_roc_auc, lr_roc_auc, xgb_roc_auc

(0.8565729935698975, 0.8718742557751846, 0.8774112884020004)

In [24]:
# Confusion Matrix
# normalize='true' normalises over the true labels, so every row sums to 1.
d_cm, lr_cm, xgb_cm = (
    confusion_matrix(y_true=y_test, y_pred=fitted.predict(X_test), normalize='true')
    for fitted in (d_best_clf, lr_best_clf, xgb_best_clf)
)

# Each matrix is followed by one blank line.
print(f"Decision Tree Confusion Matrix \n {d_cm}", end="\n\n")

print(f"Logistic Regression Confusion Matrix \n {lr_cm}", end="\n\n")

print(f"Xgboost Confusion Matrix \n {xgb_cm}", end="\n\n")

Decision Tree Confusion Matrix 
 [[0.82591093 0.17408907]
 [0.29411765 0.70588235]]

Logistic Regression Confusion Matrix 
 [[0.76315789 0.23684211]
 [0.11764706 0.88235294]]

Xgboost Confusion Matrix 
 [[0.76720648 0.23279352]
 [0.17647059 0.82352941]]



In [34]:
# Models and scores dict
# Maps a model-family name to its fitted estimator and its test-set ROC AUC.
model_performances = {
    "decision_tree": {
        "model": d_best_clf,
        "roc_auc": d_roc_auc,
    },
    "xgboost": {
        "model": xgb_best_clf,
        "roc_auc": xgb_roc_auc,
    },
    "logistic_regression": {
        "model": lr_best_clf,
        "roc_auc": lr_roc_auc,
    },
}

# (name, entry) pair with the highest ROC AUC. max() avoids sorting the whole dict
# just to read one element; on a tie the entry listed first in the dict wins.
best_model = max(model_performances.items(), key=lambda item: item[1]['roc_auc'])

# The winning family decides which bentoml framework is announced.
if best_model[0] == 'xgboost':
    print("Using bentoml xgboost framework")
else:
    print("Using bentoml scikit learn framework")

model = best_model[1]['model']
print(model)

Using bentoml xgboost framework
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.05,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.0500000007, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=20,
              missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, ...)
