In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score
from sklearn.preprocessing import OneHotEncoder

# imblearn imports
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# DiCE imports
import dice_ml
from dice_ml.utils import helpers  

## Import dataset using pandas

In [28]:
dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')
dataset = dataset.dropna()
dataset = dataset.drop(['id'],axis=1)
display(dataset.head())

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


## Returning counts of labels

In [29]:
dataset['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

Data is massively underbalanced. We will use random over and under sampling and see which technique leads to better performance

## Train-Test Split

In [30]:
from collections import Counter


target = dataset["stroke"]
X = dataset.drop(['stroke'],axis=1)

ros = RandomUnderSampler()
# resampling X, y
dataset, target = ros.fit_resample(X, target)
# new class distribution 
print(Counter(target))
print(type(dataset))

dataset = pd.concat([dataset,target],axis=1)

train_dataset, test_dataset, y_train, y_test = train_test_split(dataset,
                                                                target,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=target)
x_train = train_dataset.drop('stroke', axis=1)
x_test = test_dataset.drop('stroke', axis=1)

Counter({0: 209, 1: 209})
<class 'pandas.core.frame.DataFrame'>


## DICE object

In [31]:
d = dice_ml.Data(dataframe=train_dataset, continuous_features=['age','avg_glucose_level','bmi'], outcome_name='stroke')

## Pipeline

In [32]:
numerical = ['age','avg_glucose_level','bmi']
categorical = x_train.columns.difference(numerical)

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.

clf_rf = Pipeline(steps=[('preprocessor', transformations),
                      ('clf', RandomForestClassifier(random_state=42))])

clf_svm = Pipeline(steps=[('preprocessor', transformations),
                      ('clf', SVC(random_state=42))])

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
                   'clf__max_features': ['auto', 'sqrt'],
                   'clf__n_estimators': [int(x) for x in np.linspace(start = 128, stop = 384, num = 32)],
                   'clf__min_samples_split': [2, 5, 10],
                   'clf__max_depth': [int(x) for x in np.linspace(start = 5, stop = 8, num = 1)]}]

grid_params_svm = [{'clf__kernel': ['poly', 'rbf','sigmoid'], 
        'clf__C': [0.1, 1, 10, 100, 1000],
        'clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001]}]

jobs = -1

RF = GridSearchCV(estimator=clf_rf,
            param_grid=grid_params_rf,
            scoring='roc_auc',
            cv=10, 
            n_jobs=jobs)


SVM = GridSearchCV(estimator=clf_svm,
            param_grid=grid_params_svm,
            scoring='roc_auc',
            cv=10,
            n_jobs=jobs)

grids = [RF,SVM]

## Training models using Pipeline & GridSearchCV

In [33]:
# Creating a dict for our reference
grid_dict = {0: 'Random Forest', 
        1: 'Support Vector Machine'}

best_rf = {'score': 0, 'best_model':0}
best_svm = {'score': 0, 'best_model':0}

# Fit the grid search objects
print('Performing model optimizations...')

for idx, model in enumerate(grids):
    
    print('\nEstimator: %s' % grid_dict[idx])
    model.fit(x_train, y_train)
    print('Best params are : %s' % model.best_params_)
    
    # Best training data accuracy
    print('Best training accuracy: %.3f' % model.best_score_)
    
    # Predict on test data with best params
    y_pred = model.predict(x_test)
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    print('Precision score: {}'.format(precision_score(y_test, y_pred)))
    print('Recall score: {}'.format(recall_score(y_test, y_pred)))
    print('F1 score: {}'.format(f1_score(y_test, y_pred)))
    print('AUC-ROC score: {}'.format(roc_auc_score(y_test, y_pred)))
    
    # Track best (highest test accuracy) model
    if idx == 0:
        if roc_auc_score(y_test, y_pred) > best_rf['score']:
            best_rf['score'] = roc_auc_score(y_test, y_pred)
            best_rf['best_model'] = model.best_params_ 
    
    elif idx == 1:
        if roc_auc_score(y_test, y_pred) > best_svm['score']:
            best_svm['score'] = roc_auc_score(y_test, y_pred)
            best_svm['best_model'] = model.best_params_
            
# save dict to file
import json

with open('best_stroke_rf_params.txt', 'w') as file:
     file.write(json.dumps(best_rf)) # use `json.loads` to do the reverse
        
with open('best_stroke_svm_params.txt', 'w') as file:
     file.write(json.dumps(best_svm)) # use `json.loads` to do the reverse

Performing model optimizations...

Estimator: Random Forest
Best params are : {'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__max_features': 'auto', 'clf__min_samples_split': 10, 'clf__n_estimators': 161}
Best training accuracy: 0.679
Test set accuracy score for best params: 0.774 
Precision score: 0.7254901960784313
Recall score: 0.8809523809523809
F1 score: 0.7956989247311828
AUC-ROC score: 0.773809523809524

Estimator: Support Vector Machine
Best params are : {'clf__C': 0.1, 'clf__gamma': 0.01, 'clf__kernel': 'rbf'}
Best training accuracy: 0.647
Test set accuracy score for best params: 0.738 
Precision score: 0.6785714285714286
Recall score: 0.9047619047619048
F1 score: 0.7755102040816326
AUC-ROC score: 0.738095238095238
