In [1]:
# Imports for data processing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline

# Imports for ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Imports for evaluation metrics
from sklearn.metrics import roc_auc_score, accuracy_score

# Other imports
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys

# Ensure reproducibility
random_state = 0

## Import data and perform train and validation split

In [2]:
# Import data
datapath = "data/healthcare-dataset-stroke-data-train.csv"
df = pd.read_csv(datapath)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,82.0,0,1,Yes,Private,Urban,144.90,26.4,smokes,1
1,Male,4.0,0,0,No,children,Rural,106.22,16.7,Unknown,0
2,Male,58.0,0,0,Yes,Private,Urban,79.95,25.9,never smoked,0
3,Female,20.0,0,0,No,Private,Rural,96.57,34.1,never smoked,0
4,Female,10.0,0,0,No,children,Rural,69.84,13.7,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
4083,Female,51.0,0,0,Yes,Self-employed,Urban,232.89,34.0,smokes,0
4084,Male,64.0,0,1,Yes,Private,Urban,191.61,37.5,smokes,1
4085,Male,37.0,0,0,Yes,Self-employed,Rural,82.43,39.1,Unknown,0
4086,Female,22.0,0,0,No,Private,Rural,62.00,32.7,smokes,0


In [3]:
# Split the data into features and label
features = [col for col in list(df.columns) if col!="stroke"]
X, y = df[features], df["stroke"]

## Create the data preprocessor

Create the data preprocessing steps according to the EDA performed in file `1-eda.ipynb`:
- Impute missing values with the mean
- Perform one hot encoding on categorical columns

The EDA, however, misses one important step, which is the scaling of the data. It was not discussed in EDA because it actually makes it harder to understand the data. However, since we are using models that evaluate distances between datapoints, it is very important to scale our data.

Since the models we will be using do not assume normally distributed data, all features will be normalized in the [0, 1] range.

To summarize, the following steps are performed for preprocessing the data:
- Impute missing values with mean for the numerical columns
- One hot encoding for categorical columns
- MinMaxNormalization for every column

In [5]:
# Create the mean imputer and feature scaler
num_features = [col for col in features if df[col].dtype in ["int64", "float64"]]
imputer = SimpleImputer()
scaler = MinMaxScaler()

# Create numerical preprocessor
num_preprocessor = Pipeline(steps=[
    ("imputer", imputer),
    ("scaler", scaler)
])

# Create the one-hot encoder
handle_unknown = "error"
one_hot_features = ["work_type", "smoking_status"]
one_hot_encoder = OneHotEncoder(handle_unknown=handle_unknown)

# Create ordinal encoder
cat_features = [col for col in features if col not in num_features]
ordinal_features = [col for col in cat_features if col not in one_hot_features]
ordinal_encoder = OrdinalEncoder()

# Create categorical preprocessor
cat_preprocessor = ColumnTransformer(transformers=[
    ("onehot", one_hot_encoder, one_hot_features),
    ("ordinal", ordinal_encoder, ordinal_features)
])

# Create the preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("num_preprocessor", num_preprocessor, num_features),
    ("cat_preprocessor", cat_preprocessor, cat_features)
])

Finally, before going into the model selection phase, it is important to recall that the used dataset is imbalanced. More specifically, there are a lot fewer datapoints of patients who actually had a stroke. 

To compensate for their smaller frequency, we can increase their importance by giving them more weight. The following code cell calculates the class weights for the training data.

In [6]:
def get_class_weights(X, y):
    # Create the class weights due to data imbalance
    largest_class_size = max([X[y==c].shape[0] for c in y.unique()])
    class_weights = {c: largest_class_size/X[y==c].shape[0] for c in y.unique()}
    return class_weights
    
get_class_weights(X, y)

{1: 20.181347150259068, 0: 1.0}

However, some models are can't handle this type of argument to produce their results. For a more uniform approach across all model, an over-sampling technique, `SMOTE`, is applied using the `imblearn` package. This technique generates new datapoints for the minority class to balance the dataset.

In [7]:
# Create dataset balancer
resampler = SMOTE(random_state=random_state)

In [8]:
# Check that SMOTE balances the dataset
# NOTE: SMOTE here is applied to all data, later the data is split into training and validation, only training data should use SMOTE
X_res, y_res = resampler.fit_resample(preprocessor.fit(X, y).transform(X), y)
get_class_weights(X_res, y_res)

{1: 1.0, 0: 1.0}

As we can see from the class weights, SMOTE does indeed balance our dataset!

## Model selection

To perform model selection, a few models were considered to solve this binary classification problem:
- Logistic regression classifier
- Support vector classifier
- Random forest classifier

They are all fitted to the training data according to the class weights and later evaluated on the validation dataset using the area under the ROC curve.

In [9]:
# Create model pool to select from
model_pool = [
    {"name": "LR", "model": LogisticRegression(random_state=random_state), "train_score": 0, "val_score": 0},
    {"name": "SVM", "model": SVC(random_state=random_state), "train_score": 0, "val_score": 0},
    {"name": "RFC", "model": RandomForestClassifier(random_state=random_state), "train_score": 0, "val_score": 0},
    {"name": "KNN", "model": KNeighborsClassifier(), "train_score": 0, "val_score": 0},
    {"name": "DTC", "model": DecisionTreeClassifier(random_state=random_state), "train_score": 0, "val_score": 0},
    {"name": "GNB", "model": GaussianNB(), "train_score": 0, "val_score": 0},
    {"name": "XGB", "model": XGBClassifier(), "train_score": 0, "val_score": 0},
    {"name": "NN", "model": MLPClassifier(random_state=random_state, max_iter=500, early_stopping=True), "train_score": 0, "val_score": 0}
]

# Define the metric function and number of kfold splits
metric = roc_auc_score
n_splits = 5

# Create folds for cross-validation
kf = KFold(n_splits=n_splits, random_state=random_state)

# Iterate each model in the pool
model_bar = tqdm(model_pool, total=len(model_pool), desc="Iterating models", position=0, file=sys.stdout)
for model_dict in model_bar:
    
    # Get dictionary values
    name = model_dict["name"]
    model = model_dict["model"]
    
    # Define the training pipeline using the model
    train_pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("resampler", resampler),
        ("model", model)
    ])
    
    # Iterate folds for cross-validation
    kfold_bar = tqdm(enumerate(kf.split(X)), total=n_splits, desc="Cross-validating", position=1, file=sys.stdout)
    for fold, (train_idx, val_idx) in kfold_bar:
        
        # Define training and validation sets
        X_train, X_val = X.loc[train_idx], X.loc[val_idx]
        y_train, y_val = y.loc[train_idx], y.loc[val_idx]
        
        # Fit pipeline to training data
        train_pipe = train_pipe.fit(X_train, y_train)
        
        # Get trained preprocessor and model
        trained_preprocessor = train_pipe.get_params()["preprocessor"]
        trained_model = train_pipe.get_params()["model"]
        
        # Save train data score
        y_train_pred = train_pipe.predict(X_train)
        score_train = roc_auc_score(y_train, y_train_pred)
        model_dict["train_score"] += score_train / n_splits
        
        # Define validation pipeline (remove SMOTE)
        val_pipe = Pipeline(steps=[
            ("preprocessor", trained_preprocessor),
            ("model", trained_model)
        ])
        
        # Save validation data score
        y_val_pred = val_pipe.predict(X_val)
        score_val = roc_auc_score(y_val, y_val_pred)
        model_dict["val_score"] += score_val / n_splits
        
    # Print final results
    score_train = model_dict["train_score"]
    score_val = model_dict["val_score"]
    tqdm.write(f"Model: {name} | Train score: {score_train:.3f} | Val score: {score_val:.3f}")

Cross-validating: 100%|██████████| 5/5 [00:01<00:00,  3.87it/s]
Model: LR | Train score: 0.776 | Val score: 0.759      
Cross-validating: 100%|██████████| 5/5 [00:04<00:00,  1.24it/s]
Model: SVM | Train score: 0.829 | Val score: 0.678             
Cross-validating: 100%|██████████| 5/5 [00:02<00:00,  2.09it/s]
Model: RFC | Train score: 1.000 | Val score: 0.556             
Cross-validating: 100%|██████████| 5/5 [00:00<00:00,  5.32it/s]
Model: KNN | Train score: 0.937 | Val score: 0.588             
Cross-validating: 100%|██████████| 5/5 [00:00<00:00, 18.58it/s]
Model: DTC | Train score: 1.000 | Val score: 0.572             
Cross-validating: 100%|██████████| 5/5 [00:00<00:00, 36.24it/s]
Model: GNB | Train score: 0.649 | Val score: 0.647             
Cross-validating: 100%|██████████| 5/5 [00:02<00:00,  2.37it/s]
Model: XGB | Train score: 0.983 | Val score: 0.554             
Cross-validating: 100%|██████████| 5/5 [00:11<00:00,  2.25s/it]
Model: NN | Train score: 0.826 | Val score: 0.67

As we can see from these trained models, the one that faired better was the Logistic Regressor. However, both the Support Vector Machine classifier and the Neural Network also performed quite well. The Neural Network is especially promising, as it is very likely that performing hyperparameter tuning will boost its performance quite significantly.