In [1]:
%config InlineBackend.figure_format = "svg"

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os

The best models have been selected and trained in the notebook `2-model.ipynb`. Together with the data pipeline, they have been saved in the `models` directory. This notebook will load these trained pipelines and evaluate them on the test data according to various metrics.

## Loading the test data

In [2]:
# Location of the held-out test split, relative to the notebook's working directory
datapath = "data/healthcare-dataset-stroke-data-test.csv"

# Read the CSV into a DataFrame and show it as the cell output
df = pd.read_csv(filepath_or_buffer=datapath)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.00,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,69.00,0,0,No,Private,Urban,94.39,22.8,never smoked,1
2,Female,52.00,1,0,Yes,Self-employed,Urban,233.29,48.9,never smoked,1
3,Male,71.00,0,0,Yes,Private,Urban,102.87,27.2,formerly smoked,1
4,Male,80.00,0,0,Yes,Self-employed,Rural,104.12,23.5,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
1017,Female,45.00,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
1018,Male,1.08,0,0,No,children,Rural,79.15,17.4,Unknown,0
1019,Male,57.00,0,0,Yes,Govt_job,Rural,76.62,28.2,never smoked,0
1020,Female,45.00,0,0,Yes,Private,Urban,97.95,24.5,Unknown,0


In [3]:
# Split the dataframe into features and labels: every column except the
# "stroke" target is kept as a feature, in the original column order.
# NOTE(review): this also keeps the "Unnamed: 0" column that the loaded frame
# contains; confirm the saved pipelines select their inputs by name.
features = df.columns[df.columns != "stroke"].tolist()

# Model inputs and target labels for the evaluation
X_test = df[features]
y_test = df["stroke"]

## Loading the training pipelines

In [4]:
# Trained pipelines, keyed by model name (the file name up to its first ".")
models = {}

# Iterate all trained models in sorted order: os.listdir returns entries in
# arbitrary order, so sorting keeps the dict order reproducible between runs
for filename in sorted(os.listdir("models")):

    path = os.path.join("models", filename)

    # Skip hidden entries and anything that is not a regular file (for example
    # Jupyter's .ipynb_checkpoints directory), which would otherwise make
    # open()/pickle.load() fail
    if filename.startswith(".") or not os.path.isfile(path):
        continue

    # Get model name
    name = filename.split(".")[0]

    # Load model with pickle
    # NOTE: unpickling can execute arbitrary code, so only load files from a
    # trusted "models" directory
    with open(path, "rb") as f:
        model = pickle.load(f)

    # Save into models
    models[name] = model

models

{'svc': Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num_preprocessor',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer()),
                                                                   ('scaler',
                                                                    MinMaxScaler())]),
                                                   ['age', 'hypertension',
                                                    'heart_disease',
                                                    'avg_glucose_level',
                                                    'bmi']),
                                                  ('cat_preprocessor',
                                                   ColumnTransformer(transformers=[('onehot',
                                                                                    OneHotEncoder(handle_unknown='i

## Make predictions with the models

In [5]:
# The predict_proba output of every loaded pipeline on the test features,
# keyed by model name
y_preds = {}
for model_name, pipeline in models.items():
    y_preds[model_name] = pipeline.predict_proba(X_test)

TODO:
- Accuracy
- ROC Curve and AUC
- Confusion Matrix
- Precision, Recall, F1
- Distributions of predicted vs. true labels