### Prototyping the Analysis Pipeline for the [Stroke Risk Prediction]

# !!Note!! If "Shap" does not work use "!pip install "shap" and "xgboost"" or without "!" to install it first

In [12]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import sklearn
import shap

# Dataset preperation
Uploading the csv file to prepare for 

In [None]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.to_csv("healthcare-dataset-stroke-data.csv", index=False)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


Changeing the feature names for clarity and refinement for the users

In [None]:
df = df.rename(columns={
                          "id":"ID",
                          "gender": "Sex",
                          "age": "Age",
                          "hypertension": "Hypertension",
                          "heart_disease": "Heart Disease",
                          "ever_married": "Married",
                          "work_type": "Work Type",
                          "Residence_type": "Residence Type",
                          "avg_glucose_level": "Glucose",
                          "bmi": "BMI",
                          "smoking_status": "Smoking?",
                          "stroke": "Stroke"
                          })
df.head(10)

Unnamed: 0,ID,Sex,Age,Hypertension,Heart Disease,Married,Work Type,Residence Type,Glucose,BMI,Smoking?,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


Checking the data types of the dataset

In [None]:
print(df.dtypes)

ID                  int64
Sex                object
Age               float64
Hypertension        int64
Heart Disease       int64
Married            object
Work Type          object
Residence Type     object
Glucose           float64
BMI               float64
Smoking?           object
Stroke              int64
dtype: object


Certain datatypes are not valid or correct, all the "object" datatypes needs to be changed. First I grouped the features that i believe should be a certain datatype and then I convert them.

In [None]:
bool_cols = ["Hypertension", "Heart Disease", "Married", "Stroke"]
categorical_cols = ["Sex", "Residence Type", "Smoking?", "Work Type"]
numeric_cols = ["Age"]
string_cols = ["ID"]

df[bool_cols] = df[bool_cols].astype(bool)
df[categorical_cols] = df[categorical_cols].apply(lambda x: pd.Categorical(x))
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)
df[string_cols] = df[string_cols].astype(str)

print(df.dtypes)

ID                  object
Sex               category
Age                float64
Hypertension          bool
Heart Disease         bool
Married               bool
Work Type         category
Residence Type    category
Glucose            float64
BMI                float64
Smoking?          category
Stroke                bool
dtype: object


Checking the dataset if there are any missing values in every feature.

In [None]:
df.isnull().sum()

ID                  0
Sex                 0
Age                 0
Hypertension        0
Heart Disease       0
Married             0
Work Type           0
Residence Type      0
Glucose             0
BMI               201
Smoking?            0
Stroke              0
dtype: int64

Luckely, only "BMI" had missing values, changeing these values to 0 instead to make them still count.

In [None]:
df["BMI"] = df["BMI"].fillna(0)
df.isnull().sum()

ID                0
Sex               0
Age               0
Hypertension      0
Heart Disease     0
Married           0
Work Type         0
Residence Type    0
Glucose           0
BMI               0
Smoking?          0
Stroke            0
dtype: int64

In [None]:
#Checking data types
print(df.dtypes)

ID                  object
Sex               category
Age                float64
Hypertension          bool
Heart Disease         bool
Married               bool
Work Type         category
Residence Type    category
Glucose            float64
BMI                float64
Smoking?          category
Stroke                bool
dtype: object


In [None]:
df = df.drop(columns=['ID'])
print(df.dtypes)

Sex               category
Age                float64
Hypertension          bool
Heart Disease         bool
Married               bool
Work Type         category
Residence Type    category
Glucose            float64
BMI                float64
Smoking?          category
Stroke                bool
dtype: object


In [None]:
#Data processing for model trianing

#Changed bool and categorical veriables into integers with data types changes and one-hot coding
df = df.copy()

# Convert booleans to int
for col in df.select_dtypes(include="bool").columns:
    df[col] = df[col].astype(int)

# One-hot encode categorical columns
cat_cols = df.select_dtypes(include="category").columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

#Changing all features into Integers
df = df.astype(int)
df.dtypes.head(10)



Age                       int64
Hypertension              int64
Heart Disease             int64
Married                   int64
Glucose                   int64
BMI                       int64
Stroke                    int64
Sex_Male                  int64
Sex_Other                 int64
Work Type_Never_worked    int64
dtype: object

In [None]:
#Rearranged features
df = df[
    [
        'Age',
        'Hypertension',
        'Heart Disease',
        #'Married',  # removed married column
        'Glucose',
        'BMI',
        'Sex_Male',
        'Sex_Other',
        'Work Type_Never_worked',
        'Work Type_Private',
        'Work Type_Self-employed',
        'Work Type_children',
        'Residence Type_Urban',
        'Smoking?_formerly smoked',
        'Smoking?_never smoked',
        'Smoking?_smokes',
        'Stroke'
    ]
]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       5110 non-null   int64
 1   Hypertension              5110 non-null   int64
 2   Heart Disease             5110 non-null   int64
 3   Glucose                   5110 non-null   int64
 4   BMI                       5110 non-null   int64
 5   Sex_Male                  5110 non-null   int64
 6   Sex_Other                 5110 non-null   int64
 7   Work Type_Never_worked    5110 non-null   int64
 8   Work Type_Private         5110 non-null   int64
 9   Work Type_Self-employed   5110 non-null   int64
 10  Work Type_children        5110 non-null   int64
 11  Residence Type_Urban      5110 non-null   int64
 12  Smoking?_formerly smoked  5110 non-null   int64
 13  Smoking?_never smoked     5110 non-null   int64
 14  Smoking?_smokes           5110 non-null 

In [None]:
# Checking class imbalance status
df["Stroke"].value_counts()

Stroke
0    4861
1     249
Name: count, dtype: int64

In [None]:
# saving the data file
df.to_csv('clean_data_for_model_training.csv')

In [124]:
!pip install imbalanced-learn




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Create a new file for the processed data

ID                  0
Sex                 0
Age                 0
Hypertension        0
Heart Disease       0
Married             0
Work Type           0
Residence Type      0
Glucose             0
BMI               201
Smoking?            0
Stroke              0
dtype: int64

In [125]:
#Importing models for training

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    precision_recall_curve
)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE



In [126]:
# Adding random state and values
RANDOM_SEED = 42
TEST_SIZE = 0.2
CV_FOLDS = 5

In [127]:
# =========================
# 1) Load / prepare data
# =========================
df = pd.read_csv("clean_data_for_model_training.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Unnamed: 0                5110 non-null   int64
 1   Age                       5110 non-null   int64
 2   Hypertension              5110 non-null   int64
 3   Heart Disease             5110 non-null   int64
 4   Glucose                   5110 non-null   int64
 5   BMI                       5110 non-null   int64
 6   Sex_Male                  5110 non-null   int64
 7   Sex_Other                 5110 non-null   int64
 8   Work Type_Never_worked    5110 non-null   int64
 9   Work Type_Private         5110 non-null   int64
 10  Work Type_Self-employed   5110 non-null   int64
 11  Work Type_children        5110 non-null   int64
 12  Residence Type_Urban      5110 non-null   int64
 13  Smoking?_formerly smoked  5110 non-null   int64
 14  Smoking?_never smoked     5110 non-null 

In [128]:
#Droping index column
idx_cols = [c for c in df.columns if c.startswith("Unnamed")]
df = df.drop(columns=idx_cols)
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       5110 non-null   int64
 1   Hypertension              5110 non-null   int64
 2   Heart Disease             5110 non-null   int64
 3   Glucose                   5110 non-null   int64
 4   BMI                       5110 non-null   int64
 5   Sex_Male                  5110 non-null   int64
 6   Sex_Other                 5110 non-null   int64
 7   Work Type_Never_worked    5110 non-null   int64
 8   Work Type_Private         5110 non-null   int64
 9   Work Type_Self-employed   5110 non-null   int64
 10  Work Type_children        5110 non-null   int64
 11  Residence Type_Urban      5110 non-null   int64
 12  Smoking?_formerly smoked  5110 non-null   int64
 13  Smoking?_never smoked     5110 non-null   int64
 14  Smoking?_smokes           5110 non-null 

In [129]:
# 2) As the file was sorted by the target before saving, rows are shuffled
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# 3) Rebuild X, y and re-split
target = "Stroke"
X = df.drop(columns=[target])
y = df[target].astype(int)

# 0) Make sure all features are numeric and get the current column list
X = X.apply(pd.to_numeric, errors="coerce")
if X.isna().any().any():
    # if you truly have NaNs, decide how to impute; for now drop rows to keep moving
    keep = ~X.isna().any(axis=1)
    X, y = X.loc[keep].copy(), y.loc[keep].copy()

num_cols = X.columns.tolist()  # <- critical: build from the X you just made

# 1) Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_SEED
)

In [130]:
#Preparing SMOTE pipelines

prep_minmax   = ColumnTransformer([("num", MinMaxScaler(),   num_cols)], remainder="drop")
prep_standard = ColumnTransformer([("num", StandardScaler(), num_cols)], remainder="drop")
prep_none     = ColumnTransformer([("num", "passthrough",    num_cols)], remainder="drop")

def smote_pipe(prep, estimator):
    return ImbPipeline(steps=[
        ("prep",  prep),
        ("smote", SMOTE(random_state=RANDOM_SEED, k_neighbors=5)),
        ("clf",   estimator)
    ])


In [131]:
# =========================
# 3) Training 10 models (with SMOTE where possible)
# =========================
MODELS_TO_TEST = {
    "DT_3":    smote_pipe(prep_none, DecisionTreeClassifier(max_depth=3, random_state=RANDOM_SEED)),
    "DT_5":    smote_pipe(prep_none, DecisionTreeClassifier(max_depth=5, random_state=RANDOM_SEED)),
    "RF_50":   smote_pipe(prep_none, RandomForestClassifier(n_estimators=50,  max_depth=5, random_state=RANDOM_SEED, n_jobs=-1)),
    "RF_150":   smote_pipe(prep_none, RandomForestClassifier(n_estimators=150,  max_depth=5, random_state=RANDOM_SEED, n_jobs=-1)),
    "SVM_lin": smote_pipe(prep_minmax, SVC(kernel="linear", probability=True, random_state=RANDOM_SEED)),
    "SVM_rbf": smote_pipe(prep_minmax, SVC(kernel="rbf",    probability=True, random_state=RANDOM_SEED)),
    "KNN_3":   smote_pipe(prep_minmax, KNeighborsClassifier(n_neighbors=3)),
    "KNN_7":   smote_pipe(prep_minmax, KNeighborsClassifier(n_neighbors=7)),
    "NB_default": smote_pipe(prep_standard, GaussianNB()),
    "NB_smooth":  smote_pipe(prep_standard, GaussianNB(var_smoothing=1e-2)),
}


In [132]:
# =========================
# 4) Cross-validated evaluation on train
# =========================
SCORING = {
    "accuracy": "accuracy",
    "precision_macro": "precision_macro",
    "recall_macro": "recall_macro",
    "f1_macro": "f1_macro",
    "roc_auc": "roc_auc",
    "pr_auc": "average_precision",
}

cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_SEED)

rows = []
for name, clf in MODELS_TO_TEST.items():
    print(f"The model that has been trained: {name}")
    try:
        scores = cross_validate(
            clf, X_train, y_train, cv=cv, scoring=SCORING,
            return_train_score=False, n_jobs=-1, error_score="raise"
        )
        rows.append({
            "classifier_name": name,
            "fit_time": scores["fit_time"].mean(),
            "score_time": scores["score_time"].mean(),
            "cv_acc": scores["test_accuracy"].mean(),
            "cv_prec_macro": scores["test_precision_macro"].mean(),
            "cv_rec_macro": scores["test_recall_macro"].mean(),
            "cv_f1_macro": scores["test_f1_macro"].mean(),
            "cv_roc_auc": scores["test_roc_auc"].mean(),
            "cv_pr_auc": scores["test_pr_auc"].mean(),
        })
    except Exception as e:
        print(f"❌ {name} failed: {type(e).__name__}: {e}")

cv_results = pd.DataFrame(rows).sort_values("cv_pr_auc", ascending=False).reset_index(drop=True)
print("\n=== CV summary (sorted by PR AUC) ===")
print(cv_results.round(4))

The model that has been trained: DT_3
The model that has been trained: DT_5
The model that has been trained: RF_50
The model that has been trained: RF_150
The model that has been trained: SVM_lin
The model that has been trained: SVM_rbf
The model that has been trained: KNN_3
The model that has been trained: KNN_7
The model that has been trained: NB_default
The model that has been trained: NB_smooth

=== CV summary (sorted by PR AUC) ===
  classifier_name  fit_time  score_time  cv_acc  cv_prec_macro  cv_rec_macro  \
0         SVM_lin    4.1719      0.1549  0.7566         0.5647        0.7768   
1          RF_150    0.4810      0.1724  0.8139         0.5644        0.7187   
2       NB_smooth    0.0132      0.0155  0.4584         0.5387        0.7011   
3           RF_50    0.1957      0.1062  0.8170         0.5596        0.6967   
4      NB_default    0.0138      0.0159  0.1923         0.5264        0.5707   
5         SVM_rbf    4.5313      0.3593  0.8097         0.5515        0.6712   

In [None]:
# Save the results
cv_results.to_csv("cv_results_with_smote.csv", index=False)
test_results.to_csv("test_results_with_smote.csv", index=False)

# As per the results table and the aim of the model dashboard to predict stroke cases, the model with the highest recall and relatively high other matrics, SVM_liner model is selected as the final model

In [148]:
best = MODELS_TO_TEST["SVM_lin"].fit(X_train, y_train)
y_prob = best.predict_proba(X_test)[:,1]

from sklearn.metrics import precision_recall_curve
prec, rec, thr = precision_recall_curve(y_test, y_prob)
f1 = 2*(prec*rec)/(prec+rec+1e-9)
thr_star = thr[max(0, f1.argmax()-1)]

from sklearn.metrics import classification_report, average_precision_score, roc_auc_score
y_pred = (y_prob >= thr_star).astype(int)
print("PR AUC:", average_precision_score(y_test, y_prob))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Threshold:", thr_star)
print(classification_report(y_test, y_pred, digits=4))

PR AUC: 0.17015005995900492
ROC AUC: 0.7739506172839506
Threshold: 0.5
              precision    recall  f1-score   support

           0     0.9785    0.7500    0.8492       972
           1     0.1227    0.6800    0.2080        50

    accuracy                         0.7466      1022
   macro avg     0.5506    0.7150    0.5286      1022
weighted avg     0.9367    0.7466    0.8178      1022



--------------------------------------------------------------------------------------------------------------------

## Exporting a pretrained model

In the dashboard, you should load a pre-trained model that was designed and evaluated in the jupyter notebook. You can do it with `pickle` or any alternative for the same purpose.

In [149]:
import pickle

# === Save model ===
# Folder path to where your Streamlit or dashboard can access it
file_path = "../assets/trained_model_final.pickle"

# 'pipe' is your trained pipeline from the final SVM code
data_to_save = best
# Create a binary object and save the trained model
with open(file_path, "wb") as writeFile:
    pickle.dump(data_to_save, writeFile)

print(f"Model saved successfully to {file_path}")

Model saved successfully to ../assets/trained_model_final.pickle


In [150]:
# Load model
pre_trained_model_path = "../assets/trained_model_final.pickle"

# Load the model
with open(pre_trained_model_path, "rb") as readFile:
    loaded_model = pickle.load(readFile)

print("Model loaded successfully!")

Model loaded successfully!


In [151]:
loaded_model

0,1,2
,steps,"[('prep', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,
