In [54]:
# Import every library the notebook uses, with the usual aliases

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [55]:
# Read the stroke dataset from the local CSV file into a DataFrame
# Source: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

raw_df = pd.read_csv(filepath_or_buffer="stroke.csv")

In [56]:
# Preview the raw data: display its first five rows

raw_df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [57]:
# Remove the nominal "id" column; drop() returns a new frame, so raw_df is left untouched

id_less_df = raw_df.drop("id", axis="columns")
id_less_df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [58]:
# Summarize the numeric columns: count, mean, std, min, quartiles and max

id_less_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [59]:
# Count the missing values in every column
# Note that pandas already converted the string "N/A" to NaN when the CSV was read

id_less_df.isna().sum(axis="index")

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [60]:
# Replace nulls in a feature-appropriate way
# "bmi" is the only column with missing values (201 of 5110 rows). It is a
# continuous feature, so every gap is filled with the mean of the observed values.
# np and SimpleImputer are imported in the notebook's first cell.

non_null_df = id_less_df.copy()
# Keep the fitted imputer in a named variable so the learned mean can be
# inspected or re-applied to other data later on.
bmi_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
non_null_df[["bmi"]] = bmi_imputer.fit_transform(non_null_df[["bmi"]])
# Guard: no column may contain a null after imputation.
assert non_null_df.isnull().sum().sum() == 0, "unexpected nulls remain after imputation"
non_null_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [61]:
# List the distinct values of each categorical feature
# Goal: spot typos or spelling variants that actually denote the same value

CATEGORICAL_FEATURES = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]
for column_name in CATEGORICAL_FEATURES:
    distinct_values = non_null_df[column_name].unique()
    print(column_name, distinct_values)

gender ['Male' 'Female' 'Other']
ever_married ['Yes' 'No']
work_type ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type ['Urban' 'Rural']
smoking_status ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [62]:
# Expand each categorical feature into one indicator column per distinct value

one_hot_encoded_df = pd.get_dummies(
    data=non_null_df,
    columns=CATEGORICAL_FEATURES,
)
one_hot_encoded_df.head(n=5)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
1,61.0,0,0,202.21,28.893237,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0


In [63]:
# Standardize continuous features: each listed column is rescaled to zero mean
# and unit variance. The remaining columns are left as they are.
# StandardScaler is imported in the notebook's first cell.

CONTINUOUS_FEATURES = ["age", "avg_glucose_level", "bmi"]
standardized_df = one_hot_encoded_df.copy()
# Keep the fitted scaler in a named variable so the same scaling can be
# re-applied to other data (or inverted) later on.
continuous_scaler = StandardScaler()
standardized_df[CONTINUOUS_FEATURES] = continuous_scaler.fit_transform(standardized_df[CONTINUOUS_FEATURES])
standardized_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.051434,0,1,2.706375,1.001234,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
1,0.78607,0,0,2.121559,4.615554e-16,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
2,1.62639,0,1,-0.005028,0.4685773,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,0.255342,0,0,1.437358,0.7154182,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,1.582163,1,0,1.501184,-0.6357112,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0


In [64]:
# Extra: an alternative to standardizing the continuous features.
# Each continuous column is first cut into 4 equal-width intervals, then every
# non-numeric column (the original categoricals and the new intervals) is
# expanded into indicator columns.

binary_df = pd.get_dummies(
    non_null_df.assign(
        **{name: pd.cut(non_null_df[name], bins=4) for name in CONTINUOUS_FEATURES}
    )
)
binary_df.head(n=5)

Unnamed: 0,hypertension,heart_disease,stroke,gender_Female,gender_Male,gender_Other,"age_(-0.00192, 20.56]","age_(20.56, 41.04]","age_(41.04, 61.52]","age_(61.52, 82.0]",...,"avg_glucose_level_(163.43, 217.585]","avg_glucose_level_(217.585, 271.74]","bmi_(10.213, 32.125]","bmi_(32.125, 53.95]","bmi_(53.95, 75.775]","bmi_(75.775, 97.6]",smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,1,0,1,0,0,0,0,1,...,0,1,0,1,0,0,0,1,0,0
1,0,0,1,1,0,0,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
2,0,1,1,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
3,0,0,1,1,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
4,1,0,1,1,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,1,0
