In [1]:
import pandas as pd
import numpy as np
data= pd.read_csv('healthcare-dataset-stroke-data.csv')

to represent the data we call "data" because "data.describe()" would only take in account numerical columns

In [2]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
corr=data.corr()
corr["stroke"].sort_values(ascending=False)

stroke               1.000000
age                  0.245257
heart_disease        0.134914
avg_glucose_level    0.131945
hypertension         0.127904
bmi                  0.042374
id                   0.006388
Name: stroke, dtype: float64

In [5]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
train_set["stroke"].value_counts()[1]/train_set["stroke"].value_counts()[0]

0.047936426557293

In [7]:
test_set["stroke"].value_counts()[1]/test_set["stroke"].value_counts()[0]

0.06458333333333334

however a stratified split according to the age parameter could be better

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in split.split(data, data["age"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [9]:
strat_train_set["stroke"].value_counts()[1]/strat_train_set["stroke"].value_counts()[0]

0.051440329218107

In [10]:
strat_test_set["stroke"].value_counts()[1]/strat_test_set["stroke"].value_counts()[0]

0.050359712230215826

seems better than without 

# Data preparation
drop the "stroke" variable and then check for null values

In [11]:
train_X = strat_train_set.drop("stroke", axis=1) 
train_y = strat_train_set["stroke"].copy()
val_X= strat_test_set.drop("stroke", axis=1)
val_y= strat_test_set["stroke"].copy()

for col in train_X.columns: 
    if train_X[col].isnull().any():
        print(col)

bmi


so bmi is the only variable to be imputed

In [25]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()

X_train_num = train_X.drop(["gender","ever_married","work_type", "Residence_type","smoking_status"], axis=1)
X_val_num = val_X.drop(["gender","ever_married","work_type", "Residence_type","smoking_status"], axis=1)

imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train_num))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_val_num))

imputed_X_train.columns = X_train_num.columns 
imputed_X_valid.columns = X_val_num.columns

imputed_X_train.index = X_train_num.index 
imputed_X_valid.index = X_val_num.index

now we deal with categorical features, first look at differen vaues in categorical features

In [27]:
train_cat_X=strat_train_set[["gender","ever_married","work_type", "Residence_type","smoking_status"]]
val_cat_X=strat_test_set[["gender","ever_married","work_type", "Residence_type","smoking_status"]]
for col in train_cat_X: 
    print(train_cat_X[col].value_counts())

Female    2408
Male      1679
Other        1
Name: gender, dtype: int64
Yes    2687
No     1401
Name: ever_married, dtype: int64
Private          2330
Self-employed     681
children          543
Govt_job          515
Never_worked       19
Name: work_type, dtype: int64
Urban    2071
Rural    2017
Name: Residence_type, dtype: int64
never smoked       1521
Unknown            1231
formerly smoked     717
smokes              619
Name: smoking_status, dtype: int64


We will use a binary variable for "ever_married" and  "Residence_type"

In [28]:
train_cat_X = train_cat_X.assign(ever_married=(data["ever_married"] == 'Yes').astype(int))
train_cat_X = train_cat_X.assign(Residence_type=(data["Residence_type"] == 'Urban').astype(int))

val_cat_X=val_cat_X.assign(ever_married=(data["ever_married"] == 'Yes').astype(int))
val_cat_X=val_cat_X.assign(Residence_type=(data["Residence_type"] == 'Urban').astype(int))

train_cat_X

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
2284,Male,1,Self-employed,0,never smoked
3772,Male,1,Self-employed,0,formerly smoked
3628,Female,1,Private,1,never smoked
1467,Male,0,children,1,Unknown
1624,Male,1,Private,1,never smoked
...,...,...,...,...,...
4225,Female,1,Private,0,Unknown
2844,Female,1,Govt_job,0,smokes
1876,Female,1,Self-employed,0,never smoked
1943,Male,0,Private,1,formerly smoked


now use ordinal econder for "smoking_status"

In [29]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_cat_X["smoking_status"] = label_encoder.fit_transform(train_cat_X["smoking_status"])
val_cat_X["smoking_status"] = label_encoder.transform(val_cat_X["smoking_status"])

Use one-hot enconding for the others

In [31]:
from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train=train_cat_X.drop(["ever_married", "Residence_type","smoking_status"],axis=1)
OH_cols_valid=val_cat_X.drop(["ever_married", "Residence_type","smoking_status"],axis=1)

for col in ["gender","work_type"]:
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(OH_cols_train))
    OH_cols_valid = pd.DataFrame(OH_encoder.transform(OH_cols_valid))

    # One-hot removed index;
    OH_cols_train.index = train_cat_X.index
    OH_cols_valid.index = val_cat_X.index
    
OH_cols_train   

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
2284,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3772,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3628,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1467,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1624,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2844,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1876,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1943,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


Now get back all tables into unique one 

In [32]:
cat_train_col=pd.concat([train_cat_X, OH_cols_train], axis=1)
cat_val_col=pd.concat([val_cat_X, OH_cols_valid], axis=1)

cat_train_col=cat_train_col.drop(["gender","work_type"],axis=1)
cat_val_col=cat_val_col.drop(["gender","work_type"],axis=1)

processed_X_train=pd.concat([cat_train_col, imputed_X_train], axis=1)
processed_X_val=pd.concat([cat_val_col, imputed_X_valid], axis=1)

processed_X_train

Unnamed: 0,ever_married,Residence_type,smoking_status,0,1,2,3,4,5,6,...,12,13,14,15,id,age,hypertension,heart_disease,avg_glucose_level,bmi
2284,1,0,2,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,63990.0,52.0,1.0,0.0,192.37,49.2
3772,1,0,1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,61827.0,80.0,0.0,0.0,196.08,31.0
3628,1,1,2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,72398.0,73.0,1.0,0.0,110.38,26.3
1467,0,1,0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,47627.0,8.0,0.0,0.0,107.69,20.3
1624,1,1,2,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,60973.0,51.0,0.0,0.0,66.11,26.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,1,0,0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,14658.0,37.0,0.0,0.0,77.10,55.9
2844,1,0,3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,57944.0,35.0,0.0,0.0,56.12,24.2
1876,1,0,2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,57618.0,47.0,0.0,0.0,140.39,25.5
1943,0,1,1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,4627.0,34.0,0.0,0.0,69.09,36.9


check if there are double ID, so we know if the patient has occurred twice

In [33]:
data["id"].value_counts()

16380    1
58061    1
69918    1
23238    1
4807     1
        ..
35085    1
44375    1
5464     1
50522    1
49152    1
Name: id, Length: 5110, dtype: int64

Since every patient has occourred once we can now drop the id colum because it is useless

In [35]:
processed_X_train=processed_X_train.drop("id",axis=1)
processed_X_val=processed_X_val.drop("id",axis=1)

Now we have to standardize


VERY IMPORTANT: for time resaon I didn't check for outliers but it would be very important

In [36]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

XX_train = pd.DataFrame(scaler.fit_transform(processed_X_train))
XX_valid = pd.DataFrame(scaler.transform(processed_X_val))

XX_train.columns = processed_X_train.columns 
XX_valid.columns = processed_X_val.columns

XX_train.index = processed_X_train.index 
XX_valid.index = processed_X_val.index

XX_train

Unnamed: 0,ever_married,Residence_type,smoking_status,0,1,2,3,4,5,6,...,11,12,13,14,15,age,hypertension,heart_disease,avg_glucose_level,bmi
2284,1.0,0.0,0.666667,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.633789,1.0,0.0,0.633598,0.445590
3772,1.0,0.0,0.333333,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.975586,0.0,0.0,0.650725,0.237113
3628,1.0,1.0,0.666667,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.890137,1.0,0.0,0.255101,0.183276
1467,0.0,1.0,0.000000,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.096680,0.0,0.0,0.242683,0.114548
1624,1.0,1.0,0.666667,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.621582,0.0,0.0,0.050734,0.183276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,1.0,0.0,0.000000,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.450684,0.0,0.0,0.101468,0.522337
2844,1.0,0.0,1.000000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.426270,0.0,0.0,0.004616,0.159221
1876,1.0,0.0,0.666667,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.572754,0.0,0.0,0.393639,0.174112
1943,0.0,1.0,0.333333,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.414062,0.0,0.0,0.064491,0.304696


Other things that could be done: 
1. create new features
2. look for the best correlations
3. VERY IMPORTANT: check for outliers before using minmaxscaler because they affect a lot!
4. drop useless things ( like ID or the only one with gender "other")

## Model fitting and score


We can look for different models and how they perform, we will do so with a for loop  

In [42]:
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

models = dict()
models['Logistic Regression']= LogisticRegression()
models['Kernel SVM'] =SVC()
models['KNN']= KNeighborsClassifier()
models['Gaussian NB']= GaussianNB()
models['Decision Tree Classifier']=DecisionTreeClassifier()
models['Random Forest']= RandomForestClassifier()
models['XGBClassifier']= XGBClassifier()

for model in models:
    models[model].fit(XX_train,train_y)
    print(model+":"+str(metrics.accuracy_score(val_y, models[model].predict(XX_valid))))

Logistic Regression:0.952054794520548
Kernel SVM:0.952054794520548
KNN:0.952054794520548
Gaussian NB:0.1917808219178082
Decision Tree Classifier:0.9187866927592955
Random Forest:0.9461839530332681
XGBClassifier:0.9481409001956947


KNN, Logistic Regression and Kernel SVM seems to have performed the best so we can pick one of them as our final model.

Indeed now we have a well functioning Mchine Learning Classifier model, but we could get to even better results by tuning hyperparameters of the best models.