In [1698]:
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import os 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [1699]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [1700]:
val = df.mean(numeric_only=True)
val

id                   36517.829354
age                     43.226614
hypertension             0.097456
heart_disease            0.054012
avg_glucose_level      106.147677
bmi                     28.893237
stroke                   0.048728
dtype: float64

In [1701]:
df.fillna(val, inplace=True)
df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

# Pre-processing

In [1702]:
df['gender'] = df['gender'].map({"Male":1, "Female": 0, "Other": 2})
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,0,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,1,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,0,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,0,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [1703]:
df['ever_married'] = df['ever_married'].map({"Yes":1, "No": 0})
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,0,61.0,0,0,1,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,1,80.0,0,1,1,Private,Rural,105.92,32.5,never smoked,1
3,60182,0,49.0,0,0,1,Private,Urban,171.23,34.4,smokes,1
4,1665,0,79.0,1,0,1,Self-employed,Rural,174.12,24.0,never smoked,1


In [1704]:
df["Residence_type"] = df["Residence_type"].map({"Urban":1, "Rural": 0})

In [1705]:
work = df["work_type"].values.reshape(-1,1)
work

array([['Private'],
       ['Self-employed'],
       ['Private'],
       ...,
       ['Self-employed'],
       ['Private'],
       ['Govt_job']], dtype=object)

In [1706]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
work = onehotencoder.fit_transform(work).toarray()
work

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [1707]:
onehotencoder.categories_

[array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
       dtype=object)]

In [1708]:
categories = onehotencoder.categories_[0]
categories

array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
      dtype=object)

In [1709]:
df_ohe = pd.DataFrame(work, columns=categories)
df_ohe.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0


In [1710]:
df = pd.concat([df_ohe, df], axis=1)
df.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0.0,0.0,1.0,0.0,0.0,9046,1,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,0.0,0.0,0.0,1.0,0.0,51676,0,61.0,0,0,1,Self-employed,0,202.21,28.893237,never smoked,1
2,0.0,0.0,1.0,0.0,0.0,31112,1,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
3,0.0,0.0,1.0,0.0,0.0,60182,0,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
4,0.0,0.0,0.0,1.0,0.0,1665,0,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1


In [1711]:
df.groupby("smoking_status").size()

smoking_status
Unknown            1544
formerly smoked     885
never smoked       1892
smokes              789
dtype: int64

In [1712]:
df = pd.get_dummies(df, columns=["smoking_status"], prefix_sep='_')

In [1713]:
df.columns = df.columns.str.replace(" ", "_")

In [1714]:
df.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes
0,0.0,0.0,1.0,0.0,0.0,9046,1,67.0,0,1,1,Private,1,228.69,36.6,1,False,True,False,False
1,0.0,0.0,0.0,1.0,0.0,51676,0,61.0,0,0,1,Self-employed,0,202.21,28.893237,1,False,False,True,False
2,0.0,0.0,1.0,0.0,0.0,31112,1,80.0,0,1,1,Private,0,105.92,32.5,1,False,False,True,False
3,0.0,0.0,1.0,0.0,0.0,60182,0,49.0,0,0,1,Private,1,171.23,34.4,1,False,False,False,True
4,0.0,0.0,0.0,1.0,0.0,1665,0,79.0,1,0,1,Self-employed,0,174.12,24.0,1,False,False,True,False


# Scaling

In [1715]:
age = df.pop('age')

In [1716]:
gluc = df.pop('avg_glucose_level')

In [1717]:
bmi = df.pop('bmi')

In [1718]:
df_num = pd.concat([age, gluc, bmi], axis=1)
df_num.head()

Unnamed: 0,age,avg_glucose_level,bmi
0,67.0,228.69,36.6
1,61.0,202.21,28.893237
2,80.0,105.92,32.5
3,49.0,171.23,34.4
4,79.0,174.12,24.0


In [1719]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

df_num_scaled = sc.fit_transform(df_num)
df_num_scaled = pd.DataFrame(df_num_scaled, columns=df_num.columns)

In [1720]:
df_num_scaled.head(5)

Unnamed: 0,age,avg_glucose_level,bmi
0,1.051434,2.706375,1.001234
1,0.78607,2.121559,4.615554e-16
2,1.62639,-0.005028,0.4685773
3,0.255342,1.437358,0.7154182
4,1.582163,1.501184,-0.6357112


In [1721]:
df1 = pd.concat([df, df_num_scaled], axis=1)
df1.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,stroke,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,age,avg_glucose_level,bmi
0,0.0,0.0,1.0,0.0,0.0,9046,1,0,1,1,Private,1,1,False,True,False,False,1.051434,2.706375,1.001234
1,0.0,0.0,0.0,1.0,0.0,51676,0,0,0,1,Self-employed,0,1,False,False,True,False,0.78607,2.121559,4.615554e-16
2,0.0,0.0,1.0,0.0,0.0,31112,1,0,1,1,Private,0,1,False,False,True,False,1.62639,-0.005028,0.4685773
3,0.0,0.0,1.0,0.0,0.0,60182,0,0,0,1,Private,1,1,False,False,False,True,0.255342,1.437358,0.7154182
4,0.0,0.0,0.0,1.0,0.0,1665,0,1,0,1,Self-employed,0,1,False,False,True,False,1.582163,1.501184,-0.6357112


In [1722]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Govt_job                        5110 non-null   float64
 1   Never_worked                    5110 non-null   float64
 2   Private                         5110 non-null   float64
 3   Self-employed                   5110 non-null   float64
 4   children                        5110 non-null   float64
 5   id                              5110 non-null   int64  
 6   gender                          5110 non-null   int64  
 7   hypertension                    5110 non-null   int64  
 8   heart_disease                   5110 non-null   int64  
 9   ever_married                    5110 non-null   int64  
 10  work_type                       5110 non-null   object 
 11  Residence_type                  5110 non-null   int64  
 12  stroke                          51

In [1723]:
df1.isna().sum()

Govt_job                          0
Never_worked                      0
Private                           0
Self-employed                     0
children                          0
id                                0
gender                            0
hypertension                      0
heart_disease                     0
ever_married                      0
work_type                         0
Residence_type                    0
stroke                            0
smoking_status_Unknown            0
smoking_status_formerly_smoked    0
smoking_status_never_smoked       0
smoking_status_smokes             0
age                               0
avg_glucose_level                 0
bmi                               0
dtype: int64

In [1724]:
df_fin = df1.drop(['id', 'work_type'], axis=1)

In [1725]:
df_fin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Govt_job                        5110 non-null   float64
 1   Never_worked                    5110 non-null   float64
 2   Private                         5110 non-null   float64
 3   Self-employed                   5110 non-null   float64
 4   children                        5110 non-null   float64
 5   gender                          5110 non-null   int64  
 6   hypertension                    5110 non-null   int64  
 7   heart_disease                   5110 non-null   int64  
 8   ever_married                    5110 non-null   int64  
 9   Residence_type                  5110 non-null   int64  
 10  stroke                          5110 non-null   int64  
 11  smoking_status_Unknown          5110 non-null   bool   
 12  smoking_status_formerly_smoked  51

In [1726]:
col = df_fin.pop('stroke')
df_fin.insert(17, "stroke", col)
df_fin.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,gender,hypertension,heart_disease,ever_married,Residence_type,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,age,avg_glucose_level,bmi,stroke
0,0.0,0.0,1.0,0.0,0.0,1,0,1,1,1,False,True,False,False,1.051434,2.706375,1.001234,1
1,0.0,0.0,0.0,1.0,0.0,0,0,0,1,0,False,False,True,False,0.78607,2.121559,4.615554e-16,1
2,0.0,0.0,1.0,0.0,0.0,1,0,1,1,0,False,False,True,False,1.62639,-0.005028,0.4685773,1
3,0.0,0.0,1.0,0.0,0.0,0,0,0,1,1,False,False,False,True,0.255342,1.437358,0.7154182,1
4,0.0,0.0,0.0,1.0,0.0,0,1,0,1,0,False,False,True,False,1.582163,1.501184,-0.6357112,1


Swap df_fin and df

In [1727]:
df_temp = df
df = df_fin
df_fin = df_temp

In [1728]:
df.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,gender,hypertension,heart_disease,ever_married,Residence_type,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,age,avg_glucose_level,bmi,stroke
0,0.0,0.0,1.0,0.0,0.0,1,0,1,1,1,False,True,False,False,1.051434,2.706375,1.001234,1
1,0.0,0.0,0.0,1.0,0.0,0,0,0,1,0,False,False,True,False,0.78607,2.121559,4.615554e-16,1
2,0.0,0.0,1.0,0.0,0.0,1,0,1,1,0,False,False,True,False,1.62639,-0.005028,0.4685773,1
3,0.0,0.0,1.0,0.0,0.0,0,0,0,1,1,False,False,False,True,0.255342,1.437358,0.7154182,1
4,0.0,0.0,0.0,1.0,0.0,0,1,0,1,0,False,False,True,False,1.582163,1.501184,-0.6357112,1


# Pre-processing

In [1729]:
X = pd.DataFrame(df.iloc[:, :-1].values, columns=df.columns[:-1])
y = pd.DataFrame(df.iloc[:, -1].values, columns=[df.columns[-1]])

In [1730]:
X.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,gender,hypertension,heart_disease,ever_married,Residence_type,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,age,avg_glucose_level,bmi
0,0.0,0.0,1.0,0.0,0.0,1,0,1,1,1,False,True,False,False,1.051434,2.706375,1.001234
1,0.0,0.0,0.0,1.0,0.0,0,0,0,1,0,False,False,True,False,0.78607,2.121559,0.0
2,0.0,0.0,1.0,0.0,0.0,1,0,1,1,0,False,False,True,False,1.62639,-0.005028,0.468577
3,0.0,0.0,1.0,0.0,0.0,0,0,0,1,1,False,False,False,True,0.255342,1.437358,0.715418
4,0.0,0.0,0.0,1.0,0.0,0,1,0,1,0,False,False,True,False,1.582163,1.501184,-0.635711


In [1731]:
y.head()

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1


# Split train & test

In [1732]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [1733]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [1734]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1735]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7777, 17) (1945, 17) (7777, 1) (1945, 1)


Grid Search

In [1736]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha' : [1.0, 1.25, 1.5, 1.75, 2.0],
    'binarize' : [0.0, 0.25, 0.5, 0.75, 1.0],
}

grid_search = GridSearchCV(estimator = BernoulliNB(),
                           param_grid = param_grid,
                           cv = 5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(best_params)

{'alpha': 1.0, 'binarize': 0.5}


In [1737]:
model = BernoulliNB(alpha = 1.0, binarize = 0.5)

In [1738]:
model.fit(X_train, y_train)

In [1739]:
print("Training accuracy: ", model.score(X_train, y_train)*100)

Training accuracy:  75.9033046161759


In [1740]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(X_test))
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
y_pred = model.predict(X_test)

print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')
print(f'K-Fold Validation Mean Accuracy: {accuracies.mean()*100:.2f}%')
print(f'Precision Score: {precision_score(y_test, y_pred)*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, y_pred)*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, y_pred)*100:.2f}%')
print()

Confusion Matrix: 
 [[661 314]
 [152 818]]
Model testing accuracy: 76.04113110539846 %
K-Fold Validation Mean Accuracy: 75.93%
Precision Score: 72.26%
Recall Score: 84.33%
F1 Score: 77.83%



In [1741]:
from sklearn.metrics import classification_report


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.68      0.74       975
           1       0.72      0.84      0.78       970

    accuracy                           0.76      1945
   macro avg       0.77      0.76      0.76      1945
weighted avg       0.77      0.76      0.76      1945

