# Preparation

In [1]:
import pandas as pd
import numpy as np 
import warnings
# NOTE(review): with no category/module argument this hides every warning for
# the rest of the session, including deprecation notices from the libraries
# used below. Consider narrowing the filter to the specific warning intended.
warnings.filterwarnings("ignore")

In [2]:
import os
from pathlib import Path

# Location of the stroke CSV. Set the STROKE_DATA_PATH environment variable to
# run the notebook on another machine; the fallback is the path this notebook
# was originally written with.
DATA_PATH = Path(os.environ.get(
    "STROKE_DATA_PATH",
    r"D:\Learn\Uni\ML\CapstoneProject\data\healthcare-dataset-stroke-data.csv",
))

# 'id' is dropped before modelling (presumably a row identifier with no
# predictive value — confirm against the dataset description).
df = pd.read_csv(DATA_PATH).drop(columns='id')

# Fill missing BMI values with the column mean.
# NOTE(review): the mean is computed over all rows, i.e. before any train/test
# split, so held-out rows contribute to the imputed value. Fit the imputer on
# the training partition only if a strictly leak-free evaluation is required.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.891862,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Preprocessing

Label-encode the categorical columns

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Text-valued columns that must become integer codes before modelling.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# One encoder is re-fitted per column, so after the loop `le` only remembers
# the classes of the last column processed.
le = LabelEncoder()
for column in categorical_columns:
    df[column] = le.fit_transform(df[column])

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,0,61.0,0,0,1,3,0,202.21,28.891862,2,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1


Oversampling the minority class with SMOTE

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# Predictor columns fed to the model; the target column 'stroke' is excluded.
features = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
]

In [6]:
# Separate the predictor matrix from the target column.
target_column = 'stroke'
X, y = df[features], df[target_column]

In [7]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

split dataset into training set and testing set

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
count = y_test.value_counts().get(0,0)
print(count)


(7771, 10) (1943, 10) (7771,) (1943,)
963


# Training model

In [9]:
from sklearn import tree

In [10]:
# Baseline decision tree with default hyperparameters. The seed fixes the
# feature permutation used when choosing splits, so repeated runs build the
# same tree.
model = tree.DecisionTreeClassifier(random_state=42)

In [11]:
# Fit the tree on the training partition; as the cell's last expression, the
# fitted estimator is also displayed.
model.fit(X_train, y_train)

In [12]:
# Accuracy on the data the tree was fitted on, as a percentage. This measures
# how well the tree memorised the training rows, not how it generalises.
train_accuracy_pct = model.score(X_train, y_train) * 100
print("Training accuracy on Decision Tree: ", train_accuracy_pct)

Training accuracy on Decision Tree:  100.0


In [13]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

# Predict once and reuse the result; every metric below needs the same
# predictions, so there is no reason to run the model four times.
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
# 10-fold cross-validation on the training partition.
# NOTE(review): X_train contains SMOTE-generated rows, so folds share synthetic
# neighbours and this mean is likely optimistic; resampling inside each fold
# (an imblearn Pipeline) would give a cleaner estimate.
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
# Accuracy = correctly classified samples (matrix diagonal) / all test samples.
test_accuracy = cm.trace() / cm.sum()
print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", test_accuracy*100, "%", sep=' ')
print(f'K-Fold Validation Mean Accuracy: {accuracies.mean()*100:.2f}%')
print(f'Precision Score: {precision_score(y_test, y_pred)*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, y_pred)*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, y_pred)*100:.2f}%')
print()


Confusion Matrix: 
 [[869  94]
 [ 77 903]]
Model testing accuracy: 91.19917653113741 %
K-Fold Validation Mean Accuracy: 90.46%
Precision Score: 90.57%
Recall Score: 92.14%
F1 Score: 91.35%



Hyperparameter tuning with grid search

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Search space for the tree. 'auto' is intentionally not listed under
# max_features: for DecisionTreeClassifier it was an alias of 'sqrt', was
# deprecated in scikit-learn 1.1 and removed in 1.3, where any candidate using
# it fails to fit.
param_grid = {
    'criterion' : ['gini' , 'entropy','log_loss'],
    'splitter' : ['best','random'],
    'max_features' : ['sqrt', 'log2'],
}

# Seed the estimator so candidates are compared on the same random choices and
# the selected parameters are reproducible between runs.
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy')
grid_search.fit(X_train, y_train)
# Parameter combination with the best mean cross-validated accuracy.
best_params = grid_search.best_params_
print(best_params)

{'criterion': 'log_loss', 'max_features': 'log2', 'splitter': 'best'}


In [15]:
# Build the tuned tree from the parameters the grid search just selected,
# rather than from values copied by hand from an earlier run's output (they go
# stale whenever the search picks a different combination).
model = tree.DecisionTreeClassifier(**best_params, random_state=42)

In [16]:
# Fit the tuned tree on the training partition; as the cell's last expression,
# the fitted estimator is also displayed.
model.fit(X_train, y_train)

# Score

In [17]:
# Accuracy of the tuned tree on its own training data, as a percentage.
tuned_train_accuracy_pct = model.score(X_train, y_train) * 100
print("Training accuracy on Decision Tree: ", tuned_train_accuracy_pct)

Training accuracy on Decision Tree:  100.0


In [18]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

# Predict once with the tuned tree and reuse the result for every metric
# instead of calling model.predict(X_test) separately for each one.
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
# 10-fold cross-validation on the training partition.
# NOTE(review): X_train contains SMOTE-generated rows, so folds share synthetic
# neighbours and this mean is likely optimistic.
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
# Accuracy = correctly classified samples (matrix diagonal) / all test samples.
test_accuracy = cm.trace() / cm.sum()
print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", test_accuracy*100, "%", sep=' ')
print(f'K-Fold Validation Mean Accuracy: {accuracies.mean()*100:.2f}%')
print(f'Precision Score: {precision_score(y_test, y_pred)*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, y_pred)*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, y_pred)*100:.2f}%')
print()


Confusion Matrix: 
 [[852 111]
 [ 87 893]]
Model testing accuracy: 89.80957282552754 %
K-Fold Validation Mean Accuracy: 90.16%
Precision Score: 88.94%
Recall Score: 91.12%
F1 Score: 90.02%

