In [1]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("D:\\Learn\\Uni\\ML\\CapstoneProject\\data\\healthcare-dataset-stroke-data.csv")
df = df.drop('id', axis=1)
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

# Preprocessing

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
df.head(15)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.891862,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,Female,59.0,0,0,Yes,Private,Rural,76.15,28.891862,Unknown,1
9,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [5]:
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['ever_married'] = le.fit_transform(df['ever_married'])
df['work_type'] = le.fit_transform(df['work_type'])
df['Residence_type'] = le.fit_transform(df['Residence_type'])
df['smoking_status'] = le.fit_transform(df['smoking_status'])

In [6]:
df.head(15)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,0,61.0,0,0,1,3,0,202.21,28.891862,2,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1
6,1,74.0,1,1,1,2,0,70.09,27.4,2,1
7,0,69.0,0,0,0,2,1,94.39,22.8,2,1
8,0,59.0,0,0,1,2,0,76.15,28.891862,0,1
9,0,78.0,0,0,1,2,1,58.57,24.2,0,1


In [7]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status']

In [8]:
y = df['stroke']
X = df[features]

In [9]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7771, 10) (1943, 10) (7771,) (1943,)


In [11]:
from xgboost import XGBClassifier
model = XGBClassifier()


In [12]:
model.fit(X_train, y_train)

In [13]:
print("Training accuracy on XGBoost: ", model.score(X_train, y_train)*100)

Training accuracy on XGBoost:  99.26650366748166


In [14]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(X_test))
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')
print(f'K-Fold Validation Mean Accuracy: {accuracies.mean()*100:.2f}%')
print(f'Precision Score: {precision_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, model.predict(X_test))*100:.2f}%')
print()

Confusion Matrix: 
 [[920  65]
 [ 40 918]]
Model testing accuracy: 94.59598558929491 %
K-Fold Validation Mean Accuracy: 94.27%
Precision Score: 93.39%
Recall Score: 95.82%
F1 Score: 94.59%



In [15]:
from sklearn.model_selection import GridSearchCV, KFold

param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.5],
    'booster': ['gbtree', 'gblinear'],
    'gamma': [0, 0.5, 1],
    'reg_lambda': [0, 0.5, 1],
    # 'base_score': [0.2, 0.5, 1]
}

grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param_grid=param_grid, cv=KFold(n_splits=3))
grid_search.fit(X_train, y_train)
best = grid_search.best_params_

print(best)

{'booster': 'gbtree', 'gamma': 0, 'learning_rate': 0.1, 'n_estimators': 1000, 'reg_lambda': 1}


In [16]:
model = XGBClassifier(n_estimators=1000, learning_rate=0.1, booster='gbtree', gamma=0, reg_lambda=0, n_jobs=-1)
model.fit(X_train, y_train)

In [17]:
print("Training accuracy on XGBoost: ", model.score(X_train, y_train)*100)

Training accuracy on XGBoost:  100.0


In [18]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(X_test))

print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')
print(f'Precision Score: {precision_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, model.predict(X_test))*100:.2f}%')
print()

Confusion Matrix: 
 [[924  61]
 [ 34 924]]
Model testing accuracy: 95.11065362840968 %
Precision Score: 93.81%
Recall Score: 96.45%
F1 Score: 95.11%

