## Importing Libraries

In [17]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Reading data

In [18]:
data_preprocessed = pd.read_csv('preprocessed.csv')

In [19]:
data_preprocessed.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,0,79.0,1,0,1,3,0,174.12,24.0,2,1
4,1,81.0,0,0,1,2,1,186.21,29.0,1,1
5,1,74.0,1,1,1,2,0,70.09,27.4,2,1
6,0,69.0,0,0,0,2,1,94.39,22.8,2,1
7,0,78.0,0,0,1,2,1,58.57,24.2,0,1
8,0,81.0,1,0,1,2,0,80.43,29.7,2,1
9,0,61.0,0,1,1,0,0,120.46,36.8,3,1


## Declaring Variables

In [20]:
y = data_preprocessed['stroke']
x = data_preprocessed.drop(['stroke'],axis=1)

## Scaling

In [21]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
std.fit(x)
x_scaled = std.transform(x)

## Train - Test Split

In [22]:
# Import the module for the split
from sklearn.model_selection import train_test_split

# Split the variables with an 80-20 split and some random state
# To have the same split as mine, use random_state = 365
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=365)

## Model testing on 8 differnt models

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score

In [26]:
from imblearn.over_sampling import SMOTE

In [27]:
sm = SMOTE(random_state=2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [28]:
models = []
models.append(['Logistic Regreesion', LogisticRegression(random_state=0)])
models.append(['SVM', SVC(random_state=0)])
models.append(['KNeighbors', KNeighborsClassifier()])
models.append(['GaussianNB', GaussianNB()])
models.append(['BernoulliNB', BernoulliNB()])
models.append(['Decision Tree', DecisionTreeClassifier(random_state=0)])
models.append(['Random Forest', RandomForestClassifier(random_state=0)])
models.append(['XGBoost', XGBClassifier(eval_metric= 'error')])

lst_1= []

for m in range(len(models)):
    lst_2= []
    model = models[m][1]
    model.fit(x_train_res, y_train_res)
    y_pred = model.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)  #Confusion Matrix
    accuracies = cross_val_score(estimator = model, X = x_train_res, y = y_train_res, cv = 10)   #K-Fold Validation
    roc = roc_auc_score(y_test, y_pred)  #ROC AUC Score
    precision = precision_score(y_test, y_pred)  #Precision Score
    recall = recall_score(y_test, y_pred)  #Recall Score
    f1 = f1_score(y_test, y_pred)  #F1 Score
    print(models[m][0],':')
    print(cm)
    print('Accuracy Score: ',accuracy_score(y_test, y_pred))
    print('')
    print("K-Fold Validation Mean Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print('')
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    print('')
    print('ROC AUC Score: {:.2f}'.format(roc))
    print('')
    print('Precision: {:.2f}'.format(precision))
    print('')
    print('Recall: {:.2f}'.format(recall))
    print('')
    print('F1: {:.2f}'.format(f1))
    print('-----------------------------------')
    print('')
    lst_2.append(models[m][0])
    lst_2.append((accuracy_score(y_test, y_pred))*100) 
    lst_2.append(accuracies.mean()*100)
    lst_2.append(accuracies.std()*100)
    lst_2.append(roc)
    lst_2.append(precision)
    lst_2.append(recall)
    lst_2.append(f1)
    lst_1.append(lst_2)


Logistic Regreesion :
[[707 235]
 [  8  32]]
Accuracy Score:  0.7525458248472505

K-Fold Validation Mean Accuracy: 78.20 %

Standard Deviation: 1.21 %

ROC AUC Score: 0.78

Precision: 0.12

Recall: 0.80

F1: 0.21
-----------------------------------

SVM :
[[749 193]
 [ 16  24]]
Accuracy Score:  0.7871690427698574

K-Fold Validation Mean Accuracy: 85.49 %

Standard Deviation: 0.82 %

ROC AUC Score: 0.70

Precision: 0.11

Recall: 0.60

F1: 0.19
-----------------------------------

KNeighbors :
[[808 134]
 [ 27  13]]
Accuracy Score:  0.8360488798370672

K-Fold Validation Mean Accuracy: 91.39 %

Standard Deviation: 1.14 %

ROC AUC Score: 0.59

Precision: 0.09

Recall: 0.33

F1: 0.14
-----------------------------------

GaussianNB :
[[705 237]
 [  9  31]]
Accuracy Score:  0.7494908350305499

K-Fold Validation Mean Accuracy: 76.40 %

Standard Deviation: 1.05 %

ROC AUC Score: 0.76

Precision: 0.12

Recall: 0.78

F1: 0.20
-----------------------------------

BernoulliNB :
[[553 389]
 [ 10  30



XGBoost :
[[926  16]
 [ 38   2]]
Accuracy Score:  0.945010183299389

K-Fold Validation Mean Accuracy: 96.61 %

Standard Deviation: 5.55 %

ROC AUC Score: 0.52

Precision: 0.11

Recall: 0.05

F1: 0.07
-----------------------------------



In [29]:
df = pd.DataFrame(lst_1, columns= ['Model', 'Accuracy', 'K-Fold Mean Accuracy', 'Std. Deviation', 'ROC AUC', 'Precision', 'Recall', 'F1'])

In [30]:
df.sort_values(by= ['Accuracy', 'K-Fold Mean Accuracy'], inplace= True, ascending= False)

In [35]:
df

Unnamed: 0,Model,Accuracy,K-Fold Mean Accuracy,Std. Deviation,ROC AUC,Precision,Recall,F1
7,XGBoost,94.501018,96.607555,5.553394,0.516507,0.111111,0.05,0.068966
6,Random Forest,94.297352,97.192431,2.359264,0.527415,0.136364,0.075,0.096774
5,Decision Tree,87.678208,92.747489,3.01532,0.516852,0.054945,0.125,0.076336
2,KNeighbors,83.604888,91.389158,1.13775,0.591375,0.088435,0.325,0.139037
1,SVM,78.716904,85.49365,0.823754,0.697558,0.110599,0.6,0.18677
0,Logistic Regreesion,75.254582,78.200591,1.212671,0.775265,0.11985,0.8,0.208469
3,GaussianNB,74.949084,76.403979,1.054285,0.761704,0.115672,0.775,0.201299
4,BernoulliNB,59.368635,72.983451,1.187622,0.668524,0.071599,0.75,0.130719


#### Looks like Random Forest classifier and XGBoost classifier both gave us very good results. Lets do some hyperparameter tuning to see if we can come to a conclusion

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
grid_models = [(LogisticRegression(),[{'C':[0.25,0.5,0.75,1],'random_state':[0]}]), 
               (KNeighborsClassifier(),[{'n_neighbors':[5,7,8,10], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}]), 
               (SVC(),[{'C':[0.25,0.5,0.75,1],'kernel':['linear', 'rbf'],'random_state':[0]}]), 
               (GaussianNB(),[{'var_smoothing': [1e-09]}]), 
               (BernoulliNB(), [{'alpha': [0.25, 0.5, 1]}]), 
               (DecisionTreeClassifier(),[{'criterion':['gini','entropy'],'random_state':[0]}]), 
               (RandomForestClassifier(),[{'n_estimators':[100,150,200],'criterion':['gini','entropy'],'random_state':[0]}]), 
              (XGBClassifier(), [{'learning_rate': [0.01, 0.05, 0.1], 'eval_metric': ['error']}])]

In [34]:
for i,j in grid_models:
    grid = GridSearchCV(estimator=i,param_grid = j, scoring = 'accuracy',cv = 10)
    grid.fit(x_train_res, y_train_res)
    best_accuracy = grid.best_score_
    best_param = grid.best_params_
    print('{}:\nBest Accuracy : {:.2f}%'.format(i,best_accuracy*100))
    print('Best Parameters : ',best_param)
    print('')
    print('----------------')
    print('')

LogisticRegression():
Best Accuracy : 78.24%
Best Parameters :  {'C': 0.25, 'random_state': 0}

----------------

KNeighborsClassifier():
Best Accuracy : 92.56%
Best Parameters :  {'metric': 'manhattan', 'n_neighbors': 5}

----------------

SVC():
Best Accuracy : 85.49%
Best Parameters :  {'C': 1, 'kernel': 'rbf', 'random_state': 0}

----------------

GaussianNB():
Best Accuracy : 76.40%
Best Parameters :  {'var_smoothing': 1e-09}

----------------

BernoulliNB():
Best Accuracy : 72.98%
Best Parameters :  {'alpha': 0.25}

----------------

DecisionTreeClassifier():
Best Accuracy : 93.80%
Best Parameters :  {'criterion': 'entropy', 'random_state': 0}

----------------

RandomForestClassifier():
Best Accuracy : 97.26%
Best Parameters :  {'criterion': 'entropy', 'n_estimators': 200, 'random_state': 0}

----------------





XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=None, verbosity=None):
Best Accuracy : 95.34%
Best Parameters :  {'eval_metric': 'error', 'learning_rate': 0.1}

----------------



#### Looks like Random forest classifier gave the best accuracy with the given hyperparamemters. So we are going to use the same model for our purpose