In [1]:
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import os 
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
file = "D:\\Learn\\Uni\\ML\\CapstoneProject\\data\\healthcare-dataset-stroke-data.csv"
df = pd.read_csv(file)

In [3]:
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

Pre Processing

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [6]:
LE = LabelEncoder()
df['gender'] = LE.fit_transform(df['gender'])
df['ever_married'] = LE.fit_transform(df['ever_married'])
df['work_type'] = LE.fit_transform(df['work_type'])
df['Residence_type'] = LE.fit_transform(df['Residence_type'])
df['smoking_status'] = LE.fit_transform(df['smoking_status'])

In [7]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.891862,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [8]:
X = df.iloc[:,1:-1].values
y = df.iloc[:,-1].values

print(X.shape, y.shape, sep='\n')

(5104, 10)
(5104,)


In [9]:
from imblearn.over_sampling import SMOTE

In [10]:
sm = SMOTE(random_state=8)
X_bal, y_bal = sm.fit_resample(X, y)


Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=8)

Smote

Creating the Model Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
model = LogisticRegression()
model.fit(X_train,y_train)

In [15]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(X_test))

print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')

print(f'Precision Score: {precision_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, model.predict(X_test))*100:.2f}%')
print()

Confusion Matrix: 
 [[720 241]
 [208 774]]
Model testing accuracy: 76.89140504374679 %
Precision Score: 76.26%
Recall Score: 78.82%
F1 Score: 77.52%



Find Tunes with GridSearchCV

In [16]:
from sklearn.model_selection import GridSearchCV 

param_grid = {
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best=grid_search.best_params_
print(best)

{'solver': 'liblinear'}


Test Model

In [17]:
model = LogisticRegression(solver="lbfgs", max_iter=200,multi_class="auto", penalty="l2")
model.fit(X_train,y_train)

In [18]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(X_test))

print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')

print(f'Precision Score: {precision_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, model.predict(X_test))*100:.2f}%')
print()

Confusion Matrix: 
 [[717 244]
 [227 755]]
Model testing accuracy: 75.75913535769429 %
Precision Score: 75.58%
Recall Score: 76.88%
F1 Score: 76.22%



In [19]:
from sklearn.metrics import accuracy_score, recall_score ,precision_score, f1_score
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [20]:
print("Model Accuracy on Testing Data: ", model.score(X_test,y_test))

Model Accuracy on Testing Data:  0.7575913535769428


In [21]:
test_preds = model.predict(X_test)
threshold = 0.25
test_preds = [1. if i > threshold else 0. for i in test_preds]
test_acc = accuracy_score(y_test, test_preds)
test_rec = recall_score(y_test, test_preds)
test_pre = precision_score(y_test, test_preds)
print('Test Set Metrics')
print('Model Accuracy:', test_acc)
print('Model Recall:', test_rec)
print('Model Precision:', test_pre)

Test Set Metrics
Model Accuracy: 0.7575913535769428
Model Recall: 0.7688391038696538
Model Precision: 0.7557557557557557
