In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE



import plotly
import plotly.express as px

# import warnings
import warnings
# filter warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
import os

In [155]:
# Read the raw stroke dataset, then take an independent deep copy so the
# cleaning steps can work on `data` while `origin_data` stays as loaded.
origin_data = pd.read_csv(filepath_or_buffer='../dataset/healthcare-dataset-stroke-data.csv')
data = origin_data.copy(deep=True)

In [156]:
# Preview the first rows of the raw data as loaded from the CSV.
origin_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [157]:
# Distinct values present in the raw `ever_married` column.
# (The variable is called `work_types`, but it holds marital-status values.)
work_types = pd.unique(origin_data['ever_married'])
print(work_types)

['Yes' 'No']


In [158]:
# Remove the `id` column, which is treated here as having no correlation
# with the target variable.
data = data.drop(columns='id')

In [159]:
# Replace missing BMI values with the column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `data['bmi']`: that chained in-place form
# operates on an intermediate Series, emits a FutureWarning in pandas 2.x and
# leaves `data` unchanged once Copy-on-Write is enabled.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [160]:
# Preview the working frame after dropping `id` and filling missing BMI values.
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [161]:
# Summarise the columns, non-null counts and dtypes of the working frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [162]:
# Binary-encode gender, residence type and marital status:
# 1 for 'Male' / 'Urban' / 'Yes' respectively, 0 for any other value.
data = data.assign(
    gender=data['gender'].eq('Male').astype('int64'),
    Residence_type=data['Residence_type'].eq('Urban').astype('int64'),
    ever_married=data['ever_married'].eq('Yes').astype('int64'),
)
# Keep only the observations whose smoking status is known.
data = data.loc[data['smoking_status'].ne('Unknown')]

In [163]:
# One-hot encode `smoking_status` and `work_type`, then remove the original
# string columns from the main frame.
categorical_columns = ['smoking_status', 'work_type']
data_dummies = pd.get_dummies(data[categorical_columns])
data = data.drop(columns=categorical_columns)

In [164]:
# Inspect the one-hot encoded smoking_status / work_type columns.
data_dummies

Unnamed: 0,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,True,False,False,False,False,True,False,False
1,False,True,False,False,False,False,True,False
2,False,True,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False
4,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
5102,False,True,False,False,False,True,False,False
5105,False,True,False,False,False,True,False,False
5106,False,True,False,False,False,False,True,False
5107,False,True,False,False,False,False,True,False


In [165]:
# Set the target column aside, then attach the one-hot columns to the
# remaining features by matching rows on the index.
data_stroke = data['stroke']
data = data.drop(columns='stroke').merge(
    data_dummies, how='left', left_index=True, right_index=True
)

In [166]:
# Display the one-hot encoded columns.
# NOTE(review): this repeats the display from the cell right after the
# encoding step; the merged `data` frame may have been the intended
# object to inspect here — confirm.
data_dummies

Unnamed: 0,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,True,False,False,False,False,True,False,False
1,False,True,False,False,False,False,True,False
2,False,True,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False
4,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
5102,False,True,False,False,False,True,False,False
5105,False,True,False,False,False,True,False,False
5106,False,True,False,False,False,False,True,False
5107,False,True,False,False,False,False,True,False


In [167]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    data_stroke,
    test_size=0.2,
    random_state=16,
)

In [168]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
968,0,55.0,0,0,1,0,112.47,32.800000,False,False,True,False,False,True,False,False
4712,1,57.0,0,0,1,1,93.04,29.200000,False,True,False,False,False,True,False,False
2417,0,34.0,0,0,1,1,90.55,30.000000,False,True,False,True,False,False,False,False
522,1,40.0,0,0,1,0,89.77,28.893237,False,False,True,False,False,True,False,False
2181,0,17.0,0,0,0,0,70.03,23.100000,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1879,0,68.0,1,0,1,1,95.82,28.600000,False,True,False,False,False,True,False,False
784,0,58.0,0,0,1,1,76.99,29.000000,False,True,False,False,False,True,False,False
3069,0,60.0,1,1,1,1,220.24,36.800000,False,True,False,False,False,True,False,False
4740,1,60.0,0,0,1,0,234.45,36.800000,True,False,False,False,False,True,False,False


In [169]:
# Inspect the training labels produced by the split.
Y_train

968     0
4712    0
2417    0
522     0
2181    0
       ..
1879    0
784     0
3069    0
4740    0
3902    0
Name: stroke, Length: 2852, dtype: int64

In [170]:
# Rebalance the training split with SMOTE; the test split is left untouched.
# The labels are converted with `np.asarray` rather than `Series.ravel()`,
# which is deprecated in recent pandas releases. `np.asarray` also accepts an
# ndarray, so the cell still works if it is re-run after `Y_train` has
# already been replaced by the resampled array.
sm = SMOTE(random_state=4)
X_train, Y_train = sm.fit_resample(X_train, np.asarray(Y_train))

In [171]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with scikit-learn's default hyperparameters.
# NOTE(review): no feature scaling is applied before fitting and warnings are
# silenced at the top of the notebook, so a convergence warning would go
# unnoticed — consider checking `classifier.n_iter_` after fitting.
classifier = LogisticRegression()

In [172]:
from sklearn.impute import SimpleImputer

In [173]:
# Learn the per-column means from the training features only, then apply
# those same means to the test features. Calling `fit_transform` on X_test
# would re-fit the imputer on the test set, so train and test would be filled
# with different statistics and test-set information would shape the
# preprocessing.
imputer = SimpleImputer(strategy='mean')
xtrain_imputed = imputer.fit_transform(X_train)
xtest_imputed = imputer.transform(X_test)

In [174]:
# Fit the classifier on the imputed training features and their labels.
classifier.fit(xtrain_imputed,Y_train)

In [175]:
# Predict a class label for every row of the imputed test features.
predicted = classifier.predict(xtest_imputed)

In [176]:
# Display the predicted labels for the test set.
predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,

In [184]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=predicted)

0.8781512605042017

In [178]:
from sklearn.metrics import confusion_matrix

In [179]:
# Confusion matrix of true vs. predicted labels on the test set
# (scikit-learn lays it out with rows = true class, columns = predicted class).
confusion_matrix(Y_test, predicted)

array([[615,  54],
       [ 33,  12]], dtype=int64)

In [180]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [181]:
# Error metrics computed on the predicted class labels.
# For 0/1 labels each absolute error equals its square, so MAE and MSE are
# both the misclassification rate (1 - accuracy).
mae = mean_absolute_error(Y_test, predicted)
mse = mean_squared_error(Y_test, predicted)
# RMSE is derived from the MSE directly: the `squared=False` argument of
# `mean_squared_error` is deprecated in scikit-learn 1.4 and rejected by
# later releases.
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, predicted)

print("Regression Report:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)

Regression Report:
Mean Absolute Error (MAE): 0.12184873949579832
Mean Squared Error (MSE): 0.12184873949579832
Root Mean Squared Error (RMSE): 0.3490683879926659
R-squared (R2): -1.0633781763826602


In [182]:
import pickle

# Serialise the fitted logistic regression model to a pickle file in the
# current working directory; the `with` block closes the file afterwards.
with open('./logistic_reg_model2.sav', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [183]:
# Reload the saved model to confirm the pickle file can be read back.
# The context manager closes the file handle, which a bare `open(...)` passed
# straight to `pickle.load` never does.
# NOTE: only unpickle files you created yourself; `pickle.load` can execute
# arbitrary code embedded in the file.
with open('logistic_reg_model2.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model