Stroke Dataset: Predicting Whether a Person Will Have a Stroke Based on Their Features

In [20]:
#Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

In [22]:
# Load the dataset.
# The path is relative, so the CSV must sit in the notebook's working directory.
DATA_PATH = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)
# Preview the first rows instead of rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'healthcare-dataset-stroke-data.csv'

In [None]:
# Dataset dimensions as (rows, columns)
df.shape

In [None]:
# Column labels of the loaded frame
df.columns

In [None]:
# Summary statistics for the frame's columns
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Drop the 'id' column.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError because 'id' is already gone.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Count the missing values per column
df.isna().sum()

In [None]:
# Handle missing values: replace missing BMI entries with the column median
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)

In [None]:
# Re-check the per-column missing-value counts after the BMI fill
df.isna().sum()

In [None]:
# Find the outliers in the dataset: one box per numeric column on a shared axis
numeric_df = df.select_dtypes(include='number')
melted = pd.melt(numeric_df)  # long format with 'variable' / 'value' columns
ax = sns.boxplot(data=melted, x='variable', y='value')
ax.set_title('Outliers')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
#handling the outliers 
def iqr_cap(df, column, factor=1.5):
    """Cap the values of one column at its IQR fences, modifying ``df`` in place.

    Values below ``Q1 - factor * IQR`` are raised to that bound and values above
    ``Q3 + factor * IQR`` are lowered to it. No rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is overwritten with the capped values.
    column : str
        Name of the numeric column to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    None
        The result is written back to ``df[column]``.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # NOTE: the fences are recomputed from the current data on every call, so
    # calling this again on an already-capped column can tighten the bounds.
    df[column] = df[column].clip(lower, upper)
# Cap the two continuous measurements, glucose first and then BMI
for capped_column in ('avg_glucose_level', 'bmi'):
    iqr_cap(df, capped_column)


In [None]:
# Plot both capped columns side by side to confirm the capping took effect
fig, (ax_glucose, ax_bmi) = plt.subplots(1, 2, figsize=(12, 5))

sns.boxplot(y=df['avg_glucose_level'], ax=ax_glucose)
ax_glucose.set_title("Glucose Level After IQR Capping")

sns.boxplot(y=df['bmi'], ax=ax_bmi)
ax_bmi.set_title("BMI After IQR Capping")

plt.tight_layout()
plt.show()

In [None]:
# Dimensions after outlier handling (capping clips values; it does not drop rows)
df.shape

In [None]:
# Class balance of the target
ax = sns.countplot(data=df, x='stroke')
ax.set_title("Stroke Distribution (0 = No, 1 = Yes)")
plt.show()

In [None]:
# Histogram of age with a KDE overlay
ax = sns.histplot(data=df, x='age', kde=True)
ax.set_title("Age Distribution")
plt.show()

In [None]:
# Stroke vs. no-stroke counts within each gender
ax = sns.countplot(data=df, x='gender', hue='stroke')
ax.set_title("Stroke Count by Gender")
plt.show()

In [None]:
# Stroke vs. no-stroke counts within each smoking status
ax = sns.countplot(data=df, x='smoking_status', hue='stroke')
ax.set_title("Stroke Count by Smoking Status")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Stroke vs. no-stroke counts within each work type
ax = sns.countplot(data=df, x='work_type', hue='stroke')
ax.set_title("Stroke Count by Work Type")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Encode the categorical columns as integer codes.
# LabelEncoder is already imported in the notebook's import cell, so it is not re-imported here.
# Each column keeps its own named encoder because the fitted encoders are saved
# later and reused to encode user input at prediction time.
l_gen = LabelEncoder()
l_evrmd = LabelEncoder()
l_restype = LabelEncoder()
l_worktype = LabelEncoder()
l_smoking = LabelEncoder()

column_encoders = {
    'gender': l_gen,
    'ever_married': l_evrmd,
    'Residence_type': l_restype,
    'work_type': l_worktype,
    'smoking_status': l_smoking,
}
# NOTE: this overwrites the string columns in df; re-running the cell would refit
# the encoders on the integer codes, so run it once per freshly loaded frame.
for column_name, encoder in column_encoders.items():
    df[column_name] = encoder.fit_transform(df[column_name])


In [None]:
# Preview the encoded frame (first rows only rather than the whole table)
df.head()

In [None]:
# Age distribution again, now that the categorical columns are encoded
ax = sns.histplot(data=df, x='age', kde=True)
ax.set_title("Age Distribution")
plt.show()


In [None]:
# Stroke counts per gender; the x axis now shows the integer codes from the label encoder
ax = sns.countplot(data=df, x='gender', hue='stroke')
ax.set_title("Stroke Count by Gender")
plt.show()

In [None]:
# Stroke counts per smoking status; the x axis now shows the integer codes
ax = sns.countplot(data=df, x='smoking_status', hue='stroke')
ax.set_title("Stroke Count by Smoking Status")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Stroke counts per work type; the x axis now shows the integer codes
ax = sns.countplot(data=df, x='work_type', hue='stroke')
ax.set_title("Stroke Count by Work Type")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Distinct work_type values present after label encoding
df['work_type'].unique()

In [None]:
# Split features and target
x = df.drop('stroke', axis=1)
y = df['stroke']

# Split into train and test FIRST so the test set holds only real, untouched rows.
# stratify=y keeps the stroke / no-stroke ratio the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

# Handle imbalance with SMOTE on the training part only. Oversampling before the
# split puts synthetic points interpolated from future test rows into the training
# data (leakage) and fills the test set with synthetic rows, inflating every metric.
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)
x_train, y_train = x_resampled, y_resampled

In [None]:
# Class counts in the held-out test split
y_test.value_counts()

In [None]:
# Standardize features: learn the statistics from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Target distribution in the original (non-resampled) frame
ax = sns.countplot(data=df, x='stroke')
ax.set_title("Stroke Distribution (0 = No, 1 = Yes)")
plt.show()

Logistic Regression

In [None]:
# Tune the regularisation strength C with 5-fold cross-validation, selecting on F1
lr = LogisticRegression()
lr_params = {'C': [0.1, 1, 10]}
lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, scoring='f1', cv=5)
lr_grid.fit(x_train, y_train)

In [None]:
# Keep the best estimator found by the grid search and predict the test split
model_lr = lr_grid.best_estimator_
y_pred_lr = model_lr.predict(x_test)

In [None]:
print("\nLogistics Regression")
y_pred_lr = model_lr.predict(x_test)
y_prob_lr = model_lr.predict_proba(x_test)[:, 1]  # second probability column (class 1)

# (label, metric function, scores passed to it) -- printed in this order
lr_metrics = [
    ("Accuracy:", accuracy_score, y_pred_lr),
    ("Precision:", precision_score, y_pred_lr),
    ("Recall:", recall_score, y_pred_lr),
    ("F1 Score:", f1_score, y_pred_lr),
    ("ROC AUC:", roc_auc_score, y_prob_lr),
]
for label, metric_fn, scores in lr_metrics:
    print(label, metric_fn(y_test, scores))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))


RandomForest Classifier

In [None]:
# Tune the forest size and depth with 5-fold cross-validation, selecting on F1.
# random_state is fixed so the selected (and later saved) model is reproducible
# across Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [100, 150, 200], 'max_depth': [None, 10, 20]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='f1')
rf_grid.fit(x_train, y_train)

In [None]:
# Keep the best forest found by the grid search and predict the test split
model_rf = rf_grid.best_estimator_
y_pred_rf = model_rf.predict(x_test)

In [None]:
print("\nRandomForestClassifier")
y_prob_rf = model_rf.predict_proba(x_test)[:, 1]  # second probability column (class 1)
y_pred_rf = model_rf.predict(x_test)

# (label, metric function, scores passed to it) -- printed in this order
rf_metrics = [
    ("Accuracy:", accuracy_score, y_pred_rf),
    ("Precision:", precision_score, y_pred_rf),
    ("Recall:", recall_score, y_pred_rf),
    ("F1 Score:", f1_score, y_pred_rf),
    ("ROC AUC:", roc_auc_score, y_prob_rf),
]
for label, metric_fn, scores in rf_metrics:
    print(label, metric_fn(y_test, scores))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


DecisionTreeClassifier

In [None]:
# Tune the tree depth with 5-fold cross-validation, selecting on F1.
# random_state is fixed so repeated runs select and fit the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt_params = {'max_depth': [None, 5, 10]}
dt_grid = GridSearchCV(dt, dt_params, cv=5, scoring='f1')
dt_grid.fit(x_train, y_train)

In [None]:
# Keep the best tree found by the grid search and predict the test split
model_dt = dt_grid.best_estimator_
y_pred_dt = model_dt.predict(x_test)


In [None]:
print("\nDecisionTreeClassifier")
y_prob_dt = model_dt.predict_proba(x_test)[:, 1]  # second probability column (class 1)
y_pred_dt = model_dt.predict(x_test)

# (label, metric function, scores passed to it) -- printed in this order
dt_metrics = [
    ("Accuracy:", accuracy_score, y_pred_dt),
    ("Precision:", precision_score, y_pred_dt),
    ("Recall:", recall_score, y_pred_dt),
    ("F1 Score:", f1_score, y_pred_dt),
    ("ROC AUC:", roc_auc_score, y_prob_dt),
]
for label, metric_fn, scores in dt_metrics:
    print(label, metric_fn(y_test, scores))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))


Support Vector Machine

In [None]:
# Tune C for a linear-kernel SVM with 5-fold cross-validation, selecting on F1
svm = SVC()
svm_params = {
    'C': [0.1, 1],
    'kernel': ['linear'],
    'gamma': ['scale'],
}
svm_grid = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='f1', cv=5)
svm_grid.fit(x_train, y_train)

In [None]:
# Keep the best SVM found by the grid search and predict the test split
model_svm = svm_grid.best_estimator_
y_pred_svm = model_svm.predict(x_test)

In [None]:
print("\nSupportVectorMachine")
y_pred_svm = model_svm.predict(x_test)
# ROC AUC needs a continuous score, not hard 0/1 labels. The SVC is created
# without probability=True, so the signed distance to the decision boundary is used.
y_score_svm = model_svm.decision_function(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print("ROC AUC:", roc_auc_score(y_test, y_score_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

XGBoost

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Tune the tree depth of a boosted model with 3-fold cross-validation, selecting on accuracy
xgb = XGBClassifier()
xgb_params = {
    'n_estimators': [100],
    'learning_rate': [0.05],
    'max_depth': [3, 5],
}
xgb_grid = GridSearchCV(estimator=xgb, param_grid=xgb_params, scoring='accuracy', cv=3)
xgb_grid.fit(x_train, y_train)

In [None]:
# Keep the best boosted model found by the grid search and predict the test split
model_xgb = xgb_grid.best_estimator_
y_pred_xgb = model_xgb.predict(x_test)

In [None]:
print("\nXGBoost")
y_pred_xgb = model_xgb.predict(x_test)
# ROC AUC needs a continuous score, not hard 0/1 labels: use the second
# probability column (class 1), as the other model evaluations do.
y_prob_xgb = model_xgb.predict_proba(x_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb))
print("Recall:", recall_score(y_test, y_pred_xgb))
print("F1 Score:", f1_score(y_test, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, y_prob_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))


Naive Bayes

In [None]:
# Gaussian Naive Bayes is fitted directly, with no grid search
model_nb = GaussianNB()
model_nb.fit(X=x_train, y=y_train)

In [None]:

print("\nNaive Bayes")
y_pred_nb = model_nb.predict(x_test)
y_prob_nb = model_nb.predict_proba(x_test)[:, 1]  # second probability column (class 1)

# (label, metric function, scores passed to it) -- printed in this order
nb_metrics = [
    ("Accuracy:", accuracy_score, y_pred_nb),
    ("Precision:", precision_score, y_pred_nb),
    ("Recall:", recall_score, y_pred_nb),
    ("F1 Score:", f1_score, y_pred_nb),
    ("ROC AUC:", roc_auc_score, y_prob_nb),
]
for label, metric_fn, scores in nb_metrics:
    print(label, metric_fn(y_test, scores))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

KNeighbors Classifier

In [None]:
# Tune k and the neighbour weighting with 5-fold cross-validation, selecting on accuracy
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [3, 5],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean'],
}
knn_grid = GridSearchCV(estimator=knn, param_grid=knn_params, scoring='accuracy', cv=5)
knn_grid.fit(x_train, y_train)

In [None]:
# Keep the best k-NN model found by the grid search and predict the test split
model_knn = knn_grid.best_estimator_
y_pred_knn = model_knn.predict(x_test)

In [None]:
print("\nKNeighborsClassifier")
y_pred_knn = model_knn.predict(x_test)
y_prob_knn = model_knn.predict_proba(x_test)[:, 1]  # second probability column (class 1)

# (label, metric function, scores passed to it) -- printed in this order
knn_metrics = [
    ("Accuracy:", accuracy_score, y_pred_knn),
    ("Precision:", precision_score, y_pred_knn),
    ("Recall:", recall_score, y_pred_knn),
    ("F1 Score:", f1_score, y_pred_knn),
    ("ROC AUC:", roc_auc_score, y_prob_knn),
]
for label, metric_fn, scores in knn_metrics:
    print(label, metric_fn(y_test, scores))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))

Save the best model, scaler, and encoders using joblib

In [None]:
import joblib

# Persist the tuned random forest together with the scaler and the label encoders
# that the prediction step needs.
# Passing a file name lets joblib open AND close the file itself; the previous
# open(..., 'wb') handles were never closed.
artifacts = {
    'model_rf.pkl': model_rf,  # save model
    'scaler.pkl': scaler,
    'l_gen.pkl': l_gen,
    'l_evrmd.pkl': l_evrmd,
    'l_restype.pkl': l_restype,
    'l_worktype.pkl': l_worktype,
    'l_smoking.pkl': l_smoking,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

In [None]:
# Reload the saved artifacts by file name, so joblib manages the file handles
# instead of leaving open(..., 'rb') handles unclosed.
# NOTE: joblib files are pickles -- only load files from a trusted source.
model_rf = joblib.load('model_rf.pkl')
scaler = joblib.load('scaler.pkl')
l_gen = joblib.load('l_gen.pkl')
l_restype = joblib.load('l_restype.pkl')
l_evrmd = joblib.load('l_evrmd.pkl')
l_worktype = joblib.load('l_worktype.pkl')
l_smoking = joblib.load('l_smoking.pkl')

In [None]:
# Preview the feature frame; its column order is the order the scaler was fitted on
x.head()

Prediction

In [None]:
# Collect one patient's details interactively.
# Categorical answers are stripped of surrounding whitespace so a stray space does
# not make the label encoders reject an otherwise valid category.
gender = input("Enter gender (Male, Female, Other): ").strip()
age = float(input("Enter age: "))
hypertension = int(input("Has hypertension? (0 = No, 1 = Yes): "))
heart_disease = int(input("Has heart disease? (0 = No, 1 = Yes): "))
ever_married = input("Ever married? (Yes or No): ").strip()
work_type = input("Enter work type (Private, Self-employed, Govt_job, children, Never_worked): ").strip()
residence_type = input("Enter residence type (Urban or Rural): ").strip()
avg_glucose_level = float(input("Enter average glucose level: "))
bmi = float(input("Enter BMI: "))
smoking_status = input("Smoking status (formerly smoked, never smoked, smokes, Unknown): ").strip()

# Single-row frame for the model input.
# NOTE(review): the key order here is assumed to match the training feature order -- confirm against `x`.
data = {
    'gender': [gender],
    'age': [age],
    'hypertension': [hypertension],
    'heart_disease': [heart_disease],
    'ever_married': [ever_married],
    'work_type': [work_type],
    'Residence_type': [residence_type],
    'avg_glucose_level': [avg_glucose_level],
    'bmi': [bmi],
    'smoking_status': [smoking_status]
}
# A separate name is used so the training DataFrame `df` is not overwritten.
input_df = pd.DataFrame(data)

# Encode categorical values with the encoders fitted during training
# (a category the encoder has never seen raises ValueError).
input_df['gender'] = l_gen.transform(input_df['gender'])
input_df['ever_married'] = l_evrmd.transform(input_df['ever_married'])
input_df['Residence_type'] = l_restype.transform(input_df['Residence_type'])
input_df['work_type'] = l_worktype.transform(input_df['work_type'])
input_df['smoking_status'] = l_smoking.transform(input_df['smoking_status'])

# NOTE(review): avg_glucose_level and bmi were IQR-capped in the training data, but
# the capping bounds were not saved, so the user's values are passed through uncapped.

# Scale with the scaler fitted on the training split
df_scaled = scaler.transform(input_df)
# Predict
predict = model_rf.predict(df_scaled)[0]
# Output
if predict == 1:
    print("The person is likely to have a stroke")
else:
    print("The person is not likely to have a stroke")