In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw stroke dataset from the notebook's working directory.
df = pd.read_csv('./healthcare-dataset-stroke-data.csv')
# Normalise header case once: every later cell addresses columns in lowercase
# (e.g. 'residence_type').
# NOTE(review): the public release of this CSV is believed to spell that column
# 'Residence_type' -- confirm against the local file. Lower-casing is a no-op
# when the headers are already lowercase.
df.columns = df.columns.str.lower()
df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
print(df.describe())
# drop_duplicates() returns a new frame; without assigning the result back the
# de-duplication was silently discarded.
df = df.drop_duplicates()
# Missing-value count per column (displayed as the cell's output).
df.isnull().sum()

In [None]:
# Fill missing BMI values with the mean BMI of the row's own stroke class.
#
# .copy() makes each subset an independent frame and the filled column is
# assigned back explicitly. The previous `subset['bmi'].fillna(..., inplace=True)`
# was chained assignment on a slice of `df`: it triggers chained-assignment
# warnings and does not modify the frame at all under pandas copy-on-write.
df0 = df[df['stroke'] == 0].copy()
df1 = df[df['stroke'] == 1].copy()
df0['bmi'] = df0['bmi'].fillna(df0['bmi'].mean())
df1['bmi'] = df1['bmi'].fillna(df1['bmi'].mean())
# Rows come back grouped by class: all stroke == 0 rows first, then stroke == 1.
df = pd.concat([df0, df1], axis=0)
# NOTE(review): imputing per target class uses the label to build a feature.
# The class-agnostic alternative is:
# df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.isnull().sum()

In [None]:
### Convert the text-valued categorical columns to integer codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# One shared encoder is re-fitted for each column in turn, so after the loop it
# only holds the mapping of the last column processed.
for categorical_column in ('gender', 'smoking_status', 'work_type',
                           'ever_married', 'residence_type'):
    df[categorical_column] = le.fit_transform(df[categorical_column])
df.info()

In [None]:
# Column names of interest.
# NOTE(review): no other cell visible in this notebook reads this list.
clms = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'stroke',
]

import matplotlib.pyplot as plt
import seaborn as sns

def plotCols(dataframe, plots_per_row=7):
    """Draw a histogram of every column of ``dataframe`` in a single grid figure.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column, in column order.
    plots_per_row : int, optional
        Number of subplots placed on each grid row. Defaults to 7, the layout
        this function has always used.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    totalPlots = len(dataframe.columns)
    if totalPlots == 0:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceiling division: exactly as many rows as the columns need. The previous
    # `totalPlots//7+1` allocated a spare empty row whenever the column count
    # was an exact multiple of the row width.
    plotRows = -(-totalPlots // plots_per_row)
    plt.figure(figsize=(20, plotRows * 2.5))

    for plotnumber, column in enumerate(dataframe.columns, start=1):
        ax = plt.subplot(plotRows, plots_per_row, plotnumber)
        sns.histplot(dataframe[column], ax=ax)
        ax.set_xlabel(column)

    plt.tight_layout()
    plt.show()
# Histogram grid over every column of the current frame.
plotCols(dataframe=df)

In [None]:
# Pairwise column correlations of the current frame.
corr_metrix = df.corr()
# Correlation of every column with the target, scaled to a percentage.
cm = corr_metrix['stroke']*100
# A bare expression is only echoed when it is the last statement of a cell, so
# the former mid-cell `cm` displayed nothing; print it explicitly.
print(cm)
plt.figure(figsize=(10,7))
sns.heatmap(corr_metrix*100, annot = True, cmap='RdYlGn')
plt.title('Correlation between columns (%)')
plt.show()

In [None]:
# Feature matrix: every column except the ones listed here (row id, target and
# the categorical columns left out of the model). Target vector: 'stroke'.
excluded_columns = ['id','gender','residence_type', 'stroke','work_type','smoking_status']
X = df.drop(columns=excluded_columns)
y = df['stroke']
print(X.shape,y.shape)
X.head()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split FIRST, oversample afterwards, and only the training portion.
# Running SMOTE on the full data before the split lets synthetic points that
# were interpolated from future test rows end up in the training set (and puts
# synthetic rows into the test set), which inflates every reported test metric.
# stratify=y keeps the class ratio of the original data in both portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=320, stratify=y
)

# n_jobs is intentionally not passed: the parameter is deprecated on SMOTE
# since imbalanced-learn 0.10 (see the SMOTE API reference).
oversample = SMOTE(sampling_strategy="minority", random_state=42, k_neighbors=10)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# X_test / y_test stay untouched so they reflect the original class balance.
print(X_train.shape,X_test.shape)

In [None]:
### Logistic Regression
# Baseline classifier fitted with scikit-learn's default settings.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on each split, expressed as a percentage.
train_accuracy = lr.score(X_train, y_train) * 100
test_accuracy = lr.score(X_test, y_test) * 100
print('Train Acc -> ', train_accuracy)
print('Test Acc -> ', test_accuracy)

In [None]:
### hyperparameter tuning
# penalty = ['l1', 'l2', 'elasticnet']
# C = [1,10,15,20]
# solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
# multi_class = ['auto', 'ovr', 'multinomial']

# params = {
    # 'penalty' : penalty,
    # 'C': C,
    # 'solver': solver,
    # 'multi_class': multi_class
# }
# from sklearn.model_selection import GridSearchCV
# lr_grid = GridSearchCV(estimator=lr, param_grid=params, cv=5, n_jobs=-1)
# lr_grid.fit(X_train,y_train)
# print('Best Parameters -> ',lr_grid.best_params_)
# print('Best Score -> ',lr_grid.best_score_)

In [None]:
# # hyper parameter tuning of random forest

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# import matplotlib.pyplot as plt

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)


# from sklearn.model_selection import GridSearchCV
# grid_param = {
#     'n_estimators' : [100, 120, 150, 200, 250],
#     'criterion' : ['gini', 'entropy', 'log_loss'],
#     'max_depth' : [7, 10, 13, 15],
#     'class_weight': ['balanced', 
#                      'balanced_subsample'
#                      ],
#     # 'min_samples_leaf' : [1, 3, 5, 7],
#     # 'min_samples_split' : [2, 3, 5, 7],
#     'max_features' : ['sqrt', 'log2']
# }

# grid_search_rf = GridSearchCV(rf, grid_param, cv = 5, n_jobs = -1, verbose = 3)
# grid_search_rf.fit(X_train, y_train)



# # best parameters and best score
# print(grid_search_rf.best_params_)
# print(grid_search_rf.best_score_)
# print(grid_search_rf.best_estimator_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state pins the forest's bootstrap sampling and feature selection, so
# the metrics below and the model that is pickled later are reproducible
# across kernel restarts.
rf = RandomForestClassifier(
    class_weight='balanced_subsample',
    max_depth=20,
    n_estimators=200,
    random_state=42,
)
rf.fit(X_train, y_train)
print('Train Acc -> ',rf.score(X_train,y_train)*100)
print('Test Acc -> ',rf.score(X_test,y_test)*100)
y_pred = rf.predict(X_test)

# mean_absolute_error / mean_squared_error are imported here as well because a
# later cell uses them.
from sklearn.metrics import confusion_matrix, classification_report,mean_absolute_error,mean_squared_error
# Confusion matrix of the forest on the test split, printed and drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred)
print(cm)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test, y_pred))

In [None]:
# Score the logistic-regression model on the test split.
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
)

y_pred = lr.predict(X_test)
print(y_pred)

# Confusion matrix of the predictions above (echoed as the cell's output).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Heatmap of the confusion matrix held in `cm`, followed by the per-class report.
plt.figure(figsize=(6,3))
# fmt="d" prints the cell counts as plain integers; seaborn's default '.2g'
# annotation format shows larger counts in scientific notation (e.g. 7.5e+02).
# This also matches the random-forest heatmap, which already uses fmt="d".
sns.heatmap(cm, annot=True, fmt="d", cmap='RdYlGn')
plt.xlabel('Predicted')
plt.ylabel('Truth')

print(classification_report(y_test, y_pred))

In [None]:
# Error metrics between the true test labels and the current predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric on its own line, label first.
for metric_label, metric_value in (('mae => ', mae), ('mse => ', mse), ('rmse => ', rmse)):
    print(metric_label, metric_value)

In [None]:
# Persist the fitted random-forest model to disk with pickle.
import pickle

# Destination file for the serialized stroke random-forest classifier.
model_pkl_file = "./stroke-rf.pkl"

with open(model_pkl_file, mode='wb') as model_out:
    pickle.dump(rf, model_out)

In [None]:
# Round-trip check: reload the pickled model and score it on the test split.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that this notebook (or another trusted source) produced.
with open(model_pkl_file, mode='rb') as model_in:
    loaded_rf = pickle.load(model_in)

# The model is fully in memory once loaded, so it is used after the file closes.
y_pred = loaded_rf.predict(X_test)

# Mean accuracy of the reloaded model on the test split.
pred = loaded_rf.score(X_test, y_test)
print(f"Accuracy : {pred * 100}%")