In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN

In [2]:
# Data Import
file_path = Path("Resources/healthcare-dataset-stroke-data.csv")


stroke_df = pd.read_csv(file_path)

stroke_df.drop(columns=['id'], inplace=True)

stroke_df = stroke_df.dropna()

stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [3]:
stroke_df=pd.get_dummies(stroke_df, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
stroke_df.drop_duplicates(inplace=True)
stroke_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0


In [4]:
X = stroke_df.drop(columns="stroke", axis=1)

scaler=MinMaxScaler()
scaler.fit(X[["age", "avg_glucose_level", "bmi"]])
X_scaled=scaler.transform(X[["age", "avg_glucose_level", "bmi"]])
X_scaled

array([[0.81689453, 0.80126489, 0.30126002],
       [0.97558594, 0.23451205, 0.25429553],
       [0.59716797, 0.53600776, 0.27605956],
       ...,
       [0.42626953, 0.12865848, 0.2325315 ],
       [0.62158203, 0.51320284, 0.17525773],
       [0.53613281, 0.13922999, 0.18213058]])

In [5]:
X_scaled = pd.DataFrame(data=X_scaled, columns=["age", "avg_glucose_level", "bmi"], index=X.index)
X_scaled = X_scaled.join(X.drop(columns=["age", "avg_glucose_level", "bmi"], axis=1))
X_scaled.head()

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.816895,0.801265,0.30126,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,0
2,0.975586,0.234512,0.254296,0,1,0,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0
3,0.597168,0.536008,0.27606,0,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
4,0.963379,0.549349,0.15693,1,0,1,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
5,0.987793,0.605161,0.214204,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,0


In [6]:
y = stroke_df['stroke']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

In [8]:
X_train

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
3586,0.462891,0.243099,0.169530,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,1,0
716,0.353027,0.019112,0.292096,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,0,0,1
483,0.108887,0.182208,0.111111,0,0,0,1,0,1,0,...,0,0,0,1,1,0,1,0,0,0
2730,0.328613,0.257363,0.219931,0,0,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1
620,0.401855,0.761010,0.335624,0,0,0,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3957,0.462891,0.199658,0.193585,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0
3810,0.475098,0.106731,0.209622,0,0,1,0,0,0,1,...,0,1,0,0,0,1,1,0,0,0
666,0.926758,0.066891,0.286369,1,0,0,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
3820,0.743652,0.272228,0.176403,0,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [9]:
from collections import Counter
rus = RandomUnderSampler(random_state=1)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)
Counter(y_undersampled)

Counter({0: 158, 1: 158})

In [10]:
ros=RandomOverSampler()
X_oversample, y_oversample = ros.fit_resample(X_train, y_train)
Counter(y_oversample)

Counter({0: 3523, 1: 3523})

In [11]:
X_train

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
3586,0.462891,0.243099,0.169530,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,1,0
716,0.353027,0.019112,0.292096,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,0,0,1
483,0.108887,0.182208,0.111111,0,0,0,1,0,1,0,...,0,0,0,1,1,0,1,0,0,0
2730,0.328613,0.257363,0.219931,0,0,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1
620,0.401855,0.761010,0.335624,0,0,0,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3957,0.462891,0.199658,0.193585,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0
3810,0.475098,0.106731,0.209622,0,0,1,0,0,0,1,...,0,1,0,0,0,1,1,0,0,0
666,0.926758,0.066891,0.286369,1,0,0,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
3820,0.743652,0.272228,0.176403,0,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [12]:
from imblearn.combine import SMOTEENN

smote = SMOTEENN()
X_smoteen, y_smoteen = smote.fit_resample(X_train, y_train)
Counter(y_smoteen)

Counter({0: 2945, 1: 3188})

#### Model 1: Undersampled

In [13]:
# Create model instance
rf_model_u = RandomForestClassifier(n_estimators=512) 

In [14]:
# Fit the model
rf_model_u = rf_model_u.fit(X_undersampled, y_undersampled)

In [15]:
# Making predictions
predictions_u = rf_model_u.predict(X_test)

In [16]:
# confusion matrix.
cmu = confusion_matrix(y_test, predictions_u)

cmu_df = pd.DataFrame(
    cmu, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [17]:
# accuracy score.
acc_score_u = accuracy_score(y_test, predictions_u)

In [18]:
# results
print("Undersampled Confusion Matrix")
display(cmu_df)
print(f"Accuracy Score : {acc_score_u}")
print("Classification Report")
print(classification_report(y_test, predictions_u))

Undersampled Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,874,303
Actual 1,11,40


Accuracy Score : 0.744299674267101
Classification Report
              precision    recall  f1-score   support

           0       0.99      0.74      0.85      1177
           1       0.12      0.78      0.20        51

    accuracy                           0.74      1228
   macro avg       0.55      0.76      0.53      1228
weighted avg       0.95      0.74      0.82      1228



#### Model 2: Oversampled

In [19]:
# Create model instance
rf_model_o = RandomForestClassifier(n_estimators=512) 

In [20]:
# Fit the model
rf_model_o = rf_model_o.fit(X_oversample, y_oversample)

In [21]:
# Making predictions
predictions_o = rf_model_o.predict(X_test)

In [22]:
# confusion matrix.
cmo = confusion_matrix(y_test, predictions_o)

cmo_df = pd.DataFrame(
    cmo, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [23]:
# accuracy score.
acc_score_o = accuracy_score(y_test, predictions_o)

In [24]:
# results
print("Oversampled Confusion Matrix")
display(cmo_df)
print(f"Accuracy Score : {acc_score_o}")
print("Classification Report")
print(classification_report(y_test, predictions_o))

Oversampled Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1173,4
Actual 1,51,0


Accuracy Score : 0.9552117263843648
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1177
           1       0.00      0.00      0.00        51

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.92      0.96      0.94      1228



#### Model 3: SMOTE

In [25]:
# Create model instance
rf_model_s = RandomForestClassifier(n_estimators=512) 

In [26]:
# Fit the model
rf_model_s = rf_model_s.fit(X_smoteen, y_smoteen)

In [27]:
# Making predictions
predictions_s = rf_model_s.predict(X_test)

In [28]:
# confusion matrix.
cms = confusion_matrix(y_test, predictions_o)

cms_df = pd.DataFrame(
    cms, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [29]:
# accuracy score.
acc_score_s = accuracy_score(y_test, predictions_s)

In [30]:
# results
print("SMOTEEN Confusion Matrix")
display(cms_df)
print(f"Accuracy Score : {acc_score_s}")
print("Classification Report")
print(classification_report(y_test, predictions_s))

SMOTEEN Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1173,4
Actual 1,51,0


Accuracy Score : 0.8672638436482085
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      1177
           1       0.10      0.27      0.15        51

    accuracy                           0.87      1228
   macro avg       0.53      0.58      0.54      1228
weighted avg       0.93      0.87      0.90      1228



#### Model 3: Add Estimators

In [36]:
# Create model instance
rf_model_s2 = RandomForestClassifier(n_estimators=4112) 

# Fit the model
rf_model_s2 = rf_model_s2.fit(X_smoteen, y_smoteen)

# Making predictions
predictions_s2 = rf_model_s2.predict(X_test)

# confusion matrix.
cms2 = confusion_matrix(y_test, predictions_s2)

cms2_df = pd.DataFrame(
    cms2, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# accuracy score.
acc_score_s2 = accuracy_score(y_test, predictions_s2)

In [37]:
# results
print("SMOTEEN Confusion Matrix 2")
display(cms2_df)
print(f"Accuracy Score : {acc_score_s2}")
print("Classification Report")
print(classification_report(y_test, predictions_s2))

SMOTEEN Confusion Matrix 2


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1048,129
Actual 1,34,17


Accuracy Score : 0.8672638436482085
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      1177
           1       0.12      0.33      0.17        51

    accuracy                           0.87      1228
   macro avg       0.54      0.61      0.55      1228
weighted avg       0.93      0.87      0.90      1228

