In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier

In [4]:
stroke_df = pd.read_csv("Resources/stroke_cleaned.csv")
stroke_df.head()

Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,Work,Residence,Glucose,BMI,Smoking,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


# Split data into Training and Testing 

In [5]:
# Create our features
X = stroke_df.drop("Stroke", axis=1)
X = pd.get_dummies(X)

# Create our target
y = stroke_df['Stroke']

In [6]:
X.describe()

Unnamed: 0,ID,Age,Hypertension,HeartDisease,Glucose,BMI,Gender_Female,Gender_Male,EverMarried_No,EverMarried_Yes,...,Work_Never_worked,Work_Private,Work_Self-employed,Work_children,Residence_Rural,Residence_Urban,Smoking_Unknown,Smoking_formerly smoked,Smoking_never smoked,Smoking_smokes
count,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,...,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0
mean,37060.423594,42.868989,0.091891,0.049511,105.297402,28.89456,0.590261,0.409739,0.347188,0.652812,...,0.004482,0.572535,0.157905,0.136716,0.492665,0.507335,0.30216,0.170334,0.377343,0.150163
std,20995.468407,22.555878,0.288901,0.216954,44.42555,7.85432,0.491836,0.491836,0.476125,0.476125,...,0.066808,0.494761,0.364689,0.343582,0.499997,0.499997,0.459241,0.375964,0.484771,0.357268
min,77.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18602.5,25.0,0.0,0.0,77.0675,23.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37580.5,44.0,0.0,0.0,91.68,28.1,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,55181.75,60.0,0.0,0.0,113.495,33.1,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Check the balance of our target values
y.value_counts()

0    4699
1     209
Name: Stroke, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Ensemble Learners

In [9]:
# Resample the training data with the BalancedRandomForestClassifier
Balance_random_forest = BalancedRandomForestClassifier(n_estimators = 100)
Balance_random_forest.fit(X_train, y_train)

BalancedRandomForestClassifier()

In [10]:
# Calculated the balanced accuracy score
y_pred = Balance_random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7772013093289689

In [11]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[787, 388],
       [  6,  46]])

In [12]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.67      0.88      0.80      0.77      0.58      1175
          1       0.11      0.88      0.67      0.19      0.77      0.61        52

avg / total       0.95      0.68      0.88      0.77      0.77      0.58      1227



In [13]:
# List the features sorted in descending order by feature importance
featureNames = X.columns
sorted(zip(Balance_random_forest.feature_importances_, X.columns), reverse=True)

[(0.29943698612079317, 'Age'),
 (0.13011068629008632, 'Glucose'),
 (0.12730177479929347, 'ID'),
 (0.12339775165925453, 'BMI'),
 (0.03853747633415701, 'Hypertension'),
 (0.03194105368404102, 'EverMarried_Yes'),
 (0.03099340876440615, 'EverMarried_No'),
 (0.027619101302946918, 'HeartDisease'),
 (0.020599384356065946, 'Smoking_never smoked'),
 (0.019003628906789777, 'Smoking_Unknown'),
 (0.017948270060023093, 'Work_Private'),
 (0.017663264806428783, 'Work_Self-employed'),
 (0.01672993235834327, 'Smoking_smokes'),
 (0.016358648921986976, 'Smoking_formerly smoked'),
 (0.015866794376554406, 'Gender_Male'),
 (0.015394898569585449, 'Residence_Urban'),
 (0.014675469849523315, 'Gender_Female'),
 (0.014000099942600213, 'Residence_Rural'),
 (0.012528061341474417, 'Work_Govt_job'),
 (0.009829362947697791, 'Work_children'),
 (6.394460794806922e-05, 'Work_Never_worked')]

In [14]:
d_1 = {"y_test": y_test,"y_pred": y_pred}
df_1 = pd.DataFrame(d_1)
df_1

Unnamed: 0,y_test,y_pred
2251,0,0
1101,0,1
4899,0,1
3586,0,0
4068,0,1
...,...,...
452,0,0
810,0,1
1776,0,0
4701,0,0


# Easy Ensemble AdaBoost Classifier

In [15]:
# Train the EasyEnsembleClassifier
easy_ensemble = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easy_ensemble.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [16]:
# Calculated the balanced accuracy score
y_pred = easy_ensemble.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7431587561374795

In [17]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[707, 468],
       [  6,  46]])

In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.60      0.88      0.75      0.73      0.52      1175
          1       0.09      0.88      0.60      0.16      0.73      0.55        52

avg / total       0.95      0.61      0.87      0.72      0.73      0.52      1227



In [19]:
d_2 = {"y_test": y_test,"y_pred": y_pred}
df_2 = pd.DataFrame(d_2)
df_2

Unnamed: 0,y_test,y_pred
2251,0,0
1101,0,1
4899,0,1
3586,0,0
4068,0,1
...,...,...
452,0,0
810,0,1
1776,0,0
4701,0,0
