In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier

In [5]:
stroke_df = pd.read_csv("Resources/stroke_cleaned.csv")
stroke_df.head()

Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,Work,Residence,Glucose,BMI,Smoking,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


# Split data into Training and Testing 

In [7]:
# Create our features
X = stroke_df.drop("Stroke", axis=1)
X = pd.get_dummies(X)

# Create our target
y = stroke_df['Stroke']

In [8]:
X.describe()

Unnamed: 0,ID,Age,Hypertension,HeartDisease,Glucose,BMI,Gender_Female,Gender_Male,EverMarried_No,EverMarried_Yes,...,Work_Never_worked,Work_Private,Work_Self-employed,Work_children,Residence_Rural,Residence_Urban,Smoking_Unknown,Smoking_formerly smoked,Smoking_never smoked,Smoking_smokes
count,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,...,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0
mean,37060.423594,42.868989,0.091891,0.049511,105.297402,28.89456,0.590261,0.409739,0.347188,0.652812,...,0.004482,0.572535,0.157905,0.136716,0.492665,0.507335,0.30216,0.170334,0.377343,0.150163
std,20995.468407,22.555878,0.288901,0.216954,44.42555,7.85432,0.491836,0.491836,0.476125,0.476125,...,0.066808,0.494761,0.364689,0.343582,0.499997,0.499997,0.459241,0.375964,0.484771,0.357268
min,77.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18602.5,25.0,0.0,0.0,77.0675,23.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37580.5,44.0,0.0,0.0,91.68,28.1,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,55181.75,60.0,0.0,0.0,113.495,33.1,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y.value_counts()

0    4699
1     209
Name: Stroke, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Ensemble Learners

In [12]:
# Resample the training data with the BalancedRandomForestClassifier
Balance_random_forest = BalancedRandomForestClassifier(n_estimators = 100)
Balance_random_forest.fit(X_train, y_train)

BalancedRandomForestClassifier()

In [13]:
# Calculated the balanced accuracy score
y_pred = Balance_random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.764860883797054

In [14]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[758, 417],
       [  6,  46]])

In [15]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.65      0.88      0.78      0.76      0.56      1175
          1       0.10      0.88      0.65      0.18      0.76      0.58        52

avg / total       0.95      0.66      0.87      0.76      0.76      0.56      1227



In [16]:
# List the features sorted in descending order by feature importance
featureNames = X.columns
sorted(zip(Balance_random_forest.feature_importances_, X.columns), reverse=True)

[(0.3147893275977494, 'Age'),
 (0.14457623651190848, 'Glucose'),
 (0.11856530667072376, 'ID'),
 (0.11799178147535311, 'BMI'),
 (0.03301708861991399, 'EverMarried_Yes'),
 (0.03232077930790638, 'Hypertension'),
 (0.025966518131473278, 'HeartDisease'),
 (0.022589838383750067, 'Smoking_Unknown'),
 (0.01955317453877312, 'Smoking_formerly smoked'),
 (0.01887935519762196, 'EverMarried_No'),
 (0.018463502412169992, 'Residence_Rural'),
 (0.018435353107690906, 'Smoking_never smoked'),
 (0.016452114130132967, 'Work_Private'),
 (0.015918908489667233, 'Gender_Male'),
 (0.015627870414086205, 'Gender_Female'),
 (0.015168945222231282, 'Smoking_smokes'),
 (0.01497820175028991, 'Work_Self-employed'),
 (0.014660911296336598, 'Residence_Urban'),
 (0.011532210957084505, 'Work_children'),
 (0.010497399185411656, 'Work_Govt_job'),
 (1.5176599725202212e-05, 'Work_Never_worked')]

# Easy Ensemble AdaBoost Classifier

In [17]:
# Train the EasyEnsembleClassifier
easy_ensemble = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easy_ensemble.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [18]:
# Calculated the balanced accuracy score
y_pred = easy_ensemble.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7431587561374795

In [19]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[707, 468],
       [  6,  46]])

In [20]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.60      0.88      0.75      0.73      0.52      1175
          1       0.09      0.88      0.60      0.16      0.73      0.55        52

avg / total       0.95      0.61      0.87      0.72      0.73      0.52      1227

