In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

In [4]:
stroke_df = pd.read_csv("Resources/stroke_cleaned.csv")
stroke_df.head()

Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,Work,Residence,Glucose,BMI,Smoking,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [5]:
# Create our features
X = stroke_df.drop("Stroke", axis=1)
X = pd.get_dummies(X)

# Create our target
y = stroke_df["Stroke"]

In [6]:
X.describe()

Unnamed: 0,ID,Age,Hypertension,HeartDisease,Glucose,BMI,Gender_Female,Gender_Male,EverMarried_No,EverMarried_Yes,...,Work_Never_worked,Work_Private,Work_Self-employed,Work_children,Residence_Rural,Residence_Urban,Smoking_Unknown,Smoking_formerly smoked,Smoking_never smoked,Smoking_smokes
count,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,...,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0
mean,37060.423594,42.868989,0.091891,0.049511,105.297402,28.89456,0.590261,0.409739,0.347188,0.652812,...,0.004482,0.572535,0.157905,0.136716,0.492665,0.507335,0.30216,0.170334,0.377343,0.150163
std,20995.468407,22.555878,0.288901,0.216954,44.42555,7.85432,0.491836,0.491836,0.476125,0.476125,...,0.066808,0.494761,0.364689,0.343582,0.499997,0.499997,0.459241,0.375964,0.484771,0.357268
min,77.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18602.5,25.0,0.0,0.0,77.0675,23.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37580.5,44.0,0.0,0.0,91.68,28.1,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,55181.75,60.0,0.0,0.0,113.495,33.1,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Check the balance of our target values
y.value_counts()

0    4699
1     209
Name: Stroke, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Oversampling

### Naive Random Oversampling

In [9]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 3524, 1: 3524})

In [10]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7332078559738134

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[774, 401],
       [ 10,  42]])

In [13]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.66      0.81      0.79      0.73      0.52      1175
          1       0.09      0.81      0.66      0.17      0.73      0.54        52

avg / total       0.95      0.67      0.80      0.76      0.73      0.52      1227



In [14]:
d_1 = {"y_test": y_test,"y_pred": y_pred}
df_1 = pd.DataFrame(d_1)
df_1

Unnamed: 0,y_test,y_pred
2251,0,0
1101,0,1
4899,0,1
3586,0,0
4068,0,1
...,...,...
452,0,0
810,0,1
1776,0,0
4701,0,0


### SMOTE Oversampling

In [15]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 3524, 1: 3524})

In [16]:
# Train the Logistic Regression model using the resampled data
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [17]:
# Calculated the balanced accuracy score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6226104746317512

In [18]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1079,   96],
       [  35,   17]])

In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.92      0.33      0.94      0.55      0.32      1175
          1       0.15      0.33      0.92      0.21      0.55      0.28        52

avg / total       0.93      0.89      0.35      0.91      0.55      0.32      1227



In [20]:
d_2 = {"y_test": y_test,"y_pred": y_pred}
df_2 = pd.DataFrame(d_2)
df_2

Unnamed: 0,y_test,y_pred
2251,0,0
1101,0,0
4899,0,0
3586,0,0
4068,0,1
...,...,...
452,0,0
810,0,0
1776,0,0
4701,0,0


# Undersampling

In [21]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 157, 1: 157})

In [22]:
# Train the Logistic Regression model using the resampled data
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [23]:
# Calculated the balanced accuracy score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5565139116202946

In [24]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[178, 997],
       [  2,  50]])

In [25]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.15      0.96      0.26      0.38      0.13      1175
          1       0.05      0.96      0.15      0.09      0.38      0.16        52

avg / total       0.95      0.19      0.93      0.26      0.38      0.13      1227



In [26]:
d_3 = {"y_test": y_test,"y_pred": y_pred}
df_3 = pd.DataFrame(d_3)
df_3

Unnamed: 0,y_test,y_pred
2251,0,1
1101,0,1
4899,0,1
3586,0,1
4068,0,1
...,...,...
452,0,0
810,0,1
1776,0,1
4701,0,0


# Combination (Over and Under) Sampling

In [27]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteen = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteen.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 2535, 1: 3079})

In [28]:
# Train the Logistic Regression model using the resampled data
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [29]:
# Calculated the balanced accuracy score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7337152209492634

In [30]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[730, 445],
       [  8,  44]])

In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.62      0.85      0.76      0.73      0.51      1175
          1       0.09      0.85      0.62      0.16      0.73      0.54        52

avg / total       0.95      0.63      0.84      0.74      0.73      0.51      1227



In [32]:
d_4 = {"y_test": y_test,"y_pred": y_pred}
df_4 = pd.DataFrame(d_4)
df_4

Unnamed: 0,y_test,y_pred
2251,0,0
1101,0,1
4899,0,1
3586,0,0
4068,0,1
...,...,...
452,0,0
810,0,1
1776,0,1
4701,0,0
