# Sampling

In [1]:
# Install a blanket "ignore" filter so no warnings are shown in later cells.
# NOTE(review): this may also mask deprecation or convergence warnings from the
# libraries used below — consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter

In [3]:
# Read the preprocessed stroke dataset; the first CSV column becomes the index
file_path = Path("resources") / "processed_stroke_db_w_cat.csv"
df = pd.read_csv(file_path, index_col=0)
df

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


### One-hot Encoding for Categorical Data & StandardScaler for Numerical Data

In [4]:
# Collect the names of every column whose dtype is "object" (the categorical variables)
cat_labels = [column for column, dtype in df.dtypes.items() if dtype == "object"]
cat_labels

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [5]:
# Count the distinct values found in each categorical column
df.loc[:, cat_labels].nunique()

gender            3
ever_married      2
work_type         5
Residence_type    2
smoking_status    4
dtype: int64

In [6]:
# Create the OneHotEncoder instance configured to return a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` (the old name
# was removed in 1.4), so try the new keyword first and fall back to the old one
# on releases that do not know it yet.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [7]:
# Create a OneHotEncoder instance configured to return a dense array.
# NOTE(review): this cell repeats the encoder construction from the preceding
# cell; one of the two is redundant.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` (the old name
# was removed in 1.4), so try the new keyword first and fall back to the old one
# on releases that do not know it yet.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [8]:
# Fit the encoder on the categorical columns and wrap the encoded array in a DataFrame
encode_df = pd.DataFrame(data=enc.fit_transform(X=df[cat_labels]))

In [9]:
# Give encode_df the same index as df so the two frames can be merged on it
encode_df = encode_df.set_index(df.index)

In [10]:
# Add the encoded variable names to the DataFrame.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever this installation provides.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(cat_labels)
else:
    encode_df.columns = enc.get_feature_names(cat_labels)
encode_df.head()

Unnamed: 0_level_0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9046,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
31112,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
60182,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1665,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
56669,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [11]:
# Merge one-hot encoded features (aligned on the shared index) and drop the originals.
# NOTE: this cell rebinds `df`; rerunning it without reloading the data fails
# because the original categorical columns have already been dropped.
df = df.merge(encode_df, left_index=True, right_index=True)
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and no longer works in pandas 2.0.
df = df.drop(columns=cat_labels)
df.head()

Unnamed: 0_level_0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9046,67.0,0,1,228.69,36.6,1,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
31112,80.0,0,1,105.92,32.5,1,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
60182,49.0,0,0,171.23,34.4,1,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1665,79.0,1,0,174.12,24.0,1,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
56669,81.0,0,0,186.21,29.0,1,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [12]:
# Non-null row counts per column for each value of the target `stroke`
df.groupby(by="stroke").count()

Unnamed: 0_level_0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4700,4700,4700,4700,4700,4700,4700,4700,4700,4700,...,4700,4700,4700,4700,4700,4700,4700,4700,4700,4700
1,209,209,209,209,209,209,209,209,209,209,...,209,209,209,209,209,209,209,209,209,209


In [13]:
# Split our preprocessed data into our features and target arrays
y = df["stroke"].values
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and no longer works in pandas 2.0.
X = df.drop(columns=["stroke"]).values

In [14]:
# Hold out a test set; stratifying on y keeps the class ratio in both splits,
# and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=7,
)

In [15]:
# Instantiate the scaler (centering and unit-variance scaling are its defaults)
scaler = StandardScaler(with_mean=True, with_std=True)

In [16]:
# Fit the scaler on the training split only
X_scaler = scaler.fit(X=X_train)

In [17]:
# Apply the fitted scaler to both splits.
# NOTE(review): the resampling and prediction cells further down use X_train /
# X_test rather than these scaled arrays — confirm whether the scaled versions
# were meant to be used there.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Oversampling

### Naive Random

In [18]:
# Oversample the training split with RandomOverSampler
from imblearn.over_sampling import RandomOverSampler

# Fixed seed so the resampling is repeatable
ros = RandomOverSampler(random_state=1)
# Resample the training features together with their labels.
# NOTE(review): the unscaled X_train is resampled here even though
# X_train_scaled was computed earlier — confirm this is intended.
X_resampled, y_resampled = ros.fit_resample(X=X_train, y=y_train)
# Show the class counts after resampling
Counter(y_resampled)

Counter({0: 3524, 1: 3524})

In [19]:
# Build a logistic regression classifier and train it on the resampled data
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=1, solver="lbfgs")

# Fit on the resampled training set
logreg.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [20]:
# Calculate the balanced accuracy score on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = logreg.predict(X=X_test)
bas = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
bas

0.7540881737310309

In [21]:
# Build the confusion matrix and label its rows (actual) and columns (predicted)
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual No Stroke", "Actual Stroke"],
    columns=["Predicted No Stroke", "Predicted Stroke"],
)
cm_df

Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,869,307
Actual Stroke,12,40


In [22]:
# Show the imbalanced-learn classification report for the test predictions
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.74      0.77      0.84      0.75      0.57      1176
          1       0.12      0.77      0.74      0.20      0.75      0.57        52

avg / total       0.95      0.74      0.77      0.82      0.75      0.57      1228



In [23]:
# Combined summary: title, balanced accuracy, confusion matrix and report
print(
    "Naive Random Oversampling",
    f"Balanced Accuracy Score: {bas}",
    cm_df,
    classification_report_imbalanced(y_test, y_pred),
    sep="\n",
)

Naive Random Oversampling
Balanced Accuracy Score: 0.7540881737310309
                  Predicted No Stroke  Predicted Stroke
Actual No Stroke                  869               307
Actual Stroke                      12                40
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.74      0.77      0.84      0.75      0.57      1176
          1       0.12      0.77      0.74      0.20      0.75      0.57        52

avg / total       0.95      0.74      0.77      0.82      0.75      0.57      1228



### SMOTE

In [24]:
# Oversample the training split with SMOTE
from imblearn.over_sampling import SMOTE

# Fixed seed so the resampling is repeatable
smote = SMOTE(random_state=1)
# Resample the training features together with their labels
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)
# Show the class counts after resampling
Counter(y_resampled)

Counter({0: 3524, 1: 3524})

In [25]:
# Refit the existing logistic regression model on the SMOTE-resampled data
logreg.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [26]:
# Calculate the balanced accuracy score on the test split
y_pred = logreg.predict(X=X_test)
bas = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
bas

0.7446363160648874

In [27]:
# Display the confusion matrix for the SMOTE-trained model.
# `confusion_matrix` is already imported by the confusion-matrix cell of the
# Naive Random section, so the redundant re-import is dropped here.
cm = confusion_matrix(y_test, y_pred)
# Label rows with the actual class and columns with the predicted class
cm_df = pd.DataFrame(
    cm,
    index=["Actual No Stroke", "Actual Stroke"],
    columns=["Predicted No Stroke", "Predicted Stroke"],
)
cm_df

Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,892,284
Actual Stroke,14,38


In [28]:
# Show the imbalanced-learn classification report for the test predictions
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.76      0.73      0.86      0.74      0.56      1176
          1       0.12      0.73      0.76      0.20      0.74      0.55        52

avg / total       0.95      0.76      0.73      0.83      0.74      0.56      1228



In [29]:
# Combined summary: title, balanced accuracy, confusion matrix and report
print(
    "SMOTE Oversampling",
    f"Balanced Accuracy Score: {bas}",
    cm_df,
    classification_report_imbalanced(y_test, y_pred),
    sep="\n",
)

SMOTE Oversampling
Balanced Accuracy Score: 0.7446363160648874
                  Predicted No Stroke  Predicted Stroke
Actual No Stroke                  892               284
Actual Stroke                      14                38
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.76      0.73      0.86      0.74      0.56      1176
          1       0.12      0.73      0.76      0.20      0.74      0.55        52

avg / total       0.95      0.76      0.73      0.83      0.74      0.56      1228



### ADASYN

In [30]:
# Resample the training data with ADASYN
from imblearn.over_sampling import ADASYN

# Instantiate the sampler under its own name; the original bound it to `smote`,
# which was misleading and overwrote the SMOTE sampler from the previous section.
adasyn = ADASYN(random_state=1)
# Resample the training features together with their labels
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Show the class counts after resampling
Counter(y_resampled)

Counter({0: 3524, 1: 3512})

In [31]:
# Refit the existing logistic regression model on the ADASYN-resampled data
logreg.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [32]:
# Calculate the balanced accuracy score on the test split
y_pred = logreg.predict(X=X_test)
bas = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
bas

0.7724686028257457

In [33]:
# Display the confusion matrix for the ADASYN-trained model.
# `confusion_matrix` is already imported by the confusion-matrix cell of the
# Naive Random section, so the redundant re-import is dropped here.
cm = confusion_matrix(y_test, y_pred)
# Label rows with the actual class and columns with the predicted class
cm_df = pd.DataFrame(
    cm,
    index=["Actual No Stroke", "Actual Stroke"],
    columns=["Predicted No Stroke", "Predicted Stroke"],
)
cm_df

Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,867,309
Actual Stroke,10,42


In [34]:
# Show the imbalanced-learn classification report for the test predictions
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.74      0.81      0.84      0.77      0.59      1176
          1       0.12      0.81      0.74      0.21      0.77      0.60        52

avg / total       0.95      0.74      0.80      0.82      0.77      0.59      1228



In [35]:
# Combined summary: title, balanced accuracy, confusion matrix and report
print(
    "ADASYN Oversampling",
    f"Balanced Accuracy Score: {bas}",
    cm_df,
    classification_report_imbalanced(y_test, y_pred),
    sep="\n",
)

ADASYN Oversampling
Balanced Accuracy Score: 0.7724686028257457
                  Predicted No Stroke  Predicted Stroke
Actual No Stroke                  867               309
Actual Stroke                      10                42
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.74      0.81      0.84      0.77      0.59      1176
          1       0.12      0.81      0.74      0.21      0.77      0.60        52

avg / total       0.95      0.74      0.80      0.82      0.77      0.59      1228



## Undersampling

### Cluster Centroids

In [36]:
# Undersample the training split with the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids

# Fixed seed so the resampling is repeatable
cc = ClusterCentroids(random_state=1)
# Resample the training features together with their labels
X_resampled, y_resampled = cc.fit_resample(X=X_train, y=y_train)
# Show the class counts after resampling
Counter(y_resampled)

Counter({0: 157, 1: 157})

In [37]:
# Refit the existing logistic regression model on the undersampled data
logreg.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [38]:
# Calculate the balanced accuracy score on the test split
y_pred = logreg.predict(X=X_test)
bas = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
bas

0.7243916797488226

In [39]:
# Display the confusion matrix for the ClusterCentroids-trained model.
# `confusion_matrix` is already imported by the confusion-matrix cell of the
# Naive Random section, so the redundant re-import is dropped here.
cm = confusion_matrix(y_test, y_pred)
# Label rows with the actual class and columns with the predicted class
cm_df = pd.DataFrame(
    cm,
    index=["Actual No Stroke", "Actual Stroke"],
    columns=["Predicted No Stroke", "Predicted Stroke"],
)
cm_df

Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,867,309
Actual Stroke,15,37


In [40]:
# Show the imbalanced-learn classification report for the test predictions
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.74      0.71      0.84      0.72      0.53      1176
          1       0.11      0.71      0.74      0.19      0.72      0.52        52

avg / total       0.95      0.74      0.71      0.81      0.72      0.53      1228



In [41]:
# Combined summary: title, balanced accuracy, confusion matrix and report
print(
    "Cluster Centroids Undersampling",
    f"Balanced Accuracy Score: {bas}",
    cm_df,
    classification_report_imbalanced(y_test, y_pred),
    sep="\n",
)

Cluster Centroids Undersampling
Balanced Accuracy Score: 0.7243916797488226
                  Predicted No Stroke  Predicted Stroke
Actual No Stroke                  867               309
Actual Stroke                      15                37
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.74      0.71      0.84      0.72      0.53      1176
          1       0.11      0.71      0.74      0.19      0.72      0.52        52

avg / total       0.95      0.74      0.71      0.81      0.72      0.53      1228



## Combination (Over and Under) Sampling

### SMOTEENN

In [42]:
# Apply the combined SMOTEENN resampler to the training split
from imblearn.combine import SMOTEENN

# Fixed seed so the resampling is repeatable
smoteenn = SMOTEENN(random_state=1)
# Resample the training features together with their labels
X_resampled, y_resampled = smoteenn.fit_resample(X=X_train, y=y_train)
# Show the class counts after resampling
Counter(y_resampled)

Counter({0: 2759, 1: 3370})

In [43]:
# Refit the existing logistic regression model on the SMOTEENN-resampled data
logreg.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [44]:
# Calculate the balanced accuracy score on the test split
y_pred = logreg.predict(X=X_test)
bas = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
bas

0.763376504447933

In [45]:
# Display the confusion matrix for the SMOTEENN-trained model.
# `confusion_matrix` is already imported by the confusion-matrix cell of the
# Naive Random section, so the redundant re-import is dropped here.
cm = confusion_matrix(y_test, y_pred)
# Label rows with the actual class and columns with the predicted class
cm_df = pd.DataFrame(
    cm,
    index=["Actual No Stroke", "Actual Stroke"],
    columns=["Predicted No Stroke", "Predicted Stroke"],
)
cm_df

Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,823,353
Actual Stroke,9,43


In [46]:
# Show the imbalanced-learn classification report for the test predictions
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.70      0.83      0.82      0.76      0.57      1176
          1       0.11      0.83      0.70      0.19      0.76      0.59        52

avg / total       0.95      0.71      0.82      0.79      0.76      0.57      1228



In [47]:
# Combined summary: title, balanced accuracy, confusion matrix and report
print(
    "SMOTEENN",
    f"Balanced Accuracy Score: {bas}",
    cm_df,
    classification_report_imbalanced(y_test, y_pred),
    sep="\n",
)

SMOTEENN
Balanced Accuracy Score: 0.763376504447933
                  Predicted No Stroke  Predicted Stroke
Actual No Stroke                  823               353
Actual Stroke                       9                43
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.70      0.83      0.82      0.76      0.57      1176
          1       0.11      0.83      0.70      0.19      0.76      0.59        52

avg / total       0.95      0.71      0.82      0.79      0.76      0.57      1228

