In [1]:
import warnings

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

In [4]:
import psycopg2 as pg
import pandas.io.sql as psql
import requests
import pymysql 
from sqlalchemy import create_engine
from config import user
from config import password


db_string = f"postgresql://postgres:{password}@datastroke.c326vl9oo2i8.us-east-1.rds.amazonaws.com:5432/stroke_db"

In [5]:
engine = create_engine(db_string)
stroke_df = pd.read_sql_query("SELECT * FROM stroke_clean",engine)

In [6]:
stroke_df.head()


Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,Work,Residence,Glucose,BMI,Smoking,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,0
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,0
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,0
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,0
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,0


In [7]:
# Create our features
X = stroke_df.drop("Stroke", axis=1)
X = pd.get_dummies(X)

# Create our target
y = stroke_df["Stroke"]

In [8]:
X.describe()

Unnamed: 0,ID,Age,Hypertension,HeartDisease,Glucose,BMI,Gender_Female,Gender_Male,EverMarried_No,EverMarried_Yes,...,Work_Never_worked,Work_Private,Work_Self-employed,Work_children,Residence_Rural,Residence_Urban,Smoking_Unknown,Smoking_formerly smoked,Smoking_never smoked,Smoking_smokes
count,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,...,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0
mean,37060.423594,42.868989,0.091891,0.049511,105.297402,28.89456,0.590261,0.409739,0.347188,0.652812,...,0.004482,0.572535,0.157905,0.136716,0.492665,0.507335,0.30216,0.170334,0.377343,0.150163
std,20995.468407,22.555878,0.288901,0.216954,44.42555,7.85432,0.491836,0.491836,0.476125,0.476125,...,0.066808,0.494761,0.364689,0.343582,0.499997,0.499997,0.459241,0.375964,0.484771,0.357268
min,77.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18602.5,25.0,0.0,0.0,77.0675,23.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37580.5,44.0,0.0,0.0,91.68,28.1,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,55181.75,60.0,0.0,0.0,113.495,33.1,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y.value_counts()

1    4699
0     209
Name: Stroke, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Oversampling

### Naive Random Oversampling

In [11]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 3524, 0: 3524})

In [12]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [13]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7246972176759411

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 42,  10],
       [421, 754]])

In [15]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.09      0.81      0.64      0.16      0.72      0.53        52
          1       0.99      0.64      0.81      0.78      0.72      0.51      1175

avg / total       0.95      0.65      0.80      0.75      0.72      0.51      1227



In [16]:
d_1 = {"y_test": y_test,"y_pred": y_pred}
df_1 = pd.DataFrame(d_1)
df_1

Unnamed: 0,y_test,y_pred
1555,1,0
287,1,1
3645,1,1
293,1,1
3308,1,1
...,...,...
924,1,1
3728,1,0
3651,1,1
3391,1,1


### SMOTE Oversampling

In [17]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 3524, 0: 3524})

In [18]:
# Train the Logistic Regression model using the resampled data
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [19]:
# Calculated the balanced accuracy score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5892553191489363

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  13,   39],
       [  84, 1091]])

In [21]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.13      0.25      0.93      0.17      0.48      0.22        52
          1       0.97      0.93      0.25      0.95      0.48      0.25      1175

avg / total       0.93      0.90      0.28      0.91      0.48      0.25      1227



In [22]:
d_2 = {"y_test": y_test,"y_pred": y_pred}
df_2 = pd.DataFrame(d_2)
df_2

Unnamed: 0,y_test,y_pred
1555,1,0
287,1,1
3645,1,1
293,1,1
3308,1,1
...,...,...
924,1,1
3728,1,1
3651,1,1
3391,1,1


# Undersampling

In [23]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 157, 1: 157})

In [24]:
# Train the Logistic Regression model using the resampled data
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [25]:
# Calculated the balanced accuracy score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.70069558101473

In [26]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 43,   9],
       [500, 675]])

In [27]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.08      0.83      0.57      0.14      0.69      0.49        52
          1       0.99      0.57      0.83      0.73      0.69      0.46      1175

avg / total       0.95      0.59      0.82      0.70      0.69      0.46      1227



In [28]:
d_3 = {"y_test": y_test,"y_pred": y_pred}
df_3 = pd.DataFrame(d_3)
df_3

Unnamed: 0,y_test,y_pred
1555,1,0
287,1,1
3645,1,1
293,1,1
3308,1,1
...,...,...
924,1,1
3728,1,0
3651,1,1
3391,1,1


# Combination (Over and Under) Sampling

In [29]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteen = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteen.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 3079, 1: 2535})

In [30]:
# Train the Logistic Regression model using the resampled data
Log_model = LogisticRegression(solver='lbfgs', random_state=1)
Log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [31]:
# Calculated the balanced accuracy score
y_pred = Log_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7110801963993454

In [32]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 42,  10],
       [453, 722]])

In [33]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.08      0.81      0.61      0.15      0.70      0.51        52
          1       0.99      0.61      0.81      0.76      0.70      0.49      1175

avg / total       0.95      0.62      0.80      0.73      0.70      0.49      1227



In [34]:
d_4 = {"y_test": y_test,"y_pred": y_pred}
df_4 = pd.DataFrame(d_4)
df_4

Unnamed: 0,y_test,y_pred
1555,1,0
287,1,1
3645,1,1
293,1,1
3308,1,1
...,...,...
924,1,1
3728,1,0
3651,1,1
3391,1,1
