In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier

In [4]:
import psycopg2 as pg
import pandas.io.sql as psql
import requests
import pymysql 
from sqlalchemy import create_engine
from config import user
from config import password


db_string = f"postgresql://postgres:{password}@datastroke.c326vl9oo2i8.us-east-1.rds.amazonaws.com:5432/stroke_db"

In [5]:
engine = create_engine(db_string)
stroke_df = pd.read_sql_query("SELECT * FROM stroke_clean",engine)
stroke_df.head()

Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,Work,Residence,Glucose,BMI,Smoking,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,0
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,0
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,0
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,0
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,0


# Split data into Training and Testing 

In [6]:
# Create our features
X = stroke_df.drop("Stroke", axis=1)
X = pd.get_dummies(X)

# Create our target
y = stroke_df['Stroke']

In [7]:
X.describe()

Unnamed: 0,ID,Age,Hypertension,HeartDisease,Glucose,BMI,Gender_Female,Gender_Male,EverMarried_No,EverMarried_Yes,...,Work_Never_worked,Work_Private,Work_Self-employed,Work_children,Residence_Rural,Residence_Urban,Smoking_Unknown,Smoking_formerly smoked,Smoking_never smoked,Smoking_smokes
count,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,...,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0
mean,37060.423594,42.868989,0.091891,0.049511,105.297402,28.89456,0.590261,0.409739,0.347188,0.652812,...,0.004482,0.572535,0.157905,0.136716,0.492665,0.507335,0.30216,0.170334,0.377343,0.150163
std,20995.468407,22.555878,0.288901,0.216954,44.42555,7.85432,0.491836,0.491836,0.476125,0.476125,...,0.066808,0.494761,0.364689,0.343582,0.499997,0.499997,0.459241,0.375964,0.484771,0.357268
min,77.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18602.5,25.0,0.0,0.0,77.0675,23.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37580.5,44.0,0.0,0.0,91.68,28.1,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,55181.75,60.0,0.0,0.0,113.495,33.1,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Check the balance of our target values
y.value_counts()

1    4699
0     209
Name: Stroke, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Ensemble Learners

In [10]:
# Resample the training data with the BalancedRandomForestClassifier
Balance_random_forest = BalancedRandomForestClassifier(n_estimators = 100)
Balance_random_forest.fit(X_train, y_train)

BalancedRandomForestClassifier()

In [11]:
# Calculated the balanced accuracy score
y_pred = Balance_random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7440998363338789

In [12]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 43,   9],
       [398, 777]])

In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.10      0.83      0.66      0.17      0.74      0.56        52
          1       0.99      0.66      0.83      0.79      0.74      0.54      1175

avg / total       0.95      0.67      0.82      0.77      0.74      0.54      1227



In [14]:
# List the features sorted in descending order by feature importance
featureNames = X.columns
sorted(zip(Balance_random_forest.feature_importances_, X.columns), reverse=True)

[(0.30464567760637856, 'Age'),
 (0.17362019289794026, 'Glucose'),
 (0.1255452301766357, 'BMI'),
 (0.1126636535881494, 'ID'),
 (0.031155420778673126, 'Hypertension'),
 (0.023668388224207998, 'HeartDisease'),
 (0.02116598280934506, 'EverMarried_Yes'),
 (0.020666844070829454, 'EverMarried_No'),
 (0.01989575338394425, 'Work_Self-employed'),
 (0.019016188402209065, 'Smoking_never smoked'),
 (0.01879007264036255, 'Residence_Rural'),
 (0.0179654202152965, 'Gender_Male'),
 (0.017220804873498665, 'Smoking_formerly smoked'),
 (0.0155746309062283, 'Residence_Urban'),
 (0.015574211977378785, 'Work_Private'),
 (0.013785458961536927, 'Gender_Female'),
 (0.01333089579845502, 'Work_Govt_job'),
 (0.013316005554175727, 'Smoking_Unknown'),
 (0.012348304670880312, 'Smoking_smokes'),
 (0.009771243946154874, 'Work_children'),
 (0.0002796185177195013, 'Work_Never_worked')]

In [26]:
prediction_results = {"y_test": y_test,"y_pred": y_pred}
prediction_results = pd.DataFrame(prediction_results)
prediction_results

Unnamed: 0_level_0,y_test,y_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1555,1,1
287,1,1
3645,1,1
293,1,0
3308,1,1
...,...,...
924,1,1
3728,1,0
3651,1,1
3391,1,1


In [27]:
prediction_results.index.name = 'id'
prediction_results

Unnamed: 0_level_0,y_test,y_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1555,1,1
287,1,1
3645,1,1
293,1,0
3308,1,1
...,...,...
924,1,1
3728,1,0
3651,1,1
3391,1,1


In [30]:
new_path = "Resources/prediction_results.csv"
prediction_results.to_csv(new_path, index=True)

In [28]:
prediction_results.dtypes

y_test    int64
y_pred    int64
dtype: object

In [29]:
prediction_results.to_sql(name='prediction_results', con=engine, if_exists='replace')

# Easy Ensemble AdaBoost Classifier

In [19]:
# Train the EasyEnsembleClassifier
easy_ensemble = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easy_ensemble.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [20]:
# Calculated the balanced accuracy score
y_pred = easy_ensemble.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7371194762684125

In [21]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 44,   8],
       [437, 738]])

In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.09      0.85      0.63      0.17      0.73      0.54        52
          1       0.99      0.63      0.85      0.77      0.73      0.52      1175

avg / total       0.95      0.64      0.84      0.74      0.73      0.52      1227



In [23]:
d_2 = {"y_test": y_test,"y_pred": y_pred}
df_2 = pd.DataFrame(d_2)
df_2

Unnamed: 0_level_0,y_test,y_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1555,1,1
287,1,1
3645,1,1
293,1,0
3308,1,1
...,...,...
924,1,1
3728,1,0
3651,1,1
3391,1,1
