# Predicting Heart Failure
## Machine Learning with Logistic Regression

### Import Modules

In [29]:
import pandas as pd
import numpy as np
import plotly.express as px
%matplotlib widget
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from pandas.api.types import CategoricalDtype
import warnings
import seaborn as sn
plt.style.use('dark_background')
sn.set_style('darkgrid')
#warnings.filterwarnings("ignore")

### Load Data and Perform EDA

In [30]:
raw_df = pd.read_csv('data/heart_disease_health_indicators_BRFSS2015.csv')


In [31]:
raw_df.head(15)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
5,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,1.0,10.0,6.0,8.0
6,0.0,1.0,0.0,1.0,30.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,14.0,0.0,0.0,9.0,6.0,7.0
7,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,1.0,0.0,11.0,4.0,4.0
8,1.0,1.0,1.0,1.0,30.0,1.0,0.0,2.0,0.0,1.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
9,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0


In [32]:
raw_df.shape

(253680, 22)

In [33]:
raw_df.isna().sum()

HeartDiseaseorAttack    0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
Diabetes                0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [34]:
heartattack_counts = raw_df['HeartDiseaseorAttack'].value_counts()
rename_labels = {
    0: 'No Heart Disease',
    1: 'Heart Disease'}
fig = px.pie(values=heartattack_counts,
             names=heartattack_counts.index.map(rename_labels),
             title='Distribution of Heart Diagnosis',
             labels=rename_labels,
             hover_name=heartattack_counts.index.map(rename_labels))
fig.update_traces(textinfo='percent+label', text=heartattack_counts.index.map(rename_labels))
fig.show()

In [35]:
categories = ['HighBP', 'Smoker', 'Stroke', 'Diabetes', 'HvyAlcoholConsump']

rename_labels = {
    0: 'No',
    1: 'Yes'
}

for category in categories:
    category_counts = raw_df[category].value_counts()

    fig = px.pie(values=category_counts,
                 names=category_counts.index.map(rename_labels),
                 title=f'Distribution of {category}',
                 labels=rename_labels,
                 hover_name=category_counts.index.map(rename_labels))

    fig.update_traces(textinfo='percent+label', text=category_counts.index.map(rename_labels))
    fig.show()

In [36]:
raw_df.columns

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

### Creating Training, Testing, and Validation Sets

In [37]:
train_val_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)
print('train_df.shape :', train_df.shape)
print('val_df.shape :', val_df.shape)
print('test_df.shape :', test_df.shape)

train_df.shape : (152208, 22)
val_df.shape : (50736, 22)
test_df.shape : (50736, 22)


In [38]:
train_df

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
132776,0.0,0.0,0.0,1.0,25.0,1.0,0.0,2.0,1.0,1.0,...,1.0,0.0,2.0,0.0,1.0,0.0,0.0,8.0,5.0,8.0
60629,0.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,7.0
163859,0.0,0.0,0.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,6.0,8.0
179387,0.0,0.0,0.0,1.0,31.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,6.0,8.0
6258,0.0,0.0,0.0,1.0,21.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,4.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153576,0.0,0.0,1.0,1.0,24.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,0.0,4.0,6.0,8.0
187540,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,4.0,6.0
158320,0.0,0.0,0.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0
185003,0.0,1.0,0.0,1.0,32.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,1.0,1.0,9.0,6.0,8.0


In [39]:
val_df

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
177961,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,1.0,10.0,4.0,5.0
105626,1.0,1.0,1.0,1.0,27.0,1.0,1.0,2.0,0.0,1.0,...,1.0,0.0,4.0,8.0,20.0,1.0,0.0,10.0,4.0,2.0
136759,0.0,0.0,1.0,1.0,47.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,3.0,0.0,1.0,1.0,5.0,6.0,5.0
181637,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0
245214,0.0,0.0,0.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,1.0,1.0,8.0,6.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250516,0.0,1.0,0.0,1.0,29.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,3.0,0.0,1.0,8.0,6.0,7.0
161301,0.0,1.0,1.0,1.0,22.0,0.0,0.0,2.0,1.0,1.0,...,1.0,0.0,2.0,2.0,2.0,0.0,0.0,9.0,5.0,1.0
31718,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,1.0,0.0,0.0,8.0,6.0,7.0
152320,0.0,0.0,1.0,1.0,26.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,2.0,0.0,0.0,0.0,9.0,5.0,7.0


In [40]:
test_df

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
219620,0.0,0.0,0.0,1.0,21.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,3.0,7.0,0.0,0.0,7.0,4.0,2.0
132821,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,13.0,6.0,6.0
151862,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,4.0,7.0
139717,0.0,0.0,0.0,1.0,27.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,3.0,0.0,0.0,1.0,2.0,4.0,7.0
239235,0.0,0.0,1.0,1.0,31.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,4.0,27.0,27.0,1.0,0.0,8.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169513,0.0,1.0,0.0,1.0,29.0,1.0,0.0,2.0,1.0,1.0,...,1.0,0.0,3.0,0.0,10.0,0.0,0.0,9.0,6.0,7.0
182415,0.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,1.0,10.0,0.0,0.0,10.0,5.0,8.0
109739,0.0,0.0,1.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,3.0,0.0,0.0,1.0,6.0,6.0,8.0
181671,0.0,0.0,0.0,1.0,24.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,4.0,0.0,0.0,0.0,1.0,13.0,4.0,5.0


### Identifying Features and Targets

In [56]:
input_cols = list(train_df.columns)[1:]
target_col = 'HeartDiseaseorAttack'
print(input_cols)
target_col

['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']


'HeartDiseaseorAttack'

### Copying Inputs and Targets for Further Processing

In [60]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()
train_inputs

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
132776,0.0,0.0,1.0,25.0,1.0,0.0,2.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,1.0,0.0,0.0,8.0,5.0,8.0
60629,0.0,0.0,1.0,25.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,7.0
163859,0.0,0.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,6.0,8.0
179387,0.0,0.0,1.0,31.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,6.0,8.0
6258,0.0,0.0,1.0,21.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,4.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153576,0.0,1.0,1.0,24.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,0.0,4.0,6.0,8.0
187540,0.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,4.0,6.0
158320,0.0,0.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0
185003,1.0,0.0,1.0,32.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,1.0,1.0,9.0,6.0,8.0


In [64]:
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()
train_inputs[numeric_cols].describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,...,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0
mean,0.428295,0.424557,0.963235,28.379566,0.442802,0.040786,0.298171,0.756813,0.634868,0.810628,...,0.95106,0.084187,2.511471,3.192598,4.238266,0.167422,0.440555,8.030688,5.050891,6.058847
std,0.494833,0.494277,0.188186,6.61446,0.496719,0.197795,0.699485,0.429008,0.481469,0.391805,...,0.215743,0.27767,1.067363,7.42191,8.703617,0.373354,0.496455,3.05262,0.987088,2.069661
min,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,98.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


### Checking if Feature Scaling is Necessary

In [65]:
raw_df[numeric_cols].describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.296921,0.756544,0.634256,0.81142,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.69816,0.429169,0.481639,0.391175,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,98.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


Scaling numeric features to smaller ranges

In [44]:
scaler = MinMaxScaler()

In [66]:
scaler.fit(raw_df[numeric_cols])

Inspecting min and max values for each column

In [46]:
print('Minimum:')
list(scaler.data_min_)

Minimum:


[0.0,
 0.0,
 0.0,
 0.0,
 12.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0]

In [47]:
print('Maximum:')
list(scaler.data_max_)

Maximum:


[1.0,
 1.0,
 1.0,
 1.0,
 98.0,
 1.0,
 1.0,
 2.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 5.0,
 30.0,
 30.0,
 1.0,
 1.0,
 13.0,
 6.0,
 8.0]

### Scaling the Train, Test and Val sets with scaler.transform

In [58]:
train_inputs.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'Diabetes', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
       'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth',
       'DiffWalk', 'Sex', 'Age', 'Education'],
      dtype='object')

In [68]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])
train_inputs[numeric_cols].describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,...,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0,152208.0
mean,0.428295,0.424557,0.963235,-0.13732,0.442802,0.040786,0.074543,0.756813,0.634868,0.810628,...,0.95106,0.084187,-0.155533,0.003547,0.004709,0.167422,0.440555,-0.03450911,-0.03796436,-0.039615
std,0.494833,0.494277,0.188186,0.000894,0.496719,0.197795,0.174871,0.429008,0.481469,0.391805,...,0.215743,0.27767,0.06671,0.008247,0.009671,0.373354,0.496455,0.02119875,0.03948352,0.042238
min,0.0,0.0,0.0,-0.139535,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.25,0.0,0.0,0.0,0.0,-0.08333333,-0.2,-0.142857
25%,0.0,0.0,1.0,-0.137912,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,-0.1875,0.0,0.0,0.0,0.0,-0.04861111,-0.08,-0.061224
50%,0.0,0.0,1.0,-0.137507,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,-0.1875,0.0,0.0,0.0,0.0,-0.03472222,-0.04,-0.020408
75%,1.0,1.0,1.0,-0.136966,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,-0.125,0.002222,0.003333,0.0,1.0,-0.02083333,5.5511150000000004e-17,0.0
max,1.0,1.0,1.0,-0.127907,1.0,1.0,0.5,1.0,1.0,1.0,...,1.0,1.0,0.0,0.033333,0.033333,1.0,1.0,-1.387779e-17,5.5511150000000004e-17,0.0
