# Heart Disease Patients Classification with a Web App
###### METIS Data Science and Machine Learning Bootcamp 2022 by Krystian Krystkowiak
###### project/month(7/7) focus: ENGINEERING

ABSTRACT

- Developed a pipeline for data processing and deployment, using a SQL database and deploying the resulting model through Github and Streamlit to create a web application for exploring data and predicting heart disease risk.

### moving data: csv -> SQL -> pandas -> pickles

In [23]:
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float

import imblearn.under_sampling
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, f1_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import pickle

In [3]:
# read the Kaggle csv export into a dataframe
csv_path = 'heart_2020.csv'
df = pd.read_csv(csv_path)

In [4]:
# initiate the SQL database engine (SQLite file 'heart.db', statement echo off)
db_url = 'sqlite:///heart.db'
engine = create_engine(db_url, echo=False)

In [5]:
# declare the schema of the table in the database
#
# The columns mirror what df.to_sql() stores: the pandas row index (written as
# an 'index' column, which is the natural unique key) followed by the csv
# columns. HeartDisease is the 'Yes'/'No' text label, so it is a String and
# must not be a primary key (its values repeat on every row).
#
# NOTE: this is only a metadata declaration. In the cells shown here nothing
# calls meta.create_all(engine), so the table on disk is the one created by
# df.to_sql().

meta = MetaData()

my_table = Table(
    'my_table', meta,
    Column('index', Integer, primary_key=True),
    Column('HeartDisease', String),
    Column('BMI', Float),
    Column('Smoking', String),
    Column('AlcoholDrinking', String),
    Column('Stroke', String),
    Column('PhysicalHealth', Float),
    Column('MentalHealth', Float),
    Column('DiffWalking', String),
    Column('Sex', String),
    Column('AgeCategory', String),
    Column('Race', String),
    Column('Diabetic', String),
    Column('PhysicalActivity', String),
    Column('GenHealth', String),
    Column('SleepTime', Float),
    Column('Asthma', String),
    Column('KidneyDisease', String),
    Column('SkinCancer', String),
)

In [6]:
# create the table from the dataframe and preview its first rows
#
# if_exists='replace' keeps this cell idempotent: re-running it rebuilds the
# table instead of appending a second copy of every row.
df.to_sql('my_table', con=engine, if_exists='replace')

# Engine.execute() was removed in SQLAlchemy 2.0, so the query runs on an
# explicit connection that is closed again when the with-block exits.
with engine.connect() as conn:
    preview_rows = conn.exec_driver_sql("SELECT * FROM my_table LIMIT 5").fetchall()

preview_rows

[(0, 'No', 16.6, 'Yes', 'No', 'No', 3.0, 30.0, 'No', 'Female', '55-59', 'White', 'Yes', 'Yes', 'Very good', 5.0, 'Yes', 'No', 'Yes'),
 (1, 'No', 20.34, 'No', 'No', 'Yes', 0.0, 0.0, 'No', 'Female', '80 or older', 'White', 'No', 'Yes', 'Very good', 7.0, 'No', 'No', 'No'),
 (2, 'No', 26.58, 'Yes', 'No', 'No', 20.0, 30.0, 'No', 'Male', '65-69', 'White', 'Yes', 'Yes', 'Fair', 8.0, 'Yes', 'No', 'No'),
 (3, 'No', 24.21, 'No', 'No', 'No', 0.0, 0.0, 'No', 'Female', '75-79', 'White', 'No', 'No', 'Good', 6.0, 'No', 'No', 'Yes'),
 (4, 'No', 23.71, 'No', 'No', 'No', 28.0, 0.0, 'Yes', 'Female', '40-44', 'White', 'No', 'Yes', 'Very good', 8.0, 'No', 'No', 'No')]

In [7]:
#uncomment the line below to drop the table
#engine.execute("DROP TABLE my_table")

In [8]:
# load the whole table back into pandas, using the stored 'index' column as the index
select_all = "SELECT * FROM my_table"
df_raw = pd.read_sql(select_all, con=engine, index_col='index')

df_raw.head()

Unnamed: 0_level_0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [9]:
# export the raw table as a gzip-compressed csv
# (the file keeps the plain 'out.csv' name, so readers have to pass
#  compression='gzip' explicitly; it cannot be inferred from the extension)
# earlier sampling experiment, kept for reference: df_raw.sample(frac = 0.7)
df_raw.to_csv(path_or_buf='out.csv', compression='gzip')

### data preparation, modelling and pickling

This notebook uses the Naive Bayes model from the previous project (Heart Disease Patients Classification), which reached a test-set recall of 0.90 and an accuracy of 0.81. The small size of the model was also taken into consideration.

In [11]:
#data preparation, written as a single chain on top of df_raw:
#  1. one-hot encode the multi-valued categorical columns
#  2. map the remaining Yes/No and Female/Male text values to 1/0
#  3. drop the two columns that are not used as features
categorical_cols = ['AgeCategory','Race','Diabetic', 'GenHealth']
binary_codes = {'Yes':1, 'No':0, 'Female':1,'Male':0 }
unused_cols = ['PhysicalHealth', 'MentalHealth']

df = (
    pd.get_dummies(df_raw, columns=categorical_cols)
    .replace(binary_codes)
    .drop(columns=unused_cols)
)

In [12]:
# Target: the HeartDisease label column
y = df['HeartDisease']

# Features: every other column
X = df.drop(columns='HeartDisease')

# Two successive shuffled 85/15 splits with the same seed:
# first hold out the test set, then carve the validation set out of the rest
split_kwargs = dict(shuffle=True, test_size=.15, random_state=42)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, **split_kwargs)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, **split_kwargs)

Shape of training feature: (231051, 39)
Shape of validation feature: (40774, 39)
Shape of testing feature: (47970, 39)
Shape of training label: (231051,)
Shape of validation label: (40774,)
Shape of testing label: (47970,)


In [13]:
# The classes are imbalanced and there is plenty of data, so the majority class
# is randomly undersampled (training split only). A float sampling_strategy is
# the minority/majority ratio requested after resampling.
RUS = imblearn.under_sampling.RandomUnderSampler(
    sampling_strategy=0.5,
    random_state=42,
)
X_tr_us, y_tr_us = RUS.fit_resample(X_train, y_train)

In [19]:
#scaling
# Fit the scaler ONCE, on the training split, and only transform every other
# set with those statistics. Calling fit_transform on each set would standardise
# every set with its own mean/std, so the model would be evaluated (and served)
# on features scaled differently from the ones it was trained on.
std_scale = StandardScaler()

X_train_scaled = std_scale.fit_transform(X_train)
X_val_scaled = std_scale.transform(X_val)
X_test_scaled = std_scale.transform(X_test)
X_tr_us_scaled = std_scale.transform(X_tr_us)

In [33]:
from sklearn.naive_bayes import BernoulliNB

# train Bernoulli Naive Bayes on the scaled, undersampled training data
# (fit() returns the estimator itself, so it can be assigned in one step)
nb = BernoulliNB().fit(X_tr_us_scaled, y_tr_us)
nb

BernoulliNB()

In [34]:
#pickling the model for use in the app
# the with-block guarantees the file is flushed and closed once the dump is done
with open('model.pkl', 'wb') as model_file:
    pickle.dump(nb, model_file)

In [26]:
# Stale cell from an earlier random-forest experiment, intentionally disabled:
# `randomforest` is not defined anywhere in this notebook, so the dump raised
# NameError on a fresh kernel, and if it did run it would overwrite the Naive
# Bayes model stored in 'model.pkl' that the app is meant to use.
# pickle.dump(randomforest, open('model.pkl', 'wb'))

In [27]:
#pickling scaler for data transformation inside the app
# the scaler is (re)fit on the training split, so the pickled object carries
# the mean/std of X_train
std_scale.fit(X_train)

# the with-block guarantees the file is flushed and closed once the dump is done
with open('scaled.pkl', 'wb') as scaler_file:
    pickle.dump(std_scale, scaler_file)