## 2. Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Opt in to pandas' future "no silent downcasting" behaviour up front, so the
# fillna-based imputation below does not emit the related FutureWarning
pd.set_option('future.no_silent_downcasting', True)

## 3. Data Loading and Inspection

### Loading

In [None]:
# Location of the raw Framingham dataset, relative to the notebook
raw_csv_path = './data/raw/framingham.csv'

# Read it into a DataFrame and preview the first five rows
health = pd.read_csv(raw_csv_path)
health.head(5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


## 4. Data Cleaning

### Handling Missing Data

In [None]:
# Share of missing entries per column, expressed as a percentage
missing_pct = health.isna().mean() * 100

# Keep only the columns that actually contain gaps
missing_values = missing_pct.loc[missing_pct > 0]
missing_values

education     2.476415
cigsPerDay    0.683962
BPMeds        1.250000
totChol       1.179245
BMI           0.448113
heartRate     0.023585
glucose       9.150943
dtype: float64

Quick test: how much data would we lose if we simply dropped every row that has a missing value?

In [None]:
# Shape before anything is dropped
print('Current shape of dataset:', health.shape)

# Work on a copy so the original frame keeps its missing values
health_copy = health.copy().dropna()
print('New shape:', health_copy.shape)

# Percentage of rows that a blanket dropna would discard
rows_before = health.shape[0]
rows_after = health_copy.shape[0]
pct_lost = round(((rows_before - rows_after) / rows_before) * 100, 2)
print(f'We are losing {pct_lost}% of the original data')

Current shape of dataset: (4240, 16)
New shape: (3658, 16)
We are losing 13.73% of the original data


Dropping rows is not an option, then: we would lose almost 14% of the dataset. We will impute the missing values instead.

### Imputation

Numerical columns:

We will use median imputation, because the median is less sensitive to extreme values than the mean, which outliers can skew.

In [None]:
# Numerical columns whose gaps are filled with the column median
median_columns = ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']

# Compute every median once, then fill all of the columns in a single pass;
# fillna with a Series matches each value to the column of the same name
column_medians = health[median_columns].median()
health[median_columns] = health[median_columns].fillna(column_medians)

Categorical and Binary columns:

We will use mode imputation, since the mode is the most common category in each column.

In [None]:
# Categorical / binary columns whose gaps are filled with the mode
mode_columns = ['education', 'BPMeds']

# Fill each column's missing entries with its most frequent value
# (mode() can return several values; the first one is used)
for col_name in mode_columns:
    most_frequent = health[col_name].mode()[0]
    health[col_name] = health[col_name].fillna(most_frequent)

In [None]:
# Sanity check: count the remaining missing values per column after imputation
health.isna().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the target
X = health.drop('TenYearCHD', axis=1)
y = health['TenYearCHD']

# Split data into train and test sets.
# Stratify on the target so both splits keep the same proportion of CHD cases
# (positives look like the minority class in this dataset — TODO confirm with
# y.value_counts()).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Logistic Regression
# The features live on very different scales (binary flags next to cholesterol
# and blood-pressure readings), which makes the default solver hit its
# iteration cap before converging. Standardise inside a pipeline, so the same
# scaling is applied automatically at predict time, and raise max_iter.
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
print("Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_logreg))
# Accuracy alone hides how the positive class is handled; show per-class metrics
print(classification_report(y_test, y_pred_logreg, zero_division=0))

# Gradient Boosting
# Fix the seed so a Restart & Run All reproduces the same model
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
print("Gradient Boosting Accuracy: ", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb, zero_division=0))

In [None]:
from sklearn.metrics import roc_auc_score

# Model evaluation using ROC-AUC
# ROC-AUC measures how well the model *ranks* examples, so it needs a
# continuous score. Use the predicted probability of the positive class
# (column 1 of predict_proba) rather than the hard 0/1 labels from predict(),
# which would collapse the ROC curve to a single operating point.
logreg_roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])
gb_roc_auc = roc_auc_score(y_test, gb.predict_proba(X_test)[:, 1])

print(f"Logistic Regression AUC-ROC: {logreg_roc_auc}")
print(f"Gradient Boosting AUC-ROC: {gb_roc_auc}")

In [None]:
import streamlit as st

st.title('CHD Risk Prediction')

# User input
age = st.number_input('Age', min_value=20, max_value=80)
totChol = st.number_input('Total Cholesterol')
sysBP = st.number_input('Systolic BP')
diaBP = st.number_input('Diastolic BP')
BMI = st.number_input('BMI')

# Predict button
if st.button('Predict'):
    # gb was fitted on every column of X_train, so a bare 5-value row is
    # rejected with a feature-count error. Build a full one-row frame instead:
    # start from the training medians for the features the form does not ask
    # for, then overwrite the five values the user supplied. Keeping the
    # training column names and order also avoids sklearn's feature-name warning.
    # NOTE(review): median defaults are a stop-gap — either collect the
    # remaining inputs in the form or train a model on these five features only.
    form_values = {
        'age': age,
        'totChol': totChol,
        'sysBP': sysBP,
        'diaBP': diaBP,
        'BMI': BMI,
    }
    user_data = X_train.median().to_frame().T.assign(**form_values)

    # predict returns an array; take the single scalar label
    prediction = gb.predict(user_data)[0]
    if prediction == 1:
        st.write("High risk of CHD in 10 years")
    else:
        st.write("Low risk of CHD in 10 years")

In [None]:
from flask import Flask, request, jsonify
import pickle

app = Flask(__name__)

# Load model
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load a model file you produced yourself. Nothing visible in this
# notebook writes 'chd_model.pkl'; confirm where it is created.
with open('chd_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/predict', methods=['POST'])
def predict():
    """Return the model's prediction for the feature values in the JSON body.

    The body is a JSON object mapping feature names to values. When the loaded
    model exposes ``feature_names_in_`` and every one of those names is present
    in the body, the values are ordered by those names; otherwise the values
    are used in the order they appear in the body (the original behaviour).

    Returns:
        A JSON response ``{"prediction": <label>}``, or a 400 response with an
        ``error`` message when the body is not a non-empty JSON object.
    """
    data = request.get_json(force=True)
    if not isinstance(data, dict) or not data:
        return jsonify(error='Expected a non-empty JSON object of feature values'), 400

    # Prefer the model's own feature order over the client's key order, which
    # is easy to get wrong and would silently produce a wrong prediction
    expected_names = getattr(model, 'feature_names_in_', None)
    if expected_names is not None and all(str(name) in data for name in expected_names):
        row = [data[str(name)] for name in expected_names]
    else:
        row = list(data.values())

    prediction = model.predict([row])

    # numpy scalars (e.g. int64) are not JSON serialisable; convert to a
    # native Python value before handing it to jsonify
    label = prediction[0]
    if hasattr(label, 'item'):
        label = label.item()
    return jsonify(prediction=label)

if __name__ == '__main__':
    # debug=True exposes the interactive Werkzeug debugger (arbitrary code
    # execution for anyone who can reach the server) and starts the reloader,
    # so keep it off by default
    app.run(debug=False)