In [239]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import RobustScaler

from sklearn.preprocessing import OneHotEncoder 

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'

pd.options.display.max_columns = None 

In [240]:
# Ten hand-written example patients used as new input for the saved preprocessing
# objects and models loaded further down.
# NOTE: the 'age_category' and 'BMI_category' labels typed here are recomputed from
# 'age' and 'BMI' with pd.cut in a later cell, which overwrites these values.
patients = {'education': [1.0, 3.0, 2.0, 4.0, 2.0, 4.0, 3.0, 1.0, 1.0, 3.0],
        'sex': ['F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'M'],
        'age_category': ['Senior Citizens', 'Middle-Aged Adults', 'Young Adults', 'Senior Citizens', 'Middle-Aged Adults', 'Senior Citizens', 'Middle-Aged Adults', 'Young Adults', 'Middle-Aged Adults', 'Senior Citizens'],
        'BMI_category': ['Obesity', 'Underweight', 'Normal Weight', 'Overweight', 'Normal Weight', 'Obesity', 'Overweight', 'Normal Weight', 'Obesity', 'Normal Weight'],
        'is_smoking': ['YES', 'YES', 'NO', 'NO', 'NO', 'YES', 'NO', 'YES', 'NO', 'NO'],
        'BPMeds': [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
        'prevalentStroke': [1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        'prevalentHyp': [1, 0, 1, 0, 0, 1, 1, 0, 1, 0],
        'diabetes': [1, 1, 0, 0, 0, 1, 0, 1, 0, 0],
        'age': [63, 58, 32, 70, 39, 69, 55, 34, 52, 65],
        'cigsPerDay': [40.0, 27.0, 0.0, 0.0, 0.0, 25.0, 0.0, 15.0, 0.0, 0.0],
        'totChol': [282.0, 245.0, 130.0, 199.0, 160.0, 220.0, 178.0, 110.0, 210.0, 170.0],
        'sysBP': [158.0, 124.5, 140.0, 127.0, 110.0, 149.0, 152.0, 119.0, 131.0, 116.0], 
        'BMI': [41.0, 17.5, 22.5, 28.0, 24.0, 34.0, 29.0, 23.0, 31.0, 24.5],
        'heartRate': [105, 99, 79, 92, 76, 94, 110, 75, 83, 77],
        'glucose': [192.0, 232.0, 110, 92, 82, 156.0, 94.0, 130.0, 112.0, 89.0]}

In [241]:
# Turn the column-oriented dict into a DataFrame (one row per patient) and show it.
patients_df = pd.DataFrame(data=patients)
patients_df

Unnamed: 0,education,sex,age_category,BMI_category,is_smoking,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose
0,1.0,F,Senior Citizens,Obesity,YES,1.0,1,1,1,63,40.0,282.0,158.0,41.0,105,192.0
1,3.0,M,Middle-Aged Adults,Underweight,YES,0.0,0,0,1,58,27.0,245.0,124.5,17.5,99,232.0
2,2.0,M,Young Adults,Normal Weight,NO,1.0,0,1,0,32,0.0,130.0,140.0,22.5,79,110.0
3,4.0,F,Senior Citizens,Overweight,NO,0.0,0,0,0,70,0.0,199.0,127.0,28.0,92,92.0
4,2.0,M,Middle-Aged Adults,Normal Weight,NO,0.0,0,0,0,39,0.0,160.0,110.0,24.0,76,82.0
5,4.0,F,Senior Citizens,Obesity,YES,1.0,0,1,1,69,25.0,220.0,149.0,34.0,94,156.0
6,3.0,M,Middle-Aged Adults,Overweight,NO,0.0,1,1,0,55,0.0,178.0,152.0,29.0,110,94.0
7,1.0,F,Young Adults,Normal Weight,YES,0.0,0,0,1,34,15.0,110.0,119.0,23.0,75,130.0
8,1.0,F,Middle-Aged Adults,Obesity,NO,1.0,0,1,0,52,0.0,210.0,131.0,31.0,83,112.0
9,3.0,M,Senior Citizens,Normal Weight,NO,0.0,0,0,0,65,0.0,170.0,116.0,24.5,77,89.0


In [242]:
# Columns that hold discrete labels/flags rather than measurements. They are cast to the
# pandas 'category' dtype in a single call; the later select_dtypes(include=np.number)
# split relies on these columns no longer being numeric.
categorical_list = ['education', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'sex', 'is_smoking']

patients_df = patients_df.astype({name: 'category' for name in categorical_list})

In [243]:
# Derive the category bands from the raw 'BMI' and 'age' values, replacing the
# hand-entered labels of the same name.
# pd.cut uses right-closed intervals by default: BMI == 18.5 is labelled 'Underweight',
# a BMI in (24.9, 29.9] is 'Overweight', and a value of exactly 0 falls outside the first
# bin and becomes NaN.
# NOTE(review): these bin edges presumably mirror the preprocessing used when the models
# were trained — confirm before changing them.
patients_df ['BMI_category'] = pd.cut(patients_df['BMI'], bins=[0, 18.5, 24.9, 29.9, float('inf')], labels=['Underweight', 'Normal Weight', 'Overweight', 'Obesity'])
patients_df ['age_category'] = pd.cut(patients_df['age'], bins=[0, 17, 35, 59, float('inf')], labels=['Minors', 'Young Adults', 'Middle-Aged Adults', 'Senior Citizens'])

In [244]:
# Target column order: label/flag columns first, then the numeric measurements.
# NOTE(review): presumably the order the training pipeline used — confirm.
new_order_columns = ['education', 'sex', 'age_category', 'BMI_category', 'is_smoking', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes',
                    'age', 'cigsPerDay', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']

In [245]:
# Reorder the columns; a missing name in new_order_columns raises KeyError here.
patients_df  = patients_df [new_order_columns]

In [246]:
# Split the frame by dtype: the numeric columns are what gets passed to the scaler.
# patients_df_cat is not referenced again in the cells shown in this notebook.
patients_df_num = patients_df.select_dtypes(include = np.number)
patients_df_cat = patients_df.select_dtypes(include = ['category'])

##### SCALING

In [247]:
# Load the pickled scaler object (presumably the RobustScaler fitted during training — confirm).
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# artifacts that come from a trusted source.
with open('../data/predict_objects/robust_scaler.pkl', mode = "rb") as robust_scaler:
    scaler = pickle.load(robust_scaler)

In [248]:
# Scale the numeric columns with the loaded scaler and wrap the result in a DataFrame.
# Both the column labels and the ROW INDEX of the source frame are re-attached: the scaled
# values are later written back into patients_df by index-aligned assignment, which would
# produce NaN / shifted rows if this frame silently fell back to a fresh 0..n-1 index while
# patients_df carried a different one (e.g. after filtering rows).
num_scaler = pd.DataFrame(
    scaler.transform(patients_df_num),
    columns = patients_df_num.columns,
    index = patients_df_num.index,
)
num_scaler

Unnamed: 0,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose
0,1.0,2.0,0.827586,1.092593,3.144439,2.0,8.142857
1,0.642857,1.35,0.189655,-0.148148,-1.586311,1.6,11.0
2,-1.214286,0.0,-1.793103,0.425926,-0.579768,0.266667,2.285714
3,1.5,0.0,-0.603448,-0.055556,0.527428,1.133333,1.0
4,-0.714286,0.0,-1.275862,-0.685185,-0.277806,0.066667,0.285714
5,1.428571,1.25,-0.241379,0.759259,1.735279,1.266667,5.571429
6,0.428571,0.0,-0.965517,0.87037,0.728737,2.333333,1.142857
7,-1.071429,0.75,-2.137931,-0.351852,-0.479114,0.0,3.714286
8,0.214286,0.0,-0.413793,0.092593,1.131354,0.533333,2.428571
9,1.142857,0.0,-1.103448,-0.462963,-0.177151,0.133333,0.785714


In [249]:
# Overwrite the raw numeric columns with their scaled values (assignment aligns on the row index).
patients_df[num_scaler.columns] = num_scaler

##### ENCODING

CATEGORIES WITHOUT A SPECIFIC ORDER

In [250]:
# Load the pickled one-hot encoder object.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# artifacts that come from a trusted source.
with open('../data/predict_objects/onehotencoder.pkl', mode = "rb") as onehotencoder:
    oh_encoding = pickle.load(onehotencoder)

In [251]:
patients_df

Unnamed: 0,education,sex,age_category,BMI_category,is_smoking,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose
0,1.0,F,Senior Citizens,Obesity,YES,1.0,1,1,1,1.0,2.0,0.827586,1.092593,3.144439,2.0,8.142857
1,3.0,M,Middle-Aged Adults,Underweight,YES,0.0,0,0,1,0.642857,1.35,0.189655,-0.148148,-1.586311,1.6,11.0
2,2.0,M,Young Adults,Normal Weight,NO,1.0,0,1,0,-1.214286,0.0,-1.793103,0.425926,-0.579768,0.266667,2.285714
3,4.0,F,Senior Citizens,Overweight,NO,0.0,0,0,0,1.5,0.0,-0.603448,-0.055556,0.527428,1.133333,1.0
4,2.0,M,Middle-Aged Adults,Normal Weight,NO,0.0,0,0,0,-0.714286,0.0,-1.275862,-0.685185,-0.277806,0.066667,0.285714
5,4.0,F,Senior Citizens,Obesity,YES,1.0,0,1,1,1.428571,1.25,-0.241379,0.759259,1.735279,1.266667,5.571429
6,3.0,M,Middle-Aged Adults,Overweight,NO,0.0,1,1,0,0.428571,0.0,-0.965517,0.87037,0.728737,2.333333,1.142857
7,1.0,F,Young Adults,Normal Weight,YES,0.0,0,0,1,-1.071429,0.75,-2.137931,-0.351852,-0.479114,0.0,3.714286
8,1.0,F,Middle-Aged Adults,Obesity,NO,1.0,0,1,0,0.214286,0.0,-0.413793,0.092593,1.131354,0.533333,2.428571
9,3.0,M,Senior Citizens,Normal Weight,NO,0.0,0,0,0,1.142857,0.0,-1.103448,-0.462963,-0.177151,0.133333,0.785714


In [252]:
# Nominal (unordered) features to be one-hot encoded.
# NOTE(review): presumably the same columns, in the same order, the pickled encoder was fitted on — confirm.
columns_without_order = ['education', 'sex', 'is_smoking']

In [253]:
def encoding_onehotencoder(dataframe, columns, oh):
    """Replace nominal columns with their one-hot encoded indicator columns.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing ``columns``. It is not modified; a new frame is returned.
    columns : list of str
        Names of the columns handed to ``oh.transform``.
    oh : fitted encoder
        Object exposing ``transform`` and ``get_feature_names_out`` (for example a
        fitted scikit-learn ``OneHotEncoder``).

    Returns
    -------
    pd.DataFrame
        ``dataframe`` without ``columns``, followed by one indicator column per
        encoder output feature, on the same row index as the input.
    """
    encoded = oh.transform(dataframe[columns])

    # Encoders configured for sparse output return a scipy sparse matrix, dense ones
    # return an ndarray; only the former has (and needs) .toarray().
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # Re-use the caller's index. With a fresh 0..n-1 index, a frame whose index is not
    # the default RangeIndex would be misaligned by the concat below and gain NaN rows.
    oh_dataframe = pd.DataFrame(
        encoded,
        columns=oh.get_feature_names_out(),
        index=dataframe.index,
    )

    return pd.concat([dataframe.drop(columns=columns), oh_dataframe], axis=1)

In [254]:
# One-hot encode the nominal columns. Not idempotent: the source columns are dropped from
# patients_df, so running this cell a second time raises a KeyError.
patients_df = encoding_onehotencoder(patients_df, columns_without_order, oh_encoding)

In [255]:
patients_df

Unnamed: 0,age_category,BMI_category,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose,education_1.0,education_2.0,education_3.0,education_4.0,sex_F,sex_M,is_smoking_NO,is_smoking_YES
0,Senior Citizens,Obesity,1.0,1,1,1,1.0,2.0,0.827586,1.092593,3.144439,2.0,8.142857,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,Middle-Aged Adults,Underweight,0.0,0,0,1,0.642857,1.35,0.189655,-0.148148,-1.586311,1.6,11.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,Young Adults,Normal Weight,1.0,0,1,0,-1.214286,0.0,-1.793103,0.425926,-0.579768,0.266667,2.285714,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,Senior Citizens,Overweight,0.0,0,0,0,1.5,0.0,-0.603448,-0.055556,0.527428,1.133333,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,Middle-Aged Adults,Normal Weight,0.0,0,0,0,-0.714286,0.0,-1.275862,-0.685185,-0.277806,0.066667,0.285714,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5,Senior Citizens,Obesity,1.0,0,1,1,1.428571,1.25,-0.241379,0.759259,1.735279,1.266667,5.571429,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,Middle-Aged Adults,Overweight,0.0,1,1,0,0.428571,0.0,-0.965517,0.87037,0.728737,2.333333,1.142857,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
7,Young Adults,Normal Weight,0.0,0,0,1,-1.071429,0.75,-2.137931,-0.351852,-0.479114,0.0,3.714286,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,Middle-Aged Adults,Obesity,1.0,0,1,0,0.214286,0.0,-0.413793,0.092593,1.131354,0.533333,2.428571,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,Senior Citizens,Normal Weight,0.0,0,0,0,1.142857,0.0,-1.103448,-0.462963,-0.177151,0.133333,0.785714,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


CATEGORIES WITH A SPECIFIC ORDER

In [256]:
# Hand-written numeric codes for each level of the remaining label columns.
# NOTE(review): the weights are presumably the ones used when the models were trained — confirm.
# - The float keys 0.0 / 1.0 also match integer 0 / 1 values (equal keys in a dict lookup).
# - 'Underweight' and 'Obesity' share the same code (0.5).
# - dict_age_category has no entry for the 'Minors' band produced by the age binning,
#   so such rows would be mapped to NaN.
dict_bpmeds = {0.0: 0, 1.0: 2}
dict_prevalentstroke = {0.0: 0, 1.0: 3}
dict_diabetes = {0.0: 0, 1.0: 2}
dict_bmi_category = {'Normal Weight': 0, 'Overweight': 0.25, 'Underweight': 0.5, 'Obesity': 0.5}
dict_age_category = {'Young Adults': 0, 'Middle-Aged Adults': 0.5, 'Senior Citizens': 2}

In [257]:
# Parallel lists: maps_encoding[i] is the lookup applied to columns_map[i] (they are paired
# by position when passed to encoding_map), so both must stay in the same order.
# 'prevalentHyp' is not listed and therefore keeps its 0/1 values.
columns_map = ['BPMeds', 'prevalentStroke', 'diabetes', 'BMI_category', 'age_category']
maps_encoding = [dict_bpmeds, dict_prevalentstroke, dict_diabetes, dict_bmi_category, dict_age_category ]

In [258]:
def encoding_map(dataframe, maps, columns):
    """Recode columns through per-column lookup dictionaries.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing every name in ``columns``. It is not modified.
    maps : sequence of dict
        One lookup per column; ``maps[i]`` is applied to ``columns[i]``.
    columns : sequence of str
        Names of the columns to recode.

    Returns
    -------
    pd.DataFrame
        A copy of ``dataframe`` in which each listed column has been passed through
        ``Series.map`` with its lookup. Values without an entry in the lookup
        become NaN (standard ``Series.map`` behaviour).

    Raises
    ------
    ValueError
        If ``maps`` and ``columns`` differ in length; ``zip`` would otherwise
        silently skip the unmatched tail and leave columns unencoded.
    """
    maps = list(maps)
    columns = list(columns)
    if len(maps) != len(columns):
        raise ValueError(
            f'maps and columns must have the same length, got {len(maps)} and {len(columns)}'
        )

    # Work on a copy so the caller's frame is left untouched by the recoding.
    encoded = dataframe.copy()
    for mapping, column in zip(maps, columns):
        encoded[column] = encoded[column].map(mapping)

    return encoded

In [259]:
# Apply the numeric codes. Not idempotent: on a second run the already-encoded values are
# looked up again, and any code without an entry in its dictionary becomes NaN.
patients_df = encoding_map(patients_df, maps_encoding, columns_map)

In [260]:
patients_df

Unnamed: 0,age_category,BMI_category,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose,education_1.0,education_2.0,education_3.0,education_4.0,sex_F,sex_M,is_smoking_NO,is_smoking_YES
0,2.0,0.5,2,3,1,2,1.0,2.0,0.827586,1.092593,3.144439,2.0,8.142857,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.5,0.5,0,0,0,2,0.642857,1.35,0.189655,-0.148148,-1.586311,1.6,11.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,2,0,1,0,-1.214286,0.0,-1.793103,0.425926,-0.579768,0.266667,2.285714,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,2.0,0.25,0,0,0,0,1.5,0.0,-0.603448,-0.055556,0.527428,1.133333,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.5,0.0,0,0,0,0,-0.714286,0.0,-1.275862,-0.685185,-0.277806,0.066667,0.285714,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5,2.0,0.5,2,0,1,2,1.428571,1.25,-0.241379,0.759259,1.735279,1.266667,5.571429,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,0.5,0.25,0,3,1,0,0.428571,0.0,-0.965517,0.87037,0.728737,2.333333,1.142857,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
7,0.0,0.0,0,0,0,2,-1.071429,0.75,-2.137931,-0.351852,-0.479114,0.0,3.714286,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,0.5,0.5,2,0,1,0,0.214286,0.0,-0.413793,0.092593,1.131354,0.533333,2.428571,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,2.0,0.0,0,0,0,0,1.142857,0.0,-1.103448,-0.462963,-0.177151,0.133333,0.785714,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


#### PREDICTION WITH XG BOOST

In [261]:
# Load the pickled XGBoost model object.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# artifacts that come from a trusted source.
with open ('../data/predict_objects/xg_boost_pred.pkl', 'rb') as xg_boost:
        xg_boost_pred = pickle.load(xg_boost)

In [262]:
# Score every patient with the loaded model and keep the raw outputs in a plain list.
# NOTE(review): the values displayed below include a negative number, so these scores are not
# bounded to [0, 1] — presumably a regression-style output; confirm before presenting them
# as risk percentages.
list_results_xg_boost = list(xg_boost_pred.predict(patients_df))

In [263]:
list_results_xg_boost

[0.7437923,
 0.82202846,
 -0.023780288,
 0.8693796,
 0.05623845,
 0.8723959,
 0.48763177,
 0.10693355,
 0.3662709,
 0.39788646]

In [264]:
def convert_to_percentage(pred_list, clip=False):
    """Convert fractional scores to percentages rounded to two decimals.

    Parameters
    ----------
    pred_list : iterable of numbers
        Scores expressed as fractions (1.0 corresponds to 100 %).
    clip : bool, default False
        When True, each score is first limited to the range [0, 1] so the result
        is always between 0 and 100. When False, scores are converted as they are,
        so out-of-range inputs yield negative or above-100 percentages.

    Returns
    -------
    list
        A new list with ``round(score * 100, 2)`` for every input score. The input
        is left unchanged, so converting the same list twice gives the same result.
    """
    if clip:
        pred_list = [min(max(score, 0.0), 1.0) for score in pred_list]

    return [round(score * 100, 2) for score in pred_list]

In [265]:
# Convert the raw scores to percentages (rounded to 2 decimals) and display them.
# Not idempotent: the result is bound to the same name, so re-running this cell
# multiplies the already-converted values by 100 again.
list_results_xg_boost = convert_to_percentage(list_results_xg_boost)
list_results_xg_boost

[74.38, 82.2, -2.38, 86.94, 5.62, 87.24, 48.76, 10.69, 36.63, 39.79]

In [266]:
def print_results_xg_boost(pred_list, threshold=50):
    """Print each patient's risk percentage followed by a YES/NO verdict.

    Parameters
    ----------
    pred_list : iterable of numbers
        Risk scores expressed as percentages, one per patient, in patient order.
        Patients are numbered from 1 in the printed text.
    threshold : number, default 50
        A patient is reported as at risk only when the score is strictly greater
        than this value; a score equal to the threshold is reported as NO.
    """
    for patient_number, risk in enumerate(pred_list, start=1):

        print(f'The risk of coronary heart disease in Patient {patient_number} is {risk} %')

        if risk > threshold:
            print(f'Prediction: YES, the Patient {patient_number} is at risk of a coronary heart disease.\n')
        else:
            print(f'Prediction: NO, the Patient {patient_number} is not at significant risk of a coronary heart disease.\n')

In [267]:
# Print the per-patient report for the XGBoost scores (expects the percentage values).
print_results_xg_boost(list_results_xg_boost)

The risk of coronary heart disease in Patient 1 is 74.38 %
Prediction: YES, the Patient 1 is at risk of a coronary heart disease.

The risk of coronary heart disease in Patient 2 is 82.2 %
Prediction: YES, the Patient 2 is at risk of a coronary heart disease.

The risk of coronary heart disease in Patient 3 is -2.38 %
Prediction: NO, the Patient 3 is not at significant risk of a coronary heart disease.

The risk of coronary heart disease in Patient 4 is 86.94 %
Prediction: YES, the Patient 4 is at risk of a coronary heart disease.

The risk of coronary heart disease in Patient 5 is 5.62 %
Prediction: NO, the Patient 5 is not at significant risk of a coronary heart disease.

The risk of coronary heart disease in Patient 6 is 87.24 %
Prediction: YES, the Patient 6 is at risk of a coronary heart disease.

The risk of coronary heart disease in Patient 7 is 48.76 %
Prediction: NO, the Patient 7 is not at significant risk of a coronary heart disease.

The risk of coronary heart disease in Pa

#### PREDICTION WITH RANDOM FOREST

In [268]:
# Load the pickled Random Forest model object.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# artifacts that come from a trusted source.
with open ('../data/predict_objects/random_forest_pred.pkl', 'rb') as rf:
        random_forest_pred = pickle.load(rf)

In [269]:
# Predict with the Random Forest for every patient; the output displayed below holds only 0/1 values.
list_results_rf = list(random_forest_pred.predict(patients_df))

In [270]:
list_results_rf 

[1, 1, 0, 1, 0, 1, 1, 0, 0, 0]

In [271]:
def print_results_random_forest(pred_list, threshold=0.5):
    """Print a YES/NO verdict for each patient's prediction.

    Parameters
    ----------
    pred_list : iterable of numbers
        One prediction per patient, in patient order (0/1 labels or scores on the
        same scale as ``threshold``). Patients are numbered from 1 in the output.
    threshold : number, default 0.5
        A patient is reported as at risk only when the prediction is strictly
        greater than this value.
    """
    for patient_number, prediction in enumerate(pred_list, start=1):

        if prediction > threshold:
            print(f'Prediction: YES, the Patient {patient_number} is at risk of a coronary heart disease.\n')
        else:
            print(f'Prediction: NO, the Patient {patient_number} is not at significant risk of a coronary heart disease.\n')

In [272]:
# Print the per-patient verdicts for the Random Forest predictions.
print_results_random_forest(list_results_rf)

Prediction: YES, the Patient 1 is at risk of a coronary heart disease.

Prediction: YES, the Patient 2 is at risk of a coronary heart disease.

Prediction: NO, the Patient 3 is not at significant risk of a coronary heart disease.

Prediction: YES, the Patient 4 is at risk of a coronary heart disease.

Prediction: NO, the Patient 5 is not at significant risk of a coronary heart disease.

Prediction: YES, the Patient 6 is at risk of a coronary heart disease.

Prediction: YES, the Patient 7 is at risk of a coronary heart disease.

Prediction: NO, the Patient 8 is not at significant risk of a coronary heart disease.

Prediction: NO, the Patient 9 is not at significant risk of a coronary heart disease.

Prediction: NO, the Patient 10 is not at significant risk of a coronary heart disease.



#### CONCLUSIONS

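After comparing the outputs of both models on the new data, the XGBoost and Random Forest predictions of coronary heart disease risk agree for 9 of the 10 patients (90% of the cases); they differ only for Patient 7, whose XGBoost score (48.76 %) falls just below the 50 % cut-off while the Random Forest predicts a positive case. Considering that both models have also performed well in terms of metrics, we can reasonably expect them to forecast the response variable well. Note, however, that the true outcome of these example patients is unknown, so this comparison shows that the models are consistent with each other rather than proving that each individual prediction is correct.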