In [137]:
import pandas as pd
import numpy as np
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,8)

from itertools import combinations
import researchpy as rp

from sklearn.preprocessing import RobustScaler

from sklearn.preprocessing import OneHotEncoder 

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns = None 

In [138]:
df = pd.read_pickle('../data/cardio_risk.pkl')
df.head()

Unnamed: 0,education,sex,age_category,BMI_category,is_smoking,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,2.0,F,Senior Citizens,Overweight,YES,0.0,0,0,0,64.0,3.0,221.0,148.0,85.0,27.62,90.0,80.0,1
1,4.0,M,Middle-Aged Adults,Overweight,NO,0.0,0,1,0,36.0,0.0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,1.0,F,Middle-Aged Adults,Normal Weight,YES,0.0,0,0,0,46.0,10.0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,1.0,M,Middle-Aged Adults,Overweight,YES,0.0,0,1,0,50.0,20.0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,1.0,F,Senior Citizens,Overweight,YES,0.0,0,0,0,64.0,30.0,241.0,136.5,85.0,26.42,70.0,77.0,0


In [139]:
df.drop(['diaBP'], axis = 1, inplace = True)

In [140]:
patient_1 = {'education': 4.0, 'sex': 'F', 'age_category': 'Senior Citizens', 'BMI_category': 'Overweight', 'is_smoking': 'YES', 'BPMeds': 1.0, 'prevalentStroke': 1,
        'prevalentHyp': 1, 'diabetes': 1, 'age': 63, 'cigsPerDay': 30.0, 'totChol': 242.0, 'sysBP': 138.0, 'BMI': 29.75, 'heartRate': 99.0, 'glucose': 182.0}

In [141]:
patient_1_df = pd.DataFrame (patient_1, index = [0])
patient_1_df 

Unnamed: 0,education,sex,age_category,BMI_category,is_smoking,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose
0,4.0,F,Senior Citizens,Overweight,YES,1.0,1,1,1,63,30.0,242.0,138.0,29.75,99.0,182.0


In [142]:
categorical_list = ['education', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'sex', 'is_smoking']

for col in categorical_list:
    patient_1_df [col] = patient_1_df [col].astype('category')

In [143]:
patient_1_df ['BMI_category'] = pd.cut(df['BMI'], bins=[0, 18.5, 24.9, 29.9, float('inf')], labels=['Underweight', 'Normal Weight', 'Overweight', 'Obesity'])
patient_1_df ['age_category'] = pd.cut(df['age'], bins=[0, 17, 35, 59, float('inf')], labels=['Minors', 'Young Adults', 'Middle-Aged Adults', 'Senior Citizens'])

In [144]:
new_order_columns = ['education', 'sex', 'age_category', 'BMI_category', 'is_smoking', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes',
                    'age', 'cigsPerDay', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']

In [145]:
patient_1_df  = patient_1_df [new_order_columns]

In [146]:
patient_1_df_num = patient_1_df.select_dtypes(include = np.number)
patient_1_df_cat = patient_1_df.select_dtypes(include = ['category'])

##### SCALING

In [147]:
with open('../data/robust_scaler.pkl', mode = "rb") as robust_scaler:
    scaler = pickle.load(robust_scaler)

In [149]:
num_scaler = pd.DataFrame(scaler.transform(patient_1_df_num), columns = patient_1_df_num.columns)
num_scaler

Unnamed: 0,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose
0,1.0,1.5,0.137931,0.351852,0.879718,1.6,7.428571


In [116]:
patient_1_df[num_scaler.columns] = num_scaler

##### ENCODING

CATEGORIES WITHOUT A SPECIFIC ORDER

In [117]:
with open('../data/onehotencoder.pkl', mode = "rb") as onehotencoder:
    oh_encoding = pickle.load(onehotencoder)

In [119]:
columns_without_order = ['education', 'sex', 'is_smoking']

In [120]:
patient_1_df

Unnamed: 0,education,sex,age_category,BMI_category,is_smoking,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose
0,4.0,F,Senior Citizens,Overweight,YES,1.0,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
for col in columns_without_order:
    transform = oh_encoding.transform(patient_1_df[[col]])
    oh_df = pd.DataFrame(transform.toarray())
    oh_df.columns = oh_encoding.get_feature_names_out()
    patient_1_df[oh_df.columns] = oh_df

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- education
Feature names seen at fit time, yet now missing:
- sex


In [102]:
for col in columns_without_order:
    if col in columns_without_order:
        patient_1_df.drop([col], axis = 1, inplace = True)
    else:
        pass

In [None]:
patient_1_df

Unnamed: 0,age_category,BMI_category,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose,education_4.0,is_smoking_YES,sex_F
0,Senior Citizens,Overweight,1.0,1,1,1,63,30.0,242.0,138.0,29.75,99.0,182.0,1.0,1.0,1.0


CATEGORIES WITH A SPECIFIC ORDER

In [None]:
dict_bpmeds = {0.0: 0, 1.0: 2}
dict_prevalentstroke = {0.0: 0, 1.0: 3}
dict_diabetes = {0.0: 0, 1.0: 2}
dict_bmi_category = {'Normal Weight': 0, 'Overweight': 0.25, 'Underweight': 0.5, 'Obesity': 0.5}
dict_age_category = {'Young Adults': 0, 'Middle-Aged Adults': 0.5, 'Senior Citizens': 2}

In [None]:
columns_map = ['BPMeds', 'prevalentStroke', 'diabetes', 'BMI_category', 'age_category']
maps_encoding = [dict_bpmeds, dict_prevalentstroke, dict_diabetes, dict_bmi_category, dict_age_category ]

In [None]:
def encoding_map(dataframe, maps, columns):

    for map_encoding, col in zip(maps, columns):
        dataframe[col] = dataframe[col].map(map_encoding)

    return dataframe

In [None]:
patient_1_df = encoding_map(patient_1_df, maps_encoding, columns_map)

##### PREDICTION

In [None]:
patient_1_df

Unnamed: 0,age_category,BMI_category,BPMeds,prevalentStroke,prevalentHyp,diabetes,age,cigsPerDay,totChol,sysBP,BMI,heartRate,glucose,education_4.0,is_smoking_YES,sex_F
0,2.0,0.25,2,3,1,2,63,30.0,242.0,138.0,29.75,99.0,182.0,1.0,1.0,1.0


In [None]:
with open ('../data/xg_boost_pred.pkl', 'rb') as xg_boost:
        best_model = pickle.load(xg_boost)

In [None]:
best_model.predict(patient_1_df)

ValueError: feature_names mismatch: ['age_category', 'BMI_category', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'age', 'cigsPerDay', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose', 'education_1.0', 'education_2.0', 'education_3.0', 'education_4.0', 'is_smoking_NO', 'is_smoking_YES', 'sex_F', 'sex_M'] ['age_category', 'BMI_category', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'age', 'cigsPerDay', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose', 'education_4.0', 'is_smoking_YES', 'sex_F']
expected education_3.0, sex_M, is_smoking_NO, education_1.0, education_2.0 in input data