In [6]:
# All project imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import minmax_scale

In [7]:
# Load the dataset
df = pd.read_csv("../Raw_data/heart_2020.csv")
# Drop columns that are not important to the dataset
df.drop(columns=["PhysicalHealth", "MentalHealth", "SleepTime"])
# Print the name of each column, as well as the unique values found in each column
for column in df:
    print("%s -> " % (column), df[column].unique())

HeartDisease ->  ['No' 'Yes']
BMI ->  [16.6  20.34 26.58 ... 62.42 51.46 46.56]
Smoking ->  ['Yes' 'No']
AlcoholDrinking ->  ['No' 'Yes']
Stroke ->  ['No' 'Yes']
PhysicalHealth ->  [ 3.  0. 20. 28.  6. 15.  5. 30.  7.  1.  2. 21.  4. 10. 14. 18.  8. 25.
 16. 29. 27. 17. 24. 12. 23. 26. 22. 19.  9. 13. 11.]
MentalHealth ->  [30.  0.  2.  5. 15.  8.  4.  3. 10. 14. 20.  1.  7. 24.  9. 28. 16. 12.
  6. 25. 17. 18. 21. 29. 22. 13. 23. 27. 26. 11. 19.]
DiffWalking ->  ['No' 'Yes']
Sex ->  ['Female' 'Male']
AgeCategory ->  ['55-59' '80 or older' '65-69' '75-79' '40-44' '70-74' '60-64' '50-54'
 '45-49' '18-24' '35-39' '30-34' '25-29']
Race ->  ['White' 'Black' 'Asian' 'American Indian/Alaskan Native' 'Other'
 'Hispanic']
Diabetic ->  ['Yes' 'No' 'No, borderline diabetes' 'Yes (during pregnancy)']
PhysicalActivity ->  ['Yes' 'No']
GenHealth ->  ['Very good' 'Fair' 'Good' 'Poor' 'Excellent']
SleepTime ->  [ 5.  7.  8.  6. 12.  4.  9. 10. 15.  3.  2.  1. 16. 18. 14. 20. 11. 13.
 17. 24. 19. 21. 

One-Hot-Encoding Step

In [11]:
# One-hot encode categorical attributes in the dataset

# List of all attribute that we wish to one-hot encode or binary encode
categorical_attributes = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "Sex", "Race", "Diabetic", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]
# If an attribute has only two categories, only create one new attribute when encoding it.
# Normally, one-hot encoding would create two new attributes for binary categories.
encoder = OneHotEncoder(drop="if_binary")
# Generate an encoding scheme for the specified columns
encoder.fit(df[categorical_attributes])
# Print the names of all the new attributes created by the one-hot encoder
print(encoder.get_feature_names_out(categorical_attributes))
# Replace the categorical columns in the DataFrame with the new, one-hot encoded columns
encoded_df = pd.DataFrame(encoder.transform(df[categorical_attributes]).toarray(), columns=encoder.get_feature_names_out(categorical_attributes))
df = df.drop(columns=categorical_attributes).join(encoded_df)
print(df.head(1))

['HeartDisease_Yes' 'Smoking_Yes' 'AlcoholDrinking_Yes' 'Stroke_Yes'
 'DiffWalking_Yes' 'Sex_Male' 'Race_American Indian/Alaskan Native'
 'Race_Asian' 'Race_Black' 'Race_Hispanic' 'Race_Other' 'Race_White'
 'Diabetic_No' 'Diabetic_No, borderline diabetes' 'Diabetic_Yes'
 'Diabetic_Yes (during pregnancy)' 'PhysicalActivity_Yes' 'Asthma_Yes'
 'KidneyDisease_Yes' 'SkinCancer_Yes']
        BMI  PhysicalHealth  MentalHealth  SleepTime  AgeCategory_Ordinal  \
0  0.055294             3.0          30.0        5.0                  7.0   

   GenHealth_Ordinal  HeartDisease_Yes  Smoking_Yes  AlcoholDrinking_Yes  \
0                3.0               0.0          1.0                  0.0   

   Stroke_Yes  ...  Race_Other  Race_White  Diabetic_No  \
0         0.0  ...         0.0         1.0          0.0   

   Diabetic_No, borderline diabetes  Diabetic_Yes  \
0                               0.0           1.0   

   Diabetic_Yes (during pregnancy)  PhysicalActivity_Yes  Asthma_Yes  \
0            

Ordinal Encoding Step

In [9]:
# Encode categorical attributes that have a logical ordering. Since such an ordering exists,
# we can avoid creating extraneous columns by assigning a number to each category, rather than
# trying to use a one-hot encoding scheme for them.

# There are only two categorical attributes which have a logical ordering
categorical_attributes = ["AgeCategory", "GenHealth"]
# List the possible categories for each attribute in their logical order
age_categories = ["18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80 or older"]
health_categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
# Create the ordinal encoder and generate the encoding scheme for our DataFrame
encoder = OrdinalEncoder(categories=[age_categories, health_categories])
encoder.fit(df[categorical_attributes])
# Replace the categorical columns in the DataFrame with the new, ordinal-encoded columns
encoded_df = pd.DataFrame(encoder.transform(df[categorical_attributes]), columns=["AgeCategory_Ordinal", "GenHealth_Ordinal"])
df = df.drop(columns=categorical_attributes).join(encoded_df)
print(df.head(1))

  HeartDisease   BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0           No  16.6     Yes              No     No             3.0   

   MentalHealth DiffWalking     Sex   Race Diabetic PhysicalActivity  \
0          30.0          No  Female  White      Yes              Yes   

   SleepTime Asthma KidneyDisease SkinCancer  AgeCategory_Ordinal  \
0        5.0    Yes            No        Yes                  7.0   

   GenHealth_Ordinal  
0                3.0  


Standardize BMI column using min-max scaling. Only one column affected.

In [10]:
df[['BMI']] = minmax_scale(df[['BMI']])
print(df.head(1))

  HeartDisease       BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0           No  0.055294     Yes              No     No             3.0   

   MentalHealth DiffWalking     Sex   Race Diabetic PhysicalActivity  \
0          30.0          No  Female  White      Yes              Yes   

   SleepTime Asthma KidneyDisease SkinCancer  AgeCategory_Ordinal  \
0        5.0    Yes            No        Yes                  7.0   

   GenHealth_Ordinal  
0                3.0  
