In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

# <font color='#0000FF'>Table of Contents</font>

[1 : Exploring the data](#1)

[2 : Normalization of variables for modeling](#2)

<p style="padding:10px;background-color:#B9B7BD;margin:0;color:#000C66;font-family:sans serif;font-size:240%;text-align:center; overflow:hidden; font-weight:500; font-style:italic"><a id='1'></a>1. Exploring the data</p>

<p style="text-align:center; "></p>

Data exploration is a crucial step in the data analysis process. This phase allows for understanding the nature of the data, identifying trends, patterns, and laying the groundwork for more in-depth analyses. Here are some steps and techniques commonly used during data exploration:

- Understanding the Data: start by examining basic data features such as dataset size, variable types, and the initial rows to gain an initial overview.

- Descriptive Statistics: calculate descriptive statistics like mean, median, standard deviation, etc., to get an idea of the distribution of numerical variables.

- Visualization: Use graphs to visualize the data. Histograms, box plots, and scatter plots are useful for understanding the distribution, dispersion, and relationships between variables.

- Correlation Analysis: explore the relationships between variables by calculating correlations. This can reveal interesting associations or potential collinearities.

- Segmentation: if the data allows, perform segmentation to identify homogeneous subgroups. This can help tailor analyses based on specific characteristics.

- Preliminary Statistical Tests: if needed, conduct preliminary statistical tests to assess normality, equality of variances, etc.

In [2]:
df = pd.read_csv('heart_disease_2022_cleaned.csv')
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinCancer              24

In [6]:
df.shape

(246022, 40)

In [5]:
df.describe()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
count,246022.0,246022.0,246022.0,246022.0,246022.0,246022.0
mean,4.119026,4.16714,7.021331,1.70515,83.615179,28.668136
std,8.405844,8.102687,1.440681,0.106654,21.323156,6.513973
min,0.0,0.0,1.0,0.91,28.12,12.02
25%,0.0,0.0,6.0,1.63,68.04,24.27
50%,0.0,0.0,7.0,1.7,81.65,27.46
75%,3.0,4.0,8.0,1.78,95.25,31.89
max,30.0,30.0,24.0,2.41,292.57,97.65


In [7]:
for column_content in df.columns:
    print(column_content)
    print(df[column_content].unique(),"\n")  

State
['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands'] 

Sex
['Female' 'Male'] 

GeneralHealth
['Very good' 'Fair' 'Good' 'Excellent' 'Poor'] 

PhysicalHealthDays
[ 4.  0.  5.  3.  2. 25. 30. 15. 29.  8. 16. 20. 10.  9.  7.  1. 21.  6.
 27. 14. 12. 11. 13. 28. 17. 23. 24. 26. 18. 22. 19.] 

MentalHealthDays
[ 0. 15.  4. 25.  5. 30. 27.  3.  2.  1. 10. 20. 21.  6.  7.  8. 14.  9.
 12. 18. 29. 28. 17. 11

We have 40 variables, both categorical and quantitative. Several of them are objects that contain different types of information. We have closed Yes or No responses, evaluation responses, quantity responses, etc. Regarding the Float variables, it is very heterogeneous, which is understandable given the nature of each variable. 

- Therefore, it is necessary to standardize and normalize all these data for predictive modeling


#### I will not present visualizations in this notebook as I have already done so in another notebook for the same dataframe LINK. Here, I will primarily focus on machine learning modeling. If you're interested, you can refer to my previous notebook to see how the data is distributed through various visualizations.


<p style="padding:10px;background-color:#B9B7BD;margin:0;color:#000C66;font-family:sans serif;font-size:240%;text-align:center; overflow:hidden; font-weight:500; font-style:italic"><a id='2'></a>2. Normalization of variables for modeling</p>

<p style="text-align:center; "></p>

There will be a first step to transform categorical variables into numerical. I will use Replace() and lambda(). 

Then, there will be normalization of numerical variables with MinMaxScaler(), which will be done once the data has been split (X_train and X_test) to avoid overfitting.

In [8]:
dict_replace = {'No':0, 'Yes' : 1} 
df = df.replace(dict_replace)

dict_GeneralHealth = {'Excellent': 0, 'Very good' : 1, 'Good' : 2, 'Fair': 3, 'Poor': 4}
df['GeneralHealth'] = df['GeneralHealth'].replace(dict_GeneralHealth)

dict_RemovedTeeth = {'None of them': 0, '1 to 5' : 1, '6 or more, but not all' : 2, 'All': 3}
df['RemovedTeeth'] = df['RemovedTeeth'].replace(dict_RemovedTeeth)

dict_HadDiabetes = {'No': 0, 'Yes' : 1, 'Yes, but only during pregnancy (female)' : 2, 
                    'No, pre-diabetes or borderline diabetes': 3}
df['HadDiabetes'] = df['HadDiabetes'].replace(dict_HadDiabetes)

dict_SmokerStatus = {'Never smoked': 0, 'Current smoker - now smokes some days' : 1, 
                     'Current smoker - now smokes every day' : 2, 'Former smoker': 3}
df['SmokerStatus'] = df['SmokerStatus'].replace(dict_SmokerStatus)

dict_ECigaretteUsage = {'Never used e-cigarettes in my entire life': 0, 
                        'Use them every day' : 1, 'Use them some days' : 2, 'Not at all (right now)': 3}
df['ECigaretteUsage'] = df['ECigaretteUsage'].replace(dict_ECigaretteUsage)

dict_TetanusLast10Tdap = {'Yes, received Tdap': 0,'Yes, received tetanus shot but not sure what type' : 1, 
                          'Yes, received tetanus shot, but not Tdap' : 2, 
                          'No, did not receive any tetanus shot in the past 10 years': 3}
df['TetanusLast10Tdap'] = df['TetanusLast10Tdap'].replace(dict_TetanusLast10Tdap)

dict_CovidPos = {'No': 0, 'Yes' : 1, 'Tested positive using home test without a health professional' : 2}
df['CovidPos'] = df['CovidPos'].replace(dict_CovidPos)

In [9]:
df['BMI'] = df['BMI'].apply(lambda x : 0 if x <= 24.9 else 1)
df['BMI'].value_counts()

1    173462
0     72560
Name: BMI, dtype: int64

In [10]:
df['Sex'] = df['Sex'].apply(lambda x : 0 if x == 'Female' else 1)
df['Sex'].value_counts()

0    127811
1    118211
Name: Sex, dtype: int64

In [11]:
df['RaceEthnicityCategory'] = df['RaceEthnicityCategory'].apply(lambda x : 0 if x =='White only, Non-Hispanic'else 1)
df['RaceEthnicityCategory'].value_counts()

0    186336
1     59686
Name: RaceEthnicityCategory, dtype: int64

In [12]:
df['AgeCategory'] = df['AgeCategory'].replace(to_replace = df['AgeCategory'].unique(), value =  np.arange(0,13,1))

In [13]:
for column_content in df.columns:
    print(column_content)
    print(df[column_content].unique(),"\n")  

State
['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands'] 

Sex
[0 1] 

GeneralHealth
[1 3 2 0 4] 

PhysicalHealthDays
[ 4.  0.  5.  3.  2. 25. 30. 15. 29.  8. 16. 20. 10.  9.  7.  1. 21.  6.
 27. 14. 12. 11. 13. 28. 17. 23. 24. 26. 18. 22. 19.] 

MentalHealthDays
[ 0. 15.  4. 25.  5. 30. 27.  3.  2.  1. 10. 20. 21.  6.  7.  8. 14.  9.
 12. 18. 29. 28. 17. 11. 16. 13. 26. 22. 24. 19. 23.] 

LastCheckupTim