In [1]:
import numpy as np
import pandas as pd

In [2]:
# Data extraction (Raw and uncleaned data)
df = pd.read_csv("heart_2020_raw.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


# Data preparation and cleaning

In [3]:
# Standardised capitalisation
df.columns = df.columns.str.upper()
df.head()

Unnamed: 0,HEARTDISEASE,BMI,SMOKING,ALCOHOLDRINKING,STROKE,PHYSICALHEALTH,MENTALHEALTH,DIFFWALKING,SEX,AGECATEGORY,RACE,DIABETIC,PHYSICALACTIVITY,GENHEALTH,SLEEPTIME,ASTHMA,KIDNEYDISEASE,SKINCANCER
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
##Check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HEARTDISEASE      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   SMOKING           319795 non-null  object 
 3   ALCOHOLDRINKING   319795 non-null  object 
 4   STROKE            319795 non-null  object 
 5   PHYSICALHEALTH    319795 non-null  float64
 6   MENTALHEALTH      319795 non-null  float64
 7   DIFFWALKING       319795 non-null  object 
 8   SEX               319795 non-null  object 
 9   AGECATEGORY       319795 non-null  object 
 10  RACE              319795 non-null  object 
 11  DIABETIC          319795 non-null  object 
 12  PHYSICALACTIVITY  319795 non-null  object 
 13  GENHEALTH         319795 non-null  object 
 14  SLEEPTIME         319795 non-null  float64
 15  ASTHMA            319795 non-null  object 
 16  KIDNEYDISEASE     31

In [5]:
# Reading the data description, the following are the categorical variables

# Categoric Predictors
df[['SMOKING','ALCOHOLDRINKING','STROKE','DIFFWALKING','SEX','AGECATEGORY','RACE','DIABETIC','GENHEALTH','ASTHMA','KIDNEYDISEASE','SKINCANCER']].describe()

Unnamed: 0,SMOKING,ALCOHOLDRINKING,STROKE,DIFFWALKING,SEX,AGECATEGORY,RACE,DIABETIC,GENHEALTH,ASTHMA,KIDNEYDISEASE,SKINCANCER
count,319795,319795,319795,319795,319795,319795,319795,319795,319795,319795,319795,319795
unique,2,2,2,2,2,13,6,4,5,2,2,2
top,No,No,No,No,Female,65-69,White,No,Very good,No,No,No
freq,187887,298018,307726,275385,167805,34151,245212,269653,113858,276923,308016,289976


#### Creating a copy of the dataframe for encoding the categorical variables later (One hot encoding)

In [6]:
# Creating a copy of the dataframe for encoding the categorical variables later (One hot encoding)
df_copy = df.copy(deep = True)

# Making the "Diabetic" column into a binary categorical "Yes" and "No", regardless of Yes and No under difference circumstance
df_copy =  df_copy[df_copy.columns].replace({'No, borderline diabetes': 'No','Yes (during pregnancy)':'Yes' })

# Save it for data preprocessing later
df_copy.to_csv("1-cleaned-heart-disease-dataset.csv")

#### Temporary integer encoding for non-ordinal categorical data 

######  This is mainly for Exploratory Data Analysis whereby we need numeric data instead of categorical data to explore the correlation between the response variable and predictor variables (via heatmap)

In [7]:
df.nunique()

HEARTDISEASE           2
BMI                 3604
SMOKING                2
ALCOHOLDRINKING        2
STROKE                 2
PHYSICALHEALTH        31
MENTALHEALTH          31
DIFFWALKING            2
SEX                    2
AGECATEGORY           13
RACE                   6
DIABETIC               4
PHYSICALACTIVITY       2
GENHEALTH              5
SLEEPTIME             24
ASTHMA                 2
KIDNEYDISEASE          2
SKINCANCER             2
dtype: int64

Excluding BMI & PHYSICALHEALTH & MENTALHEALTH & SLEEPTIME

Most of the categorical variables seems to be binary, except AGECATEGORY & GENHEALTH & RACE & DIABETIC

For diabetic, there are 4 categories
Yes || No || No, borderline diabetes || Yes (during pregnancy)

We would combine these 4 categories into 2 categories, "Yes" and "No" to make it binary: 

"No" : consisting of "No" and "No, borderline diabetes"; and "Yes" consisting of "Yes" and "Yes(during pregnancy)"

In [8]:
df =  df[df.columns].replace({'Yes':1, 'No':0, 'Male':1,'Female':0,'No, borderline diabetes':'0','Yes (during pregnancy)':'1' })
df['DIABETIC'] = df['DIABETIC'].astype(int)

# Save it for exploratory data analysis later
df.to_csv("2-cleaned-heart-disease-dataset.csv")

In [9]:
df.head()

Unnamed: 0,HEARTDISEASE,BMI,SMOKING,ALCOHOLDRINKING,STROKE,PHYSICALHEALTH,MENTALHEALTH,DIFFWALKING,SEX,AGECATEGORY,RACE,DIABETIC,PHYSICALACTIVITY,GENHEALTH,SLEEPTIME,ASTHMA,KIDNEYDISEASE,SKINCANCER
0,0,16.6,1,0,0,3.0,30.0,0,0,55-59,White,1,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,80 or older,White,0,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,65-69,White,1,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,75-79,White,0,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,40-44,White,0,1,Very good,8.0,0,0,0


In [10]:
##Check if any missing value
display(df.isnull().any())

HEARTDISEASE        False
BMI                 False
SMOKING             False
ALCOHOLDRINKING     False
STROKE              False
PHYSICALHEALTH      False
MENTALHEALTH        False
DIFFWALKING         False
SEX                 False
AGECATEGORY         False
RACE                False
DIABETIC            False
PHYSICALACTIVITY    False
GENHEALTH           False
SLEEPTIME           False
ASTHMA              False
KIDNEYDISEASE       False
SKINCANCER          False
dtype: bool

There are no missing data