# Data Preprocessing

In [2]:
import pandas as pd

## Read Dataset

In [3]:
# Load the raw CSV export, print its (rows, columns) shape, then preview the first rows.
raw_csv_path = 'raw/heart_2020_cleaned.csv'
df_raw = pd.read_csv(raw_csv_path)
print(df_raw.shape)
df_raw.head()

(319795, 18)


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
# Encode the exact answers 'Yes' -> 1 and 'No' -> 0 across every column.
# Cells holding any other value (e.g. longer text answers) are left unchanged.
df_transfer = (
    df_raw
    .replace('Yes', 1)
    .replace('No', 0)
)
df_transfer.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,Female,55-59,White,1,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,Female,80 or older,White,0,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,Male,65-69,White,1,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,Female,75-79,White,0,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,Female,40-44,White,0,1,Very good,8.0,0,0,0


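### BMI
Each BMI value is assigned to exactly one of four buckets. Boundaries are lower-inclusive and upper-exclusive; the first and last buckets are open-ended in the code (BMI < 18.5 and BMI >= 30).
- 0~18.5
- 18.5~24
- 24~30
- 30~100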

In [5]:
# Observed BMI range, displayed as a (min, max) pair.
bmi_values = df_transfer['BMI']
(bmi_values.min(), bmi_values.max())

(12.02, 94.85)

In [6]:
# One-hot encode BMI into four buckets: < 18.5, [18.5, 24), [24, 30), >= 30.
# The source values are read from df_raw (same rows and index as df_transfer)
# so the cell can be executed more than once: reading df_transfer['BMI'] after
# the drop below has removed it would raise a KeyError.
raw_bmi = df_raw['BMI']
df_transfer['BMI_0_18.5'] = (raw_bmi < 18.5).astype(int)
df_transfer['BMI_18.5_24'] = ((raw_bmi >= 18.5) & (raw_bmi < 24)).astype(int)
df_transfer['BMI_24_30'] = ((raw_bmi >= 24) & (raw_bmi < 30)).astype(int)
df_transfer['BMI_30_100'] = (raw_bmi >= 30).astype(int)
# errors='ignore' makes the drop a no-op when the column is already gone.
df_transfer = df_transfer.drop(columns=['BMI'], errors='ignore')
df_transfer.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,...,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,BMI_0_18.5,BMI_18.5_24,BMI_24_30,BMI_30_100
0,0,1,0,0,3.0,30.0,0,Female,55-59,White,...,1,Very good,5.0,1,0,1,1,0,0,0
1,0,0,0,1,0.0,0.0,0,Female,80 or older,White,...,1,Very good,7.0,0,0,0,0,1,0,0
2,0,1,0,0,20.0,30.0,0,Male,65-69,White,...,1,Fair,8.0,1,0,0,0,0,1,0
3,0,0,0,0,0.0,0.0,0,Female,75-79,White,...,0,Good,6.0,0,0,1,0,0,1,0
4,0,0,0,0,28.0,0.0,1,Female,40-44,White,...,1,Very good,8.0,0,0,0,0,1,0,0


In [7]:
# Reduce PhysicalHealth / MentalHealth to binary flags: 1 when the raw value is
# greater than 0, otherwise 0.
# Values are taken from df_raw (same rows and index as df_transfer) so that
# re-running the cell does not fail with a KeyError on the columns dropped below.
df_transfer['PhysicalNotHealth'] = (df_raw['PhysicalHealth'] > 0).astype(int)
df_transfer['MentalNotHealth'] = (df_raw['MentalHealth'] > 0).astype(int)
# errors='ignore' makes the drop a no-op when the columns are already gone.
df_transfer = df_transfer.drop(columns=['PhysicalHealth', 'MentalHealth'], errors='ignore')
df_transfer.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,...,SleepTime,Asthma,KidneyDisease,SkinCancer,BMI_0_18.5,BMI_18.5_24,BMI_24_30,BMI_30_100,PhysicalNotHealth,MentalNotHealth
0,0,1,0,0,0,Female,55-59,White,1,1,...,5.0,1,0,1,1,0,0,0,1,1
1,0,0,0,1,0,Female,80 or older,White,0,1,...,7.0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,Male,65-69,White,1,1,...,8.0,1,0,0,0,0,1,0,1,1
3,0,0,0,0,0,Female,75-79,White,0,0,...,6.0,0,0,1,0,0,1,0,0,0
4,0,0,0,0,1,Female,40-44,White,0,1,...,8.0,0,0,0,0,1,0,0,1,0


In [8]:
# One-hot encode Sex into isMale / isFemale (1 when the label matches, else 0).
# The labels are read from df_raw (same rows and index as df_transfer) so the
# cell stays re-runnable after the Sex column has been dropped below.
raw_sex = df_raw['Sex']
df_transfer['isMale'] = (raw_sex == 'Male').astype(int)
df_transfer['isFemale'] = (raw_sex == 'Female').astype(int)
# errors='ignore' makes the drop a no-op when the column is already gone.
df_transfer = df_transfer.drop(columns=['Sex'], errors='ignore')
df_transfer.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,...,KidneyDisease,SkinCancer,BMI_0_18.5,BMI_18.5_24,BMI_24_30,BMI_30_100,PhysicalNotHealth,MentalNotHealth,isMale,isFemale
0,0,1,0,0,0,55-59,White,1,1,Very good,...,0,1,1,0,0,0,1,1,0,1
1,0,0,0,1,0,80 or older,White,0,1,Very good,...,0,0,0,1,0,0,0,0,0,1
2,0,1,0,0,0,65-69,White,1,1,Fair,...,0,0,0,0,1,0,1,1,1,0
3,0,0,0,0,0,75-79,White,0,0,Good,...,0,1,0,0,1,0,0,0,0,1
4,0,0,0,0,1,40-44,White,0,1,Very good,...,0,0,0,1,0,0,1,0,0,1


### Age

In [9]:
# Number of rows per AgeCategory label.
age_brackets = df_transfer['AgeCategory']
age_brackets.value_counts()

65-69          34151
60-64          33686
70-74          31065
55-59          29757
50-54          25382
80 or older    24153
45-49          21791
75-79          21482
18-24          21064
40-44          21006
35-39          20550
30-34          18753
25-29          16955
Name: AgeCategory, dtype: int64

In [10]:
# Merge the AgeCategory labels into decade flags:
# 18-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80+
# Each new column is 1 when the row's label is one of the listed brackets.
age_groups = {
    'Age_18_29': ['18-24', '25-29'],
    'Age_30_39': ['30-34', '35-39'],
    'Age_40_49': ['40-44', '45-49'],
    'Age_50_59': ['50-54', '55-59'],
    'Age_60_69': ['60-64', '65-69'],
    'Age_70_79': ['70-74', '75-79'],
    'Age_80': ['80 or older'],
}
# Labels come from df_raw (same rows and index as df_transfer) so the cell can
# be re-run after AgeCategory has been dropped below without a KeyError.
raw_age = df_raw['AgeCategory']
for age_column, age_labels in age_groups.items():
    df_transfer[age_column] = raw_age.isin(age_labels).astype(int)
# errors='ignore' makes the drop a no-op when the column is already gone.
df_transfer = df_transfer.drop(columns=['AgeCategory'], errors='ignore')
df_transfer.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,...,MentalNotHealth,isMale,isFemale,Age_18_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80
0,0,1,0,0,0,White,1,1,Very good,5.0,...,1,0,1,0,0,0,1,0,0,0
1,0,0,0,1,0,White,0,1,Very good,7.0,...,0,0,1,0,0,0,0,0,0,1
2,0,1,0,0,0,White,1,1,Fair,8.0,...,1,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,White,0,0,Good,6.0,...,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,1,White,0,1,Very good,8.0,...,0,0,1,0,0,1,0,0,0,0


### Sleep time

In [11]:
# Number of rows per distinct SleepTime value.
sleep_time = df_transfer['SleepTime']
sleep_time.value_counts()

7.0     97751
8.0     97602
6.0     66721
5.0     19184
9.0     16041
10.0     7796
4.0      7750
12.0     2205
3.0      1992
2.0       788
1.0       551
11.0      415
14.0      243
16.0      236
15.0      189
18.0      102
13.0       97
20.0       64
24.0       30
17.0       21
22.0        9
19.0        3
23.0        3
21.0        2
Name: SleepTime, dtype: int64

In [12]:
# One-hot encode SleepTime into three buckets: < 6, 6 to 8 (both inclusive), > 8.
# Values are read from df_raw (same rows and index as df_transfer) so the cell
# can be re-run after SleepTime has been dropped below without a KeyError.
raw_sleep = df_raw['SleepTime']
df_transfer['SleepTime_0_6'] = (raw_sleep < 6).astype(int)
# Series.between includes both endpoints by default, matching >= 6 and <= 8.
df_transfer['SleepTime_6_8'] = raw_sleep.between(6, 8).astype(int)
df_transfer['SleepTime_8'] = (raw_sleep > 8).astype(int)
# errors='ignore' makes the drop a no-op when the column is already gone.
df_transfer = df_transfer.drop(columns=['SleepTime'], errors='ignore')
df_transfer.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Race,Diabetic,PhysicalActivity,GenHealth,Asthma,...,Age_18_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80,SleepTime_0_6,SleepTime_6_8,SleepTime_8
0,0,1,0,0,0,White,1,1,Very good,1,...,0,0,0,1,0,0,0,1,0,0
1,0,0,0,1,0,White,0,1,Very good,0,...,0,0,0,0,0,0,1,0,1,0
2,0,1,0,0,0,White,1,1,Fair,1,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,White,0,0,Good,0,...,0,0,0,0,0,1,0,0,1,0
4,0,0,0,0,1,White,0,1,Very good,0,...,0,0,1,0,0,0,0,0,1,0


In [13]:
# Column labels present after the encoding steps so far.
current_columns = df_transfer.columns
current_columns

Index(['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma',
       'KidneyDisease', 'SkinCancer', 'BMI_0_18.5', 'BMI_18.5_24', 'BMI_24_30',
       'BMI_30_100', 'PhysicalNotHealth', 'MentalNotHealth', 'isMale',
       'isFemale', 'Age_18_29', 'Age_30_39', 'Age_40_49', 'Age_50_59',
       'Age_60_69', 'Age_70_79', 'Age_80', 'SleepTime_0_6', 'SleepTime_6_8',
       'SleepTime_8'],
      dtype='object')

In [14]:
# Number of rows per distinct Diabetic value; shows which answers are still text.
diabetic_answers = df_transfer['Diabetic']
diabetic_answers.value_counts()

0                          269653
1                           40802
No, borderline diabetes      6781
Yes (during pregnancy)       2559
Name: Diabetic, dtype: int64

In [15]:
# Fold the two remaining text answers into the binary encoding:
# "No, borderline diabetes" -> 0 and "Yes (during pregnancy)" -> 1.
df_transfer['Diabetic'] = (
    df_transfer['Diabetic']
    .replace('No, borderline diabetes', 0)
    .replace('Yes (during pregnancy)', 1)
)
df_transfer['Diabetic'].value_counts()

0    276434
1     43361
Name: Diabetic, dtype: int64

In [16]:
# Reverse PhysicalActivity: a raw 'Yes' becomes 0 and a raw 'No' becomes 1.
# The value is rebuilt from df_raw (same rows and index as df_transfer) instead
# of computing `1 - df_transfer['PhysicalActivity']`, because the arithmetic
# form silently flips the column back each time the cell is executed again.
df_transfer['PhysicalActivity'] = 1 - df_raw['PhysicalActivity'].map({'Yes': 1, 'No': 0})

### Drop GenHealth

In [17]:
# Build df_heart as a copy of df_transfer without GenHealth
# (a missing column is tolerated rather than raising).
unused_columns = ['GenHealth']
df_heart = df_transfer.drop(columns=unused_columns, errors='ignore')

### Race

In [18]:
# Number of rows per Race label.
race_labels = df_heart['Race']
race_labels.value_counts()

White                             245212
Hispanic                           27446
Black                              22939
Other                              10928
Asian                               8068
American Indian/Alaskan Native      5202
Name: Race, dtype: int64

In [19]:
# Partition df_heart by Race into three frames: Asian, American Indian/Alaskan
# Native, and all remaining rows. Each part has the Race column removed and is
# re-indexed from 0.
race_column = df_heart['Race']
asian_rows = race_column == 'Asian'
indigenous_rows = race_column == 'American Indian/Alaskan Native'
other_rows = ~asian_rows & ~indigenous_rows

df_asian = (
    df_heart[asian_rows]
    .drop(columns=['Race'], errors='ignore')
    .reset_index(drop=True)
)
df_indigenous = (
    df_heart[indigenous_rows]
    .drop(columns=['Race'], errors='ignore')
    .reset_index(drop=True)
)
df_others = (
    df_heart[other_rows]
    .drop(columns=['Race'], errors='ignore')
    .reset_index(drop=True)
)

In [20]:
# HeartDisease class counts within the Asian partition.
asian_target = df_asian['HeartDisease']
asian_target.value_counts()

0    7802
1     266
Name: HeartDisease, dtype: int64

In [21]:
# HeartDisease class counts within the "others" partition.
others_target = df_others['HeartDisease']
others_target.value_counts()

0    279960
1     26565
Name: HeartDisease, dtype: int64

## Save

In [22]:
# Write the full processed frame (df_heart, which still carries the Race column)
# to CSV without the index.
heart_all_path = 'clean/heart/heart_all.csv'
df_heart.to_csv(heart_all_path, index=False)

In [23]:
# Report the shape of each race partition, then write each one to its own CSV
# (index omitted). All shapes are printed before any file is written.
race_partitions = [
    (df_asian, 'clean/heart/heart_part1.csv'),
    (df_indigenous, 'clean/heart/heart_part2.csv'),
    (df_others, 'clean/heart/heart_part3.csv'),
]

for part_frame, _ in race_partitions:
    print(part_frame.shape)

for part_frame, part_path in race_partitions:
    part_frame.to_csv(part_path, index=False)

(8068, 28)
(5202, 28)
(306525, 28)
