# Data Preprocessing

In [1]:
import pandas as pd

## Read Dataset

In [11]:
# Load the raw CSV, print its (rows, columns) shape and preview the first rows.
df_raw = pd.read_csv(
    filepath_or_buffer='raw/diabetes_binary_health_indicators_BRFSS2015.csv',
)
print(df_raw.shape)
df_raw.head(n=5)

(253680, 22)


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


### One-hot encode the categorical columns
- GenHlth
- Age
- Education
- Income

In [12]:
# Columns that are expanded into one indicator column per distinct value.
columns = ['GenHlth', 'Age', 'Education', 'Income']

# The str cast is applied to a temporary copy rather than assigned back into
# df_raw, so the raw frame keeps its original dtypes and the preview shown by
# the loading cell stays accurate. The cast itself is kept so the generated
# column names and their order are the same as before.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to booleans,
# which would be written to CSV as True/False next to the 0/1 range flags
# created later in this notebook.
df_dummy = pd.get_dummies(
    df_raw.astype({col: str for col in columns}),
    columns=columns,
    prefix=columns,
    dtype=int,
)
df_dummy.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,Education_5.0,Education_6.0,Income_1.0,Income_2.0,Income_3.0,Income_4.0,Income_5.0,Income_6.0,Income_7.0,Income_8.0
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0,1,1,0,0,0,0,0,0,0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1,0,0,0,0,1,0,0,0,0


### Keep only respondents with diabetes

In [13]:
# Keep the rows where Diabetes_binary equals 1, then remove that column
# (it holds a single value once the filter has been applied).
df_diabetes = (
    df_dummy
    .loc[df_dummy['Diabetes_binary'].eq(1)]
    .drop(columns=['Diabetes_binary'], errors='ignore')
)
print(df_diabetes.shape)
df_diabetes.head()

(35346, 49)


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,Education_5.0,Education_6.0,Income_1.0,Income_2.0,Income_3.0,Income_4.0,Income_5.0,Income_6.0,Income_7.0,Income_8.0
8,1.0,1.0,1.0,30.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1,0,1,0,0,0,0,0,0,0
10,0.0,0.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0,1,0,0,0,0,0,0,0,1
13,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,1,0,0
17,0.0,0.0,1.0,23.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1,0,0,0,0,0,0,1,0,0
23,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1,0,0,0,0,1,0,0,0,0


### BMI (each bin includes its lower bound and excludes its upper bound)
- 0~18.5
- 18.5~24
- 24~27
- 27~30
- 30~35
- 35~100

In [17]:
# Smallest and largest BMI value present in df_diabetes, shown as (min, max).
(
    df_diabetes['BMI'].min(),
    df_diabetes['BMI'].max(),
)

(13.0, 98.0)

In [21]:
# Swap the continuous BMI column for six 0/1 range flags.
# Each range includes its lower edge and excludes its upper edge; the first
# range has no lower edge and the last has no upper edge, so every non-missing
# BMI sets exactly one flag.
df_diabetes = (
    df_diabetes
    .assign(**{
        'BMI_0_18.5': lambda frame: frame['BMI'].lt(18.5).astype(int),
        'BMI_18.5_24': lambda frame: (frame['BMI'].ge(18.5) & frame['BMI'].lt(24)).astype(int),
        'BMI_24_27': lambda frame: (frame['BMI'].ge(24) & frame['BMI'].lt(27)).astype(int),
        'BMI_27_30': lambda frame: (frame['BMI'].ge(27) & frame['BMI'].lt(30)).astype(int),
        'BMI_30_35': lambda frame: (frame['BMI'].ge(30) & frame['BMI'].lt(35)).astype(int),
        'BMI_35_100': lambda frame: frame['BMI'].ge(35).astype(int),
    })
    .drop(columns=['BMI'], errors='ignore')
)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,Income_5.0,Income_6.0,Income_7.0,Income_8.0,BMI_0_18.5,BMI_18.5_24,BMI_24_27,BMI_27_30,BMI_30_35,BMI_35_100
8,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
10,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,1,0,0,1,0,0,0
13,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,0,1,0,0
17,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,1,0,0,0,0
23,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0


### MentHlth (each bin includes both of its bounds)
- 0
- 1~7
- 8~14
- 15~21
- 22~28
- 29~30

In [19]:
# Number of rows per distinct MentHlth value, listed in ascending value order.
df_diabetes['MentHlth'].value_counts(sort=False).sort_index()

0.0     23403
1.0       812
2.0      1508
3.0       924
4.0       489
5.0      1223
6.0       164
7.0       405
8.0       110
9.0        13
10.0     1064
11.0        3
12.0       67
13.0        8
14.0      198
15.0     1023
16.0       14
17.0       11
18.0       20
19.0        4
20.0      663
21.0       48
22.0       11
23.0        8
24.0        6
25.0      273
26.0        7
27.0       12
28.0       57
29.0       30
30.0     2768
Name: MentHlth, dtype: int64

In [22]:
# Swap the MentHlth column for six 0/1 range flags.
# Zero gets its own flag; the remaining ranges include both of their bounds
# (Series.between is inclusive on both ends by default).
df_diabetes = (
    df_diabetes
    .assign(
        MentHlth_0=lambda frame: frame['MentHlth'].eq(0).astype(int),
        MentHlth_1_7=lambda frame: frame['MentHlth'].between(1, 7).astype(int),
        MentHlth_8_14=lambda frame: frame['MentHlth'].between(8, 14).astype(int),
        MentHlth_15_21=lambda frame: frame['MentHlth'].between(15, 21).astype(int),
        MentHlth_22_28=lambda frame: frame['MentHlth'].between(22, 28).astype(int),
        MentHlth_29_30=lambda frame: frame['MentHlth'].between(29, 30).astype(int),
    )
    .drop(columns=['MentHlth'], errors='ignore')
)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,BMI_24_27,BMI_27_30,BMI_30_35,BMI_35_100,MentHlth_0,MentHlth_1_7,MentHlth_8_14,MentHlth_15_21,MentHlth_22_28,MentHlth_29_30
8,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,1
10,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1,0,0,0,1,0,0,0,0,0
13,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,0,0,1,0,0,0,0,0
17,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
23,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,1,0,0,1,0,0,0,0,0


### PhysHlth (each bin includes both of its bounds)
- 0
- 1~7
- 8~14
- 15~21
- 22~28
- 29~30

In [20]:
# Number of rows per distinct PhysHlth value, listed in ascending value order.
df_diabetes['PhysHlth'].value_counts(sort=False).sort_index()

0.0     16740
1.0      1188
2.0      2028
3.0      1289
4.0       763
5.0      1314
6.0       282
7.0       743
8.0       159
9.0        36
10.0     1290
11.0       12
12.0      129
13.0       16
14.0      473
15.0     1295
16.0       34
17.0       27
18.0       35
19.0        3
20.0      917
21.0      139
22.0       23
23.0       16
24.0       15
25.0      394
26.0       22
27.0       21
28.0      143
29.0       74
30.0     5726
Name: PhysHlth, dtype: int64

In [23]:
# Swap the PhysHlth column for six 0/1 range flags.
# Zero gets its own flag; the remaining ranges include both of their bounds
# (Series.between is inclusive on both ends by default).
df_diabetes = (
    df_diabetes
    .assign(
        PhysHlth_0=lambda frame: frame['PhysHlth'].eq(0).astype(int),
        PhysHlth_1_7=lambda frame: frame['PhysHlth'].between(1, 7).astype(int),
        PhysHlth_8_14=lambda frame: frame['PhysHlth'].between(8, 14).astype(int),
        PhysHlth_15_21=lambda frame: frame['PhysHlth'].between(15, 21).astype(int),
        PhysHlth_22_28=lambda frame: frame['PhysHlth'].between(22, 28).astype(int),
        PhysHlth_29_30=lambda frame: frame['PhysHlth'].between(29, 30).astype(int),
    )
    .drop(columns=['PhysHlth'], errors='ignore')
)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,MentHlth_8_14,MentHlth_15_21,MentHlth_22_28,MentHlth_29_30,PhysHlth_0,PhysHlth_1_7,PhysHlth_8_14,PhysHlth_15_21,PhysHlth_22_28,PhysHlth_29_30
8,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,0,1,0,0,0,0,0,1
10,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
13,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
17,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
23,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0


## Save

In [24]:
# Renumber the rows 0..n-1 (the old labels are discarded, not kept as a column),
# then print the final (rows, columns) shape.
df_diabetes = (
    df_diabetes
    .reset_index(drop=True)
)
print(df_diabetes.shape)

(35346, 64)


In [25]:
# Write the complete processed frame to disk, leaving the index out of the file.
df_diabetes.to_csv(path_or_buf='clean/diabetes_all.csv', index=False)

In [27]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible,
# then renumber the rows 0..n-1.
df_diabetes = (
    df_diabetes
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,MentHlth_8_14,MentHlth_15_21,MentHlth_22_28,MentHlth_29_30,PhysHlth_0,PhysHlth_1_7,PhysHlth_8_14,PhysHlth_15_21,PhysHlth_22_28,PhysHlth_29_30
0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [28]:
# Cut the frame into three consecutive positional slices:
# rows 0-11,999, rows 12,000-23,999, and everything from row 24,000 onward.
df_diabetes_part1, df_diabetes_part2, df_diabetes_part3 = (
    df_diabetes.iloc[start:stop]
    for start, stop in ((0, 12000), (12000, 24000), (24000, None))
)

# One CSV per slice, each written without the index.
df_diabetes_part1.to_csv(path_or_buf='clean/diabetes_part1.csv', index=False)
df_diabetes_part2.to_csv(path_or_buf='clean/diabetes_part2.csv', index=False)
df_diabetes_part3.to_csv(path_or_buf='clean/diabetes_part3.csv', index=False)