# Data Preprocessing

In [1]:
from pathlib import Path

import pandas as pd

## Read Dataset

In [2]:
# Load the raw BRFSS 2015 health-indicator table and show its size and first rows.
raw_csv_path = 'raw/diabetes_binary_health_indicators_BRFSS2015.csv'
df_raw = pd.read_csv(raw_csv_path)
print(df_raw.shape)
df_raw.head()

(253680, 22)


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


### One-hot encode the categorical columns
- GenHlth
- Age
- Education
- Income

In [3]:
# Columns to one-hot encode: every distinct value becomes its own indicator column.
columns = ['GenHlth', 'Age', 'Education', 'Income']
# Cast the codes to strings on a copy so df_raw keeps its original dtypes
# (the raw frame shown earlier stays valid). The string form, e.g. '3.0',
# is what appears in the generated column names such as 'Income_3.0'.
df_categorical = df_raw.astype({column: str for column in columns})
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would otherwise
# produce booleans, which would be written to CSV as True/False.
df_dummy = pd.get_dummies(df_categorical, columns=columns, prefix=columns, dtype=int)
df_dummy.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,Education_5.0,Education_6.0,Income_1.0,Income_2.0,Income_3.0,Income_4.0,Income_5.0,Income_6.0,Income_7.0,Income_8.0
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0,1,1,0,0,0,0,0,0,0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1,0,0,0,0,1,0,0,0,0


### Filter who has diabetes (currently disabled: all rows are kept and only the `Diabetes_binary` label is dropped)

In [4]:
# The diabetes-only row filter is switched off, so every row of df_dummy is kept:
#   df_dummy[df_dummy['Diabetes_binary'] == 1]
# Only the label column is removed; errors='ignore' tolerates it being absent.
label_columns = ['Diabetes_binary']
df_diabetes = df_dummy.drop(columns=label_columns, errors='ignore')
print(df_diabetes.shape)
df_diabetes.head()

(253680, 49)


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,Education_5.0,Education_6.0,Income_1.0,Income_2.0,Income_3.0,Income_4.0,Income_5.0,Income_6.0,Income_7.0,Income_8.0
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0,1,1,0,0,0,0,0,0,0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,1,0,0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1,0,0,0,0,1,0,0,0,0


### BMI
- 0~18.5
- 18.5~24
- 24~27
- 27~30
- 30~35
- 35~100

In [5]:
# Range check before binning: smallest and largest BMI present in the data.
bmi_values = df_diabetes['BMI']
(bmi_values.min(), bmi_values.max())

(12.0, 98.0)

In [6]:
# Replace the numeric BMI column with six 0/1 band indicators.
# Bands are half-open [low, high); the first and last bands are open-ended.
bmi = df_diabetes['BMI']
bmi_band_masks = {
    'BMI_0_18.5': bmi.lt(18.5),
    'BMI_18.5_24': bmi.ge(18.5) & bmi.lt(24),
    'BMI_24_27': bmi.ge(24) & bmi.lt(27),
    'BMI_27_30': bmi.ge(27) & bmi.lt(30),
    'BMI_30_35': bmi.ge(30) & bmi.lt(35),
    'BMI_35_100': bmi.ge(35),
}
# Append the indicators in the order listed above, then remove the source column.
df_diabetes = (
    df_diabetes
    .assign(**{band: mask.astype(int) for band, mask in bmi_band_masks.items()})
    .drop(columns=['BMI'], errors='ignore')
)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,Income_5.0,Income_6.0,Income_7.0,Income_8.0,BMI_0_18.5,BMI_18.5_24,BMI_24_27,BMI_27_30,BMI_30_35,BMI_35_100
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,1,0,0,0,1,0,0
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,1,0,0,0,0,0,1,0,0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,1,0,0,0


### MentHlth
- 0
- 1~7
- 8~14
- 15~21
- 22~28
- 29~30

In [7]:
# Frequency of each MentHlth value, listed in ascending value order.
ment_counts = df_diabetes['MentHlth'].value_counts()
ment_counts.sort_index()

0.0     175680
1.0       8538
2.0      13054
3.0       7381
4.0       3789
5.0       9030
6.0        988
7.0       3100
8.0        639
9.0         91
10.0      6373
11.0        41
12.0       398
13.0        41
14.0      1167
15.0      5505
16.0        88
17.0        54
18.0        97
19.0        16
20.0      3364
21.0       227
22.0        63
23.0        38
24.0        33
25.0      1188
26.0        45
27.0        79
28.0       327
29.0       158
30.0     12088
Name: MentHlth, dtype: int64

In [8]:
# Replace MentHlth with 0/1 indicators: one for exactly 0 and one per
# inclusive range [first, last] listed below.
ment = df_diabetes['MentHlth']
ment_masks = {
    'MentHlth_0': ment.eq(0),
    'MentHlth_1_7': ment.ge(1) & ment.le(7),
    'MentHlth_8_14': ment.ge(8) & ment.le(14),
    'MentHlth_15_21': ment.ge(15) & ment.le(21),
    'MentHlth_22_28': ment.ge(22) & ment.le(28),
    'MentHlth_29_30': ment.ge(29) & ment.le(30),
}
# Append the indicators in the order listed above, then remove the source column.
df_diabetes = (
    df_diabetes
    .assign(**{name: mask.astype(int) for name, mask in ment_masks.items()})
    .drop(columns=['MentHlth'], errors='ignore')
)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,BMI_24_27,BMI_27_30,BMI_30_35,BMI_35_100,MentHlth_0,MentHlth_1_7,MentHlth_8_14,MentHlth_15_21,MentHlth_22_28,MentHlth_29_30
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,1,0,0,0,1,0,0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,0,1,0,0,0,0,0
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,1,0,0,1,0,0,0,0,0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1,0,0,0,0,1,0,0,0,0


### PhysHlth
- 0
- 1~7
- 8~14
- 15~21
- 22~28
- 29~30

In [9]:
# Frequency of each PhysHlth value, listed in ascending value order.
phys_counts = df_diabetes['PhysHlth'].value_counts()
phys_counts.sort_index()

0.0     160052
1.0      11388
2.0      14764
3.0       8495
4.0       4542
5.0       7622
6.0       1330
7.0       4538
8.0        809
9.0        179
10.0      5595
11.0        60
12.0       578
13.0        68
14.0      2587
15.0      4916
16.0       112
17.0        96
18.0       152
19.0        22
20.0      3273
21.0       663
22.0        70
23.0        56
24.0        72
25.0      1336
26.0        69
27.0        99
28.0       522
29.0       215
30.0     19400
Name: PhysHlth, dtype: int64

In [10]:
# Replace PhysHlth with 0/1 indicators: one for exactly 0 and one per
# inclusive range [first, last] listed below.
phys = df_diabetes['PhysHlth']
phys_masks = {
    'PhysHlth_0': phys.eq(0),
    'PhysHlth_1_7': phys.ge(1) & phys.le(7),
    'PhysHlth_8_14': phys.ge(8) & phys.le(14),
    'PhysHlth_15_21': phys.ge(15) & phys.le(21),
    'PhysHlth_22_28': phys.ge(22) & phys.le(28),
    'PhysHlth_29_30': phys.ge(29) & phys.le(30),
}
# Append the indicators in the order listed above, then remove the source column.
df_diabetes = (
    df_diabetes
    .assign(**{name: mask.astype(int) for name, mask in phys_masks.items()})
    .drop(columns=['PhysHlth'], errors='ignore')
)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,MentHlth_8_14,MentHlth_15_21,MentHlth_22_28,MentHlth_29_30,PhysHlth_0,PhysHlth_1_7,PhysHlth_8_14,PhysHlth_15_21,PhysHlth_22_28,PhysHlth_29_30
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,0,1,0,0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,1
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0


### Drop CholCheck and AnyHealthcare

In [11]:
# Remove the two columns that are not carried into the cleaned data set;
# errors='ignore' makes the drop a no-op for any column that is already gone.
unused_columns = ['CholCheck', 'AnyHealthcare']
df_diabetes = df_diabetes.drop(columns=unused_columns, errors='ignore')

### Flip some column values

In [12]:
# Invert PhysActivity, Fruits and Veggies: a 0 becomes 1, anything else becomes 0.
# Note: running this cell a second time flips the values back.
for flipped in ('PhysActivity', 'Fruits', 'Veggies'):
    df_diabetes[flipped] = df_diabetes[flipped].eq(0).astype(int)

## Save

In [13]:
# Renumber the rows from 0 (discarding the old index) and report the final size.
df_diabetes = df_diabetes.reset_index(drop=True)
final_shape = df_diabetes.shape
print(final_shape)

(253680, 62)


In [14]:
# Write the full preprocessed table, without the index column.
# Create the output folder first: to_csv does not create missing parent
# directories and would raise on a fresh checkout without 'clean/'.
Path('clean').mkdir(parents=True, exist_ok=True)
df_diabetes.to_csv('clean/diabetes_all.csv', index=False)

In [15]:
# Shuffle all rows; the fixed seed makes the permutation reproducible.
shuffled = df_diabetes.sample(frac=1, random_state=0)
# Renumber from 0 so the index matches the new row positions.
df_diabetes = shuffled.reset_index(drop=True)
df_diabetes.head()

Unnamed: 0,HighBP,HighChol,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,NoDocbcCost,...,MentHlth_8_14,MentHlth_15_21,MentHlth_22_28,MentHlth_29_30,PhysHlth_0,PhysHlth_1_7,PhysHlth_8_14,PhysHlth_15_21,PhysHlth_22_28,PhysHlth_29_30
0,0.0,0.0,1.0,0.0,0.0,0,0,0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,0.0,0.0,1.0,0.0,0.0,1,1,1,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,0.0,0.0,1.0,0.0,0.0,0,1,0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,1.0,1.0,0.0,0.0,0.0,0,1,0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,1.0,1.0,1.0,0.0,0.0,0,0,0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [17]:
# Cut the frame into three consecutive, non-overlapping row ranges:
# rows [0, 5000), rows [5000, 55000) and everything from row 55000 on.
first_cut, second_cut = 5000, 55000
df_diabetes_part1 = df_diabetes.iloc[:first_cut]
df_diabetes_part2 = df_diabetes.iloc[first_cut:second_cut]
df_diabetes_part3 = df_diabetes.iloc[second_cut:]

# Write each part to its own CSV, without the index column.
part_files = {
    'clean/diabetes_part1.csv': df_diabetes_part1,
    'clean/diabetes_part2.csv': df_diabetes_part2,
    'clean/diabetes_part3.csv': df_diabetes_part3,
}
for csv_path, part in part_files.items():
    part.to_csv(csv_path, index=False)