In [1]:
# Core data libraries used throughout the notebook.
import pandas as pd
import numpy as np
# Allow up to 50 columns in DataFrame displays so the 48-column frame is shown without truncation.
pd.set_option('display.max_columns', 50)
# NOTE(review): seaborn is not used in the cells visible here — presumably needed later; confirm.
import seaborn as sns
# Print NumPy scalars in the NumPy 1.25 legacy style (plain values, without type information).
np.set_printoptions(legacy='1.25')

In [2]:
# Relative path of the raw input CSV; it is resolved against the notebook's working directory.
path = 'Data/health_lifestyle_classification.csv'

In [3]:
# Load the raw CSV once and keep `data` untouched; all cleaning happens on the copy `df`,
# so the original shape can still be compared against the cleaned frame later on.
# NOTE(review): with default settings read_csv parses the literal string "None" as NaN.
# If the raw file uses "None" as a real category (e.g. for alcohol_consumption), those rows
# will show up as missing — confirm against the raw file.
data = pd.read_csv(path)
df = data.copy()

In [4]:
# (rows, columns) of the freshly loaded frame.
df.shape

(100000, 48)

In [5]:
# Preview the first five rows to get a feel for the columns and their value formats.
df.head()

Unnamed: 0,survey_code,age,gender,height,weight,bmi,bmi_estimated,bmi_scaled,bmi_corrected,waist_size,blood_pressure,heart_rate,cholesterol,glucose,insulin,sleep_hours,sleep_quality,work_hours,physical_activity,daily_steps,calorie_intake,sugar_intake,alcohol_consumption,smoking_level,water_intake,screen_time,stress_level,mental_health_score,mental_health_support,education_level,job_type,occupation,income,diet_type,exercise_type,device_usage,healthcare_access,insurance,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage,target
0,1,56,Male,173.416872,56.88664,18.915925,18.915925,56.747776,18.989117,72.16513,118.264254,60.749825,214.580523,103.008176,,6.475885,Fair,7.671313,0.356918,13320.942595,2673.54696,44.476887,,Non-smoker,1.694262,5.003963,2,8,No,PhD,Tech,Farmer,6759.821719,Vegan,Strength,High,Poor,No,High,5,Moderate,No,Yes,0,1.0,5.5,-2.275502,healthy
1,2,69,Female,163.20738,97.799859,36.716278,36.716278,110.148833,36.511417,85.598889,117.917986,66.463696,115.794002,116.905134,10.131597,8.42841,Good,9.515198,0.568219,11911.201401,2650.376972,74.663405,Regularly,Light,0.716409,5.925455,3,9,No,High School,Office,Engineer,6240.51769,Vegan,Cardio,Moderate,Moderate,No,High,5,High,Yes,No,0,1.0,5.5,6.23934,healthy
2,3,46,Male,177.281966,80.687562,25.67305,25.67305,77.019151,25.587429,90.29503,123.073698,76.043212,138.134787,89.180302,,5.702164,Poor,5.829853,3.764406,2974.035375,1746.755144,19.702382,Regularly,Heavy,2.4879,4.37125,0,1,No,Master,Office,Teacher,3429.179266,Vegan,Cardio,High,Good,Yes,High,4,Moderate,No,No,0,1.0,5.5,5.423737,healthy
3,4,32,Female,172.101255,63.142868,21.31848,21.31848,63.95544,21.177109,100.504211,148.173453,68.781981,203.017447,128.375798,18.733179,5.188316,Good,9.489693,0.889474,5321.539497,2034.193242,82.58005,Occasionally,Heavy,2.643335,4.116064,10,4,No,Master,Labor,Teacher,2618.503534,Vegetarian,Mixed,Low,Moderate,No,High,1,,No,Yes,0,1.0,5.5,8.388611,healthy
4,5,60,Female,163.608816,40.0,14.943302,14.943302,44.829907,14.844299,69.02115,150.613181,92.335358,200.412439,94.813332,16.038701,7.912514,Good,7.27545,2.901608,9791.376712,2386.210257,45.961322,,Heavy,1.968393,3.180087,9,7,Yes,Master,Unemployed,Doctor,3662.086276,Vegan,,Low,Moderate,Yes,High,1,High,Yes,Yes,0,1.0,5.5,0.332622,healthy


In [6]:
# Per-column dtype and non-null count; columns with fewer non-null entries than rows contain gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   survey_code               100000 non-null  int64  
 1   age                       100000 non-null  int64  
 2   gender                    100000 non-null  object 
 3   height                    100000 non-null  float64
 4   weight                    100000 non-null  float64
 5   bmi                       100000 non-null  float64
 6   bmi_estimated             100000 non-null  float64
 7   bmi_scaled                100000 non-null  float64
 8   bmi_corrected             100000 non-null  float64
 9   waist_size                100000 non-null  float64
 10  blood_pressure            92331 non-null   float64
 11  heart_rate                85997 non-null   float64
 12  cholesterol               100000 non-null  float64
 13  glucose                   100000 non-null  fl

In [7]:
## Fraction of missing entries per column (0.0 = complete, 1.0 = entirely missing).
## Columns with gaps: blood_pressure, heart_rate, insulin, daily_steps, alcohol_consumption,
## income, exercise_type, caffeine_intake, gene_marker_flag
df.isna().mean()

survey_code                 0.00000
age                         0.00000
gender                      0.00000
height                      0.00000
weight                      0.00000
bmi                         0.00000
bmi_estimated               0.00000
bmi_scaled                  0.00000
bmi_corrected               0.00000
waist_size                  0.00000
blood_pressure              0.07669
heart_rate                  0.14003
cholesterol                 0.00000
glucose                     0.00000
insulin                     0.15836
sleep_hours                 0.00000
sleep_quality               0.00000
work_hours                  0.00000
physical_activity           0.00000
daily_steps                 0.08329
calorie_intake              0.00000
sugar_intake                0.00000
alcohol_consumption         0.42387
smoking_level               0.00000
water_intake                0.00000
screen_time                 0.00000
stress_level                0.00000
mental_health_score         

In [8]:
## 'survey_code' is a unique row identifier and carries no signal, so it is removed.
df = df.drop('survey_code', axis='columns')

## Check for Duplicates

In [9]:
## Count rows that are exact duplicates of an earlier row (0 means every row is unique).
df.duplicated().sum()

0

In [10]:
## Detect pairs of columns whose contents are exactly equal.
## The `left < right` name comparison reports each unordered pair only once
## and skips comparing a column with itself.
identical_cols = [
    (left, right)
    for left in df.columns
    for right in df.columns
    if left < right and df[left].equals(df[right])
]

identical_cols

[('bmi', 'bmi_estimated')]

In [11]:
## Keep "bmi_corrected" as the single BMI column and remove the other three.
bmi_columns_to_drop = ['bmi_estimated', 'bmi_scaled', 'bmi']
df = df.drop(columns=bmi_columns_to_drop)

## Investigate each variable with missing values

In [12]:
## Collect every column that still has missing values, paired with its missing share
## formatted as a percentage string (e.g. '42.39%'), ordered alphabetically by column name.
## Numerical columns are left as-is at this stage; they are kept for imputation and EDA.
missing_share = df.isna().mean()  # fraction of missing rows per column, computed once
missing_vars = [
    (col, f"{missing_share[col]:,.2%}")
    for col in sorted(df.columns)
    if missing_share[col] > 0
]

print("Variables and their Missing Percentages")
missing_vars

Variables and their Missing Percentages


[('alcohol_consumption', '42.39%'),
 ('blood_pressure', '7.67%'),
 ('caffeine_intake', '33.26%'),
 ('daily_steps', '8.33%'),
 ('exercise_type', '24.97%'),
 ('gene_marker_flag', '10.47%'),
 ('heart_rate', '14.00%'),
 ('income', '8.47%'),
 ('insulin', '15.84%')]

In [13]:
## For each column with gaps, list its distinct values when the column is object-typed
## or has fewer than 10 distinct values (NaN counts as one of them).
for name, _pct in missing_vars:
    distinct = df[name].unique()
    if df[name].dtype == 'object' or len(distinct) < 10:
        print(name, distinct)
        print("\n")

alcohol_consumption [nan 'Regularly' 'Occasionally']


caffeine_intake ['Moderate' 'High' nan]


exercise_type ['Strength' 'Cardio' 'Mixed' nan]


gene_marker_flag [ 1. nan]




In [14]:
## Replace NaN with the literal label "Missing" in the low-cardinality columns,
## so that missingness is kept as its own category.
## NOTE(review): gene_marker_flag holds floats (1.0 / NaN), so after this fill the column
## mixes numbers and strings — confirm that this is intended.
for col in ('alcohol_consumption', 'caffeine_intake', 'exercise_type', 'gene_marker_flag'):
    df[col] = df[col].fillna("Missing")

In [15]:
# Shape after cleaning: 4 columns fewer than the raw frame (survey_code plus three BMI columns), same row count.
print(df.shape)

(100000, 44)


In [16]:
# Shape of the untouched raw frame, for comparison with the cleaned one.
print(data.shape)

(100000, 48)


In [17]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv("Data/cleanedHealthData.csv", index=False)