In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
import numpy as np

diabetes_dataset = pd.read_csv('../../../datasets/Diabetes_and_LifeStyle_Dataset.csv')
diabetes_dataset

Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,52,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.00,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.20,23.5,Type 2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97292,46,Male,Other,Graduate,Upper-Middle,Unemployed,Former,1,136,8.3,...,45,150,116,113,109,14.58,5.55,26.0,Pre-Diabetes,0
97293,41,Female,White,Graduate,Middle,Employed,Never,3,76,8.8,...,55,123,146,96,146,9.02,5.97,24.4,Pre-Diabetes,0
97294,57,Female,Black,No formal,Upper-Middle,Employed,Former,4,121,9.9,...,50,111,184,93,132,2.57,5.21,27.6,No Diabetes,0
97295,47,Female,Black,Highschool,Lower-Middle,Retired,Never,3,52,5.9,...,68,91,116,106,117,9.81,5.53,26.4,Pre-Diabetes,0


In [2]:
y = diabetes_dataset['diagnosed_diabetes']
X = diabetes_dataset.copy()
X.drop(columns=['diabetes_stage','diagnosed_diabetes'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=42
)

X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    train_size=0.5,
    random_state=42
)

print(
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape,
    X_val.shape,
    y_val.shape
)

(58378, 29) (58378,) (19459, 29) (19459,) (19460, 29) (19460,)


In [3]:
X_train

Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score
1114,52,Male,White,Graduate,Middle,Employed,Current,4,117,7.4,...,80,198,53,127,30,105,157,12.15,6.58,24.4
36685,66,Female,White,Graduate,Lower-Middle,Retired,Never,0,114,6.6,...,75,180,59,81,126,100,165,6.94,6.68,28.7
35104,25,Male,Black,Highschool,Low,Employed,Never,3,4,6.2,...,69,190,56,100,133,122,142,14.93,6.39,24.0
4905,44,Female,White,Highschool,Middle,Employed,Never,2,44,5.5,...,75,151,55,54,94,128,168,15.65,7.21,45.0
20393,51,Male,White,Postgraduate,Middle,Employed,Never,3,182,4.1,...,75,164,57,80,112,104,137,6.19,6.37,23.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,59,Male,White,No formal,High,Employed,Current,4,216,6.6,...,59,225,42,151,126,115,183,13.89,7.50,28.5
54886,43,Male,Black,Graduate,Lower-Middle,Employed,Former,4,146,9.2,...,59,215,46,120,54,103,187,2.00,7.14,20.2
76820,65,Male,White,Postgraduate,Middle,Employed,Never,6,40,8.7,...,58,254,50,175,165,111,140,12.16,5.99,30.8
860,69,Female,Asian,Highschool,Middle,Employed,Former,1,100,6.8,...,70,186,51,113,129,142,183,2.00,7.08,29.2


In [4]:
cat_cols_low_cardinality = [col for col in X_train.columns
                            if X_train[col].dtype == 'object'
                            and len(X_train[col].unique()) < 4]
cat_cols_low_cardinality

['gender', 'smoking_status']

In [5]:
cat_cols_high_cardinality = [col for col in X_train.columns
                             if X_train[col].dtype == 'object'
                             and len(X_train[col].unique()) >= 4]
cat_cols_high_cardinality

['ethnicity', 'education_level', 'income_level', 'employment_status']

In [6]:
num_cols = [col for col in X_train.columns
            if X_train[col].dtype in ['int64', 'float64']]
num_cols

['Age',
 'alcohol_consumption_per_week',
 'physical_activity_minutes_per_week',
 'diet_score',
 'sleep_hours_per_day',
 'screen_time_hours_per_day',
 'family_history_diabetes',
 'hypertension_history',
 'cardiovascular_history',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'diastolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'glucose_fasting',
 'glucose_postprandial',
 'insulin_level',
 'hba1c',
 'diabetes_risk_score']

In [7]:
num_cols_high_variance = [col for col in num_cols
                          if X_train[col].std() > 1]
num_cols_high_variance

['Age',
 'alcohol_consumption_per_week',
 'physical_activity_minutes_per_week',
 'diet_score',
 'sleep_hours_per_day',
 'screen_time_hours_per_day',
 'bmi',
 'systolic_bp',
 'diastolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'glucose_fasting',
 'glucose_postprandial',
 'insulin_level',
 'diabetes_risk_score']

In [8]:
X_train_preprocessed = X_train.copy()

col_transformer = ColumnTransformer([
    ('cat_cols_low_cardinality', OrdinalEncoder(dtype=int), cat_cols_low_cardinality),
    ('cat_cols_high_cardinality', OneHotEncoder(dtype=int, sparse_output=False), cat_cols_high_cardinality),
    ('num_col_high_variance', StandardScaler(), num_cols_high_variance)
], verbose_feature_names_out=False, remainder='passthrough')

X_train_preprocessed = col_transformer.fit_transform(X_train_preprocessed)
X_train_preprocessed

array([[1.  , 0.  , 0.  , ..., 0.  , 0.83, 6.58],
       [0.  , 2.  , 0.  , ..., 0.  , 0.81, 6.68],
       [1.  , 2.  , 0.  , ..., 0.  , 0.83, 6.39],
       ...,
       [1.  , 2.  , 0.  , ..., 1.  , 0.81, 5.99],
       [0.  , 1.  , 1.  , ..., 0.  , 0.8 , 7.08],
       [1.  , 0.  , 0.  , ..., 0.  , 0.9 , 6.37]])

In [9]:
col_transformer.get_feature_names_out()

array(['gender', 'smoking_status', 'ethnicity_Asian', 'ethnicity_Black',
       'ethnicity_Hispanic', 'ethnicity_Other', 'ethnicity_White',
       'education_level_Graduate', 'education_level_Highschool',
       'education_level_No formal', 'education_level_Postgraduate',
       'income_level_High', 'income_level_Low',
       'income_level_Lower-Middle', 'income_level_Middle',
       'income_level_Upper-Middle', 'employment_status_Employed',
       'employment_status_Retired', 'employment_status_Student',
       'employment_status_Unemployed', 'Age',
       'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total',
       'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
       'glucose_fasting', 'glucose_postprandial', 'insulin_level',
       'diabetes_risk_score', 'family_history_diabetes',
       'hypertension

In [10]:
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=col_transformer.get_feature_names_out())
X_train_preprocessed_df

Unnamed: 0,gender,smoking_status,ethnicity_Asian,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,education_level_Graduate,education_level_Highschool,education_level_No formal,...,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,diabetes_risk_score,family_history_diabetes,hypertension_history,cardiovascular_history,waist_to_hip_ratio,hba1c
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,-2.113326,-0.451595,-0.098731,0.625081,-0.645370,0.0,0.0,0.0,0.83,6.58
1,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.102197,-0.819331,0.159939,-0.427862,-0.172051,0.0,0.0,0.0,0.81,6.68
2,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.263746,0.798710,-0.583735,1.186920,-0.689399,0.0,0.0,0.0,0.83,6.39
3,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,-0.636310,1.239994,0.256939,1.332432,1.622158,1.0,0.0,0.0,0.95,7.21
4,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.220900,-0.525142,-0.745404,-0.579437,-0.711414,0.0,0.0,0.0,0.83,6.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58373,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.102197,0.283879,0.741944,0.976736,-0.194066,0.0,0.0,0.0,0.92,7.50
58374,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.559445,-0.598689,0.871279,-1.426238,-1.107681,0.0,0.0,0.0,0.80,7.14
58375,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.002254,-0.010311,-0.648403,0.627102,0.059105,0.0,1.0,1.0,0.81,5.99
58376,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.171433,2.269656,0.741944,-1.426238,-0.117014,0.0,0.0,0.0,0.80,7.08


In [11]:
X_train_preprocessed_df.columns

Index(['gender', 'smoking_status', 'ethnicity_Asian', 'ethnicity_Black',
       'ethnicity_Hispanic', 'ethnicity_Other', 'ethnicity_White',
       'education_level_Graduate', 'education_level_Highschool',
       'education_level_No formal', 'education_level_Postgraduate',
       'income_level_High', 'income_level_Low', 'income_level_Lower-Middle',
       'income_level_Middle', 'income_level_Upper-Middle',
       'employment_status_Employed', 'employment_status_Retired',
       'employment_status_Student', 'employment_status_Unemployed', 'Age',
       'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
       'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total',
       'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
       'glucose_fasting', 'glucose_postprandial', 'insulin_level',
       'diabetes_risk_score', 'family_history_diabetes',
       'hypertension_history', 'ca

In [12]:
cat_cols = [col for col in X_train_preprocessed_df.columns
            if col not in num_cols]
X_train_preprocessed_df[cat_cols] = X_train_preprocessed_df[cat_cols].astype(int)
X_train_preprocessed_df.head()

Unnamed: 0,gender,smoking_status,ethnicity_Asian,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,education_level_Graduate,education_level_Highschool,education_level_No formal,...,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,diabetes_risk_score,family_history_diabetes,hypertension_history,cardiovascular_history,waist_to_hip_ratio,hba1c
0,1,0,0,0,0,0,1,1,0,0,...,-2.113326,-0.451595,-0.098731,0.625081,-0.64537,0.0,0.0,0.0,0.83,6.58
1,0,2,0,0,0,0,1,1,0,0,...,0.102197,-0.819331,0.159939,-0.427862,-0.172051,0.0,0.0,0.0,0.81,6.68
2,1,2,0,1,0,0,0,0,1,0,...,0.263746,0.79871,-0.583735,1.18692,-0.689399,0.0,0.0,0.0,0.83,6.39
3,0,2,0,0,0,0,1,0,1,0,...,-0.63631,1.239994,0.256939,1.332432,1.622158,1.0,0.0,0.0,0.95,7.21
4,1,2,0,0,0,0,1,0,0,0,...,-0.2209,-0.525142,-0.745404,-0.579437,-0.711414,0.0,0.0,0.0,0.83,6.37
