In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Location of the raw CSV, relative to the notebook's working directory.
dataset_csv_path = '../../../../datasets/Diabetes_and_LifeStyle_Dataset.csv'

# Load the whole table; later cells derive the features and the target from it.
diabetes_dataset = pd.read_csv(filepath_or_buffer=dataset_csv_path)
diabetes_dataset

Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,52,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.00,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.20,23.5,Type 2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97292,46,Male,Other,Graduate,Upper-Middle,Unemployed,Former,1,136,8.3,...,45,150,116,113,109,14.58,5.55,26.0,Pre-Diabetes,0
97293,41,Female,White,Graduate,Middle,Employed,Never,3,76,8.8,...,55,123,146,96,146,9.02,5.97,24.4,Pre-Diabetes,0
97294,57,Female,Black,No formal,Upper-Middle,Employed,Former,4,121,9.9,...,50,111,184,93,132,2.57,5.21,27.6,No Diabetes,0
97295,47,Female,Black,Highschool,Lower-Middle,Retired,Never,3,52,5.9,...,68,91,116,106,117,9.81,5.53,26.4,Pre-Diabetes,0


In [2]:
# Target: the 'diagnosed_diabetes' column.
y = diabetes_dataset['diagnosed_diabetes']

# Features: every other column except 'diabetes_stage'
# (presumably excluded because it encodes the diagnosis -- TODO confirm).
# drop() returns a new frame, so diabetes_dataset itself is left untouched.
X = diabetes_dataset.drop(columns=['diagnosed_diabetes', 'diabetes_stage'])

# 60 % of the rows go to training; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, train_size=0.6, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(58378, 29) (58378,) (38919, 29) (38919,)


In [3]:
# Object-dtype columns of the training split with fewer than 4 distinct values.
# nunique(dropna=False) counts NaN as a value, exactly like len(.unique()).
is_object_col = X_train.dtypes == 'object'
cat_col_low_card = [
    name
    for name in is_object_col[is_object_col].index
    if X_train[name].nunique(dropna=False) < 4
]
cat_col_low_card

['gender', 'smoking_status']

In [4]:
# Replace each low-cardinality categorical column with integer codes.
# The encoder is fitted on the training split and kept in `ordinal_encoder`.
ordinal_encoder = OrdinalEncoder(dtype=int)
low_card_codes = ordinal_encoder.fit_transform(X_train[cat_col_low_card])

# Overwrite the original string columns of X_train with their codes.
X_train[cat_col_low_card] = low_card_codes
X_train[cat_col_low_card]

Unnamed: 0,gender,smoking_status
1114,1,0
36685,0,2
35104,1,2
4905,0,2
20393,1,2
...,...,...
6265,1,0
54886,1,1
76820,1,2
860,0,1


In [5]:
# Object-dtype columns of the training split with 4 or more distinct values.
# nunique(dropna=False) counts NaN as a value, exactly like len(.unique()).
# (The variable name's spelling is kept because later cells reference it.)
still_object_col = X_train.dtypes == 'object'
cat_col_high_cadinality = [
    name
    for name in still_object_col[still_object_col].index
    if X_train[name].nunique(dropna=False) >= 4
]
cat_col_high_cadinality

['ethnicity', 'education_level', 'income_level', 'employment_status']

In [6]:
# One-hot encode the higher-cardinality categorical columns as a dense int matrix.
one_hot_encoder = OneHotEncoder(dtype=int, sparse_output=False)
high_card_dummies = one_hot_encoder.fit_transform(X_train[cat_col_high_cadinality])

# Wrap the matrix in a frame labelled with the encoder's generated feature names.
# No index is passed, so the frame gets a default RangeIndex rather than
# X_train's index.
cat_col_high_encoded_df = pd.DataFrame(
    data=high_card_dummies,
    columns=one_hot_encoder.get_feature_names_out(),
)
cat_col_high_encoded_df.shape

(58378, 18)

In [7]:
# Shape of X_train before the raw high-cardinality categorical columns are dropped.
X_train.shape

(58378, 29)

In [8]:
# Remove the raw categorical columns now that their one-hot encoding lives in
# cat_col_high_encoded_df. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-dropped columns.
X_train = X_train.drop(columns=cat_col_high_cadinality, errors='ignore')
X_train.shape

(58378, 25)

In [9]:
# Append the one-hot columns to the remaining features, matching rows by position.
# cat_col_high_encoded_df has a default RangeIndex, so X_train's index is reset
# first; otherwise concat would align on index labels and pad with NaN.
# NOTE(review): y_train keeps the shuffled index produced by train_test_split,
# so after this cell X_train and y_train correspond by position only, not by
# index label.
assert len(X_train) == len(cat_col_high_encoded_df), (
    'X_train and the one-hot frame must have the same number of rows'
)
X_train = pd.concat(
    [
        # Dropping any one-hot columns appended by an earlier run makes a
        # re-execution of this cell replace them instead of duplicating them.
        X_train.reset_index(drop=True).drop(
            columns=cat_col_high_encoded_df.columns, errors='ignore'
        ),
        cat_col_high_encoded_df,
    ],
    axis=1,
)
X_train

Unnamed: 0,Age,gender,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,family_history_diabetes,hypertension_history,...,education_level_Postgraduate,income_level_High,income_level_Low,income_level_Lower-Middle,income_level_Middle,income_level_Upper-Middle,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed
0,52,1,0,4,117,7.4,6.9,7.9,0,0,...,0,0,0,0,1,0,1,0,0,0
1,66,0,2,0,114,6.6,7.7,1.9,0,0,...,0,0,0,1,0,0,0,1,0,0
2,25,1,2,3,4,6.2,5.8,10.4,0,0,...,0,0,1,0,0,0,1,0,0,0
3,44,0,2,2,44,5.5,8.1,8.5,1,0,...,0,0,0,0,1,0,1,0,0,0
4,51,1,2,3,182,4.1,7.5,4.7,0,0,...,1,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58373,59,1,0,4,216,6.6,5.7,3.1,0,0,...,0,1,0,0,0,0,1,0,0,0
58374,43,1,1,4,146,9.2,8.3,9.2,0,0,...,0,0,0,1,0,0,1,0,0,0
58375,65,1,2,6,40,8.7,7.2,0.5,0,1,...,1,0,0,0,1,0,1,0,0,0
58376,69,0,1,1,100,6.8,8.3,3.2,0,0,...,0,0,0,0,1,0,1,0,0,0
