In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

diabetes_dataset = pd.read_csv('../../../../datasets/Diabetes_and_LifeStyle_Dataset.csv')
diabetes_dataset.shape

(97297, 31)

In [2]:
y = diabetes_dataset['diagnosed_diabetes']
X = diabetes_dataset.copy()
X.drop(columns=['diagnosed_diabetes', 'diabetes_stage'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=42
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(58378, 29) (58378,) (38919, 29) (38919,)


In [3]:
cat_col_low_card = [col for col in X_train.columns
                   if X_train[col].dtype == 'object'
                   and len(X_train[col].unique()) < 4]
cat_col_low_card

['gender', 'smoking_status']

In [4]:
ordinal_encoder = OrdinalEncoder(dtype=int)
X_train[cat_col_low_card] = ordinal_encoder.fit_transform(X_train[cat_col_low_card])
X_train[cat_col_low_card]

Unnamed: 0,gender,smoking_status
1114,1,0
36685,0,2
35104,1,2
4905,0,2
20393,1,2
...,...,...
6265,1,0
54886,1,1
76820,1,2
860,0,1


In [5]:
cat_col_high_cadinality = [col for col in X_train.columns
                          if X_train[col].dtype == 'object'
                          and len(X_train[col].unique()) >= 4]
cat_col_high_cadinality

['ethnicity', 'education_level', 'income_level', 'employment_status']

In [6]:
one_hot_encoder = OneHotEncoder(dtype=int, sparse_output=False)
cat_col_high_encoded_df = pd.DataFrame(
    one_hot_encoder.fit_transform(X_train[cat_col_high_cadinality]),
    columns=one_hot_encoder.get_feature_names_out() 
)
cat_col_high_encoded_df.shape

(58378, 18)

In [7]:
X_train.shape
# X_train.drop(columns=cat_col_high_cadinality, inplace=True, axis=1)
# X_train = pd.concat([X_train, cat_col_high_encoded_df], axis=1)
# X_train

(58378, 29)

In [8]:
X_train.drop(columns=cat_col_high_cadinality, inplace=True)
X_train.shape

(58378, 25)

In [9]:
X_train = pd.concat([X_train, cat_col_high_encoded_df], axis=1)
X_train

Unnamed: 0,Age,gender,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,family_history_diabetes,hypertension_history,...,education_level_Postgraduate,income_level_High,income_level_Low,income_level_Lower-Middle,income_level_Middle,income_level_Upper-Middle,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed
1114,52.0,1.0,0.0,4.0,117.0,7.4,6.9,7.9,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
36685,66.0,0.0,2.0,0.0,114.0,6.6,7.7,1.9,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
35104,25.0,1.0,2.0,3.0,4.0,6.2,5.8,10.4,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4905,44.0,0.0,2.0,2.0,44.0,5.5,8.1,8.5,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
20393,51.0,1.0,2.0,3.0,182.0,4.1,7.5,4.7,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58371,,,,,,,,,,,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
58372,,,,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
58374,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
58375,,,,,,,,,,,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
