In [1]:
import joblib
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import (OneHotEncoder, OrdinalEncoder, StandardScaler)


# Load Data

In [2]:
# Read the raw diabetes dataset (a CSV in the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='diabetes_dataset.csv')

In [3]:
# Peek at the first five records to sanity-check the load.
data.head(n=5)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.2,23.5,Type 2,1


# Data Cleaning

In [4]:
# Total number of missing cells in the whole frame:
# per-column counts first, then the grand total.
data.isna().sum(axis=0).sum()

np.int64(0)

In [5]:
# Number of fully duplicated rows. `duplicated()` returns a single boolean
# Series, so one reduction already yields the scalar count.
data.duplicated().sum()

np.int64(0)

In [6]:
# Rank every numeric column by its correlation with the target,
# strongest positive relationship first.
correlations = (
    data.corr(numeric_only=True)
    .loc[:, 'diagnosed_diabetes']
    .sort_values(ascending=False)
)
correlations

diagnosed_diabetes                    1.000000
hba1c                                 0.679397
glucose_postprandial                  0.629832
glucose_fasting                       0.510919
diabetes_risk_score                   0.277300
family_history_diabetes               0.197926
age                                   0.137713
bmi                                   0.097057
systolic_bp                           0.095481
waist_to_hip_ratio                    0.078918
ldl_cholesterol                       0.067475
cholesterol_total                     0.058173
insulin_level                         0.057715
triglycerides                         0.056230
diastolic_bp                          0.035619
cardiovascular_history                0.029793
hypertension_history                  0.027524
heart_rate                            0.022785
screen_time_hours_per_day             0.018127
alcohol_consumption_per_week          0.000760
sleep_hours_per_day                  -0.000399
diet_score   

In [8]:
# Drop the columns that leak the target or carry no useful signal:
#   - diabetes_stage / diabetes_risk_score: presumably derived from the
#     diagnosis itself, so keeping them would leak the label (TODO confirm
#     against the dataset documentation).
#   - alcohol_consumption_per_week / sleep_hours_per_day: near-zero
#     correlation with diagnosed_diabetes in the ranking computed earlier.

cols_to_drop = ['diabetes_risk_score', 'alcohol_consumption_per_week', 'sleep_hours_per_day', 'diabetes_stage']

# `data` is overwritten here, so on a second execution of this cell the
# columns are already gone; errors='ignore' keeps the cell re-runnable
# instead of raising KeyError.
data = data.drop(columns=cols_to_drop, errors='ignore')

In [20]:
# List the columns currently present in `data`.
data.columns

Index(['age', 'gender', 'ethnicity', 'education_level', 'income_level',
       'employment_status', 'smoking_status',
       'physical_activity_minutes_per_week', 'diet_score',
       'screen_time_hours_per_day', 'family_history_diabetes',
       'hypertension_history', 'cardiovascular_history', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'glucose_fasting', 'glucose_postprandial',
       'insulin_level', 'hba1c', 'diagnosed_diabetes'],
      dtype='object')

# Data Transformation

## Handle Categorical Features

In [9]:
# Keep only the object-dtype (string) columns; these are the ones that
# need to be encoded.
Categorical_data = data.select_dtypes(include='object')

In [10]:
# Preview the first five rows of the categorical subset.
Categorical_data.head(n=5)

Unnamed: 0,gender,ethnicity,education_level,income_level,employment_status,smoking_status
0,Male,Asian,Highschool,Lower-Middle,Employed,Never
1,Female,White,Highschool,Middle,Employed,Former
2,Male,Hispanic,Highschool,Middle,Unemployed,Never
3,Female,Black,Highschool,Low,Retired,Never
4,Male,White,Graduate,Middle,Retired,Never


In [11]:
# Plain Python list of the categorical column names, in frame order.
Categorical_features = list(Categorical_data.columns)
Categorical_features

['gender',
 'ethnicity',
 'education_level',
 'income_level',
 'employment_status',
 'smoking_status']

In [12]:
# Frequency of each education level, to inspect the distinct values
# before fixing their ordinal order.
Categorical_data.loc[:, 'education_level'].value_counts()

education_level
Highschool      44891
Graduate        35037
Postgraduate    14972
No formal        5100
Name: count, dtype: int64

In [13]:
# Frequency of each income level, to inspect the distinct values
# before fixing their ordinal order.
Categorical_data.loc[:, 'income_level'].value_counts()

income_level
Middle          35152
Lower-Middle    25150
Upper-Middle    19866
Low             14830
High             5002
Name: count, dtype: int64

In [14]:
# Split of the categorical columns by how they will be encoded.
ordinal_features = [
    'education_level',
    'income_level',
]
nominal_features = [
    'gender',
    'ethnicity',
    'employment_status',
    'smoking_status',
]

# Explicit low-to-high category order for each ordinal column.
education_level_order = [
    'No formal',
    'Highschool',
    'Graduate',
    'Postgraduate',
]
income_level_order = [
    'Low',
    'Lower-Middle',
    'Middle',
    'Upper-Middle',
    'High',
]

In [15]:
# Work on a copy so the `data` frame itself keeps its original columns.
encoded_df = data.copy()

# Ordinal columns -> numeric codes that follow the explicit category orders
# (one order list per column, in the same sequence as `ordinal_features`).
oe = OrdinalEncoder(categories=[education_level_order, income_level_order])
encoded_df[ordinal_features] = oe.fit_transform(data[ordinal_features])

# Nominal columns -> dense one-hot matrix; a column with exactly two
# categories is collapsed to a single indicator (drop='if_binary').
ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
ohe_features = ohe.fit_transform(data[nominal_features])

In [16]:

# Put the one-hot matrix in a DataFrame with proper column names.
# The matrix was computed from `data[nominal_features]`, so it is given
# `data`'s index: pd.concat(axis=1) aligns on index labels, and a fresh
# RangeIndex would misalign rows (and introduce NaNs) if `data` ever has a
# non-default index, e.g. after filtering rows.
ohe_features_df = pd.DataFrame(
    ohe_features,
    columns=ohe.get_feature_names_out(),
    index=data.index,
)

# Replace the original nominal columns with their one-hot encoded versions.
# Dropping first (and not in place) means an accidental re-run of this cell
# fails on the drop before any duplicate one-hot columns are appended.
encoded_df = pd.concat(
    (encoded_df.drop(columns=nominal_features), ohe_features_df),
    axis=1,
)

In [17]:
# Preview the first five rows of the fully encoded frame.
encoded_df.head(n=5)

Unnamed: 0,age,education_level,income_level,physical_activity_minutes_per_week,diet_score,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,...,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed,smoking_status_Current,smoking_status_Former,smoking_status_Never
0,58,1.0,1.0,215,5.7,7.9,0,0,0,30.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,48,1.0,2.0,143,6.7,8.7,0,0,0,23.1,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,60,1.0,2.0,57,6.4,8.1,1,0,0,22.2,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,74,1.0,0.0,49,3.4,5.2,0,0,0,26.8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,46,2.0,2.0,109,7.2,5.0,0,0,0,21.2,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# Persist the fitted ordinal encoder to disk.
joblib.dump(value=oe, filename='OrdinalEncoder_diabetes.pkl')

['OrdinalEncoder_diabetes.pkl']

In [19]:
# Persist the fitted one-hot encoder to disk.
joblib.dump(value=ohe, filename='OneHotEncoder_diabetes.pkl')

['OneHotEncoder_diabetes.pkl']

## Standardize numerical features

In [37]:
# Scaler that centers each column on its mean and scales it to unit variance
# (the library defaults, spelled out for the reader).
scaler = StandardScaler(with_mean=True, with_std=True)

In [47]:
# Columns that must not be rescaled: ordinal codes, one-hot indicators and
# the target itself.
one_hot_features = ohe_features_df.columns.tolist()
passthrough_columns = ordinal_features + one_hot_features + ['diagnosed_diabetes']

features_not_std = encoded_df[passthrough_columns]
features_std = encoded_df.drop(columns=passthrough_columns)

# Fit the scaler on the remaining columns and standardize them.
# NOTE(review): this also rescales the 0/1 history flags
# (family_history_diabetes, hypertension_history, cardiovascular_history);
# confirm that is intended.
Matrix_std = scaler.fit_transform(features_std)

# Carry over the original row index: pd.concat(axis=1) aligns on index
# labels, so a fresh RangeIndex here would misalign rows (and introduce NaNs)
# whenever `encoded_df` does not have a default 0..n-1 index.
features_new = pd.DataFrame(
    Matrix_std,
    columns=features_std.columns,
    index=features_std.index,
)
data_std = pd.concat([features_new, features_not_std], axis=1)
data_std.head()


Unnamed: 0,age,physical_activity_minutes_per_week,diet_score,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,waist_to_hip_ratio,systolic_bp,...,ethnicity_Other,ethnicity_White,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed,smoking_status_Current,smoking_status_Former,smoking_status_Never,diagnosed_diabetes
0,0.504956,1.138363,-0.165523,0.771162,-0.530172,-0.578582,-0.293278,1.362636,0.724256,1.27418,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,-0.135884,0.285376,0.395977,1.09526,-0.530172,-0.578582,-0.293278,-0.70055,-1.1973,0.924138,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,0.633124,-0.73347,0.227527,0.852187,1.886181,-0.578582,-0.293278,-0.951478,-0.983794,-0.055979,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,1.530299,-0.828246,-1.456972,-0.322667,-0.530172,-0.578582,-0.293278,0.331043,0.51075,0.294063,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,-0.264052,-0.117424,0.676727,-0.403691,-0.530172,-0.578582,-0.293278,-1.230287,-1.624313,-1.666172,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


In [48]:
# Persist the fitted scaler to disk.
joblib.dump(value=scaler, filename='scaler_diabetes.pkl')

['scaler_diabetes.pkl']

In [49]:
# Write the processed dataset. index=False keeps the meaningless row index
# out of the file; otherwise it is saved as an unnamed first column that
# comes back as a spurious 'Unnamed: 0' feature when the CSV is reloaded.
data_std.to_csv('diabetes_final.csv', index=False)