In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import (OneHotEncoder, OrdinalEncoder, StandardScaler)

# Load Data

In [2]:
# Single place to change the dataset location; every CSV below is read from it.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of the dataset before running on another machine.
DATA_DIR = Path(r"C:\Users\USER\AI-Projects\ML Projects\Diabetes Prediction\Synthetic Data\Dataset")

# original_data: full source dataset; train_data / test_data: the provided splits.
original_data = pd.read_csv(DATA_DIR / "diabetes_dataset.csv")
train_data = pd.read_csv(DATA_DIR / "train.csv")
test_data = pd.read_csv(DATA_DIR / "test.csv")

In [3]:
# Columns of the original dataset that do not appear in the training split,
# kept in the original dataset's column order.
absent_from_train = ~original_data.columns.isin(train_data.columns)
diff_features = original_data.columns[absent_from_train].tolist()
diff_features

['glucose_fasting',
 'glucose_postprandial',
 'insulin_level',
 'hba1c',
 'diabetes_risk_score',
 'diabetes_stage']

In [4]:
# Remove from the original dataset the columns the training split does not have.
original_data_new = original_data.drop(columns=diff_features)

# Remove the 'id' column from the training split so that both frames can be
# compared (and later stacked) column-for-column.
train_data_new = train_data.drop(columns='id')

In [5]:
# Sanity check: columns still present in the trimmed training frame but missing
# from the trimmed original frame (expected to be empty).
# Stored under its own name instead of rebinding `diff_features`, so the drop
# list consumed by the cell that builds `original_data_new` stays intact and
# that cell remains safe to re-run.
train_only_features = [col for col in train_data_new.columns if col not in original_data_new.columns]
train_only_features

[]

In [6]:
# Sanity check in the other direction: columns still present in the trimmed
# original frame but missing from the trimmed training frame (expected to be empty).
# Stored under its own name instead of rebinding `diff_features`, so the drop
# list consumed by the cell that builds `original_data_new` stays intact and
# that cell remains safe to re-run.
original_only_features = [col for col in original_data_new.columns if col not in train_data_new.columns]
original_only_features

[]

In [7]:
# Stack the two frames, shuffle all rows with a fixed seed for reproducibility,
# and renumber the index 0..n-1 so later column-wise concatenations line up.
new_data = (
    pd.concat([train_data_new, original_data_new])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
new_data.head()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,59,2,72,7.4,7.7,5.1,22.6,0.81,118,82,...,Male,White,Graduate,Middle,Current,Employed,0,0,0,1.0
1,53,1,82,4.2,7.4,5.3,24.2,0.84,122,82,...,Male,Hispanic,Postgraduate,Lower-Middle,Never,Employed,0,0,0,0.0
2,49,3,66,7.0,7.7,7.4,20.3,0.79,108,72,...,Female,White,Highschool,Middle,Never,Employed,0,0,0,1.0
3,46,2,97,7.1,7.5,9.5,25.2,0.86,127,76,...,Female,White,Highschool,Middle,Never,Unemployed,0,0,0,1.0
4,49,2,34,6.0,8.1,6.0,22.2,0.79,127,71,...,Male,White,Postgraduate,Middle,Never,Employed,0,0,0,0.0


In [8]:
#new_data.to_csv('new_diabetes_dataset.csv', index=False)

# Data Transformation

## Handle Categorical Features

In [9]:
# Keep only the object-dtype columns (the string-valued categorical features).
Categorical_data = new_data.select_dtypes(include='object')
Categorical_data.head()

Unnamed: 0,gender,ethnicity,education_level,income_level,smoking_status,employment_status
0,Male,White,Graduate,Middle,Current,Employed
1,Male,Hispanic,Postgraduate,Lower-Middle,Never,Employed
2,Female,White,Highschool,Middle,Never,Employed
3,Female,White,Highschool,Middle,Never,Unemployed
4,Male,White,Postgraduate,Middle,Never,Employed


In [10]:
# Names of the categorical columns as a plain Python list.
Categorical_features = list(Categorical_data.columns)
Categorical_features

['gender',
 'ethnicity',
 'education_level',
 'income_level',
 'smoking_status',
 'employment_status']

In [11]:
# Frequency of each education_level label; shows the full label set that the
# ordinal ordering defined further down has to cover.
Categorical_data['education_level'].value_counts()

education_level
Highschool      389036
Graduate        296305
Postgraduate     94614
No formal        20045
Name: count, dtype: int64

In [12]:
# Frequency of each income_level label; shows the full label set that the
# ordinal ordering defined further down has to cover.
Categorical_data['income_level'].value_counts()

income_level
Middle          325709
Lower-Middle    203720
Upper-Middle    147702
Low             100633
High             22236
Name: count, dtype: int64

In [13]:
# Categorical columns split by how they are encoded:
#   ordinal -> integer rank following the explicit orderings below
#   nominal -> one-hot indicator columns
ordinal_features = ['education_level', 'income_level']
nominal_features = ['gender', 'ethnicity', 'employment_status', 'smoking_status']

# Category orderings, lowest to highest. The list position becomes the encoded value.
education_level_order = [
    'No formal',
    'Highschool',
    'Graduate',
    'Postgraduate',
]
income_level_order = [
    'Low',
    'Lower-Middle',
    'Middle',
    'Upper-Middle',
    'High',
]

In [14]:
# One-hot encode the nominal columns into a dense array; a feature with exactly
# two categories keeps a single indicator column (drop='if_binary').
ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
ohe_features = ohe.fit_transform(new_data[nominal_features])

# Ordinal-encode education and income using the explicit category orderings
# (one ordering per column, in the same order as `ordinal_features`).
oe = OrdinalEncoder(categories=[education_level_order, income_level_order])
ordinal_codes = oe.fit_transform(new_data[ordinal_features])

# Work on a copy so `new_data` keeps the raw labels.
encoded_df = new_data.copy()
encoded_df[ordinal_features] = ordinal_codes

In [15]:
# Wrap the one-hot matrix in a DataFrame with the encoder's column names.
# The index is taken from `new_data` (the frame the encoder was fitted on) so the
# column-wise concat below aligns row-for-row even if the index is not 0..n-1.
ohe_column_names = list(ohe.get_feature_names_out())
ohe_features_df = pd.DataFrame(ohe_features, columns=ohe_column_names, index=new_data.index)

# Swap the raw nominal columns for their one-hot counterparts in a single step.
# Any one-hot columns left over from a previous run of this cell are dropped
# first (errors='ignore'), so re-running the cell yields the same frame instead
# of duplicating columns or raising KeyError.
encoded_df = pd.concat(
    [
        encoded_df.drop(columns=[*nominal_features, *ohe_column_names], errors='ignore'),
        ohe_features_df,
    ],
    axis=1,
)

In [16]:
encoded_df.head()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed,smoking_status_Current,smoking_status_Former,smoking_status_Never
0,59,2,72,7.4,7.7,5.1,22.6,0.81,118,82,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,53,1,82,4.2,7.4,5.3,24.2,0.84,122,82,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,49,3,66,7.0,7.7,7.4,20.3,0.79,108,72,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,46,2,97,7.1,7.5,9.5,25.2,0.86,127,76,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,49,2,34,6.0,8.1,6.0,22.2,0.79,127,71,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## Standardize Numerical Features

In [17]:
# Unfitted StandardScaler; it is fitted on the numeric feature columns in the
# following cell and then saved to disk with joblib.
scaler = StandardScaler()

In [18]:
one_hot_features = ohe_features_df.columns.tolist()

# Columns that bypass scaling: the ordinal codes, the one-hot indicators and the target.
unscaled_columns = ordinal_features + one_hot_features + ['diagnosed_diabetes']

features_not_std = encoded_df[unscaled_columns]
features_std = encoded_df.drop(columns=unscaled_columns)

# Fit the scaler on every remaining column and transform it in one pass.
Matrix_std = scaler.fit_transform(features_std)

# Rebuild a DataFrame from the scaled matrix, keeping the source index so the
# column-wise concat below aligns row-for-row regardless of the index values.
features_new = pd.DataFrame(Matrix_std, columns=features_std.columns, index=features_std.index)
data_std = pd.concat([features_new, features_not_std], axis=1)
data_std.head()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,ethnicity_Other,ethnicity_White,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed,smoking_status_Current,smoking_status_Former,smoking_status_Never,diagnosed_diabetes
0,0.709543,-0.057942,-0.225826,0.950676,0.752379,-0.437071,-1.094334,-1.235246,0.154083,0.938956,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.21852,-0.965856,-0.05299,-1.173118,0.429168,-0.341085,-0.554244,-0.47007,0.502759,0.938956,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.108829,0.849972,-0.329528,0.685201,0.752379,0.666765,-1.870714,-1.745363,-0.717608,-0.486917,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,-0.35434,-0.057942,0.206264,0.75157,0.536905,1.674614,-0.216688,0.040047,0.938604,0.083432,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,-0.108829,-0.057942,-0.882604,0.021516,1.183326,-0.005135,-1.229357,-1.745363,0.938604,-0.629504,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Save the fitted StandardScaler to disk (path is relative to the current working directory).
joblib.dump(scaler, 'scaler_diabetes_synthetic.pkl')

['scaler_diabetes_synthetic.pkl']

In [20]:
# Save the fitted OrdinalEncoder to disk (path is relative to the current working directory).
joblib.dump(oe, 'OrdinalEncoder_diabetes_synthetic.pkl')

['OrdinalEncoder_diabetes_synthetic.pkl']

In [21]:
# Save the fitted OneHotEncoder to disk (path is relative to the current working directory).
joblib.dump(ohe, 'OneHotEncoder_diabetes_synthetic.pkl')

['OneHotEncoder_diabetes_synthetic.pkl']

In [22]:
#data_std.to_csv('diabetes_synthetic_final.csv')