# Importing libraries & data ingestion

In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
# NOTE(review): this silences every warning for the rest of the session (no category
# or module filter is given); consider narrowing it so real problems stay visible.
warnings.filterwarnings("ignore")

In [46]:
# Read the raw training set from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/train.csv")
df.head(5)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


In [47]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 26 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   id                                  700000 non-null  int64  
 1   age                                 700000 non-null  int64  
 2   alcohol_consumption_per_week        700000 non-null  int64  
 3   physical_activity_minutes_per_week  700000 non-null  int64  
 4   diet_score                          700000 non-null  float64
 5   sleep_hours_per_day                 700000 non-null  float64
 6   screen_time_hours_per_day           700000 non-null  float64
 7   bmi                                 700000 non-null  float64
 8   waist_to_hip_ratio                  700000 non-null  float64
 9   systolic_bp                         700000 non-null  int64  
 10  diastolic_bp                        700000 non-null  int64  
 11  heart_rate                

### 1. Identifying numerical & categorical columns

In [48]:
# Partition the frame by dtype: object-typed columns on one side,
# every remaining column on the other.
categorical_cols = df.select_dtypes(include="object")
numerical_cols = df.select_dtypes(exclude="object")
categorical_cols

Unnamed: 0,gender,ethnicity,education_level,income_level,smoking_status,employment_status
0,Female,Hispanic,Highschool,Lower-Middle,Current,Employed
1,Female,White,Highschool,Upper-Middle,Never,Employed
2,Male,Hispanic,Highschool,Lower-Middle,Never,Retired
3,Female,White,Highschool,Lower-Middle,Current,Employed
4,Male,White,Highschool,Upper-Middle,Never,Retired
...,...,...,...,...,...,...
699995,Female,Hispanic,Postgraduate,Upper-Middle,Former,Employed
699996,Female,Hispanic,Graduate,Upper-Middle,Former,Employed
699997,Female,White,Graduate,Middle,Never,Employed
699998,Female,White,Highschool,Lower-Middle,Never,Retired


* Some categorical columns are nominal (their values have no inherent order), while others are ordinal (their values follow a natural order).

* So, let's separate the columns that will be ordinal encoded from those that will be one-hot encoded

In [49]:
# Categorical columns that will be ordinal encoded further down.
ordinal_encoding_cols = categorical_cols[
    ['education_level', 'income_level']
]
ordinal_encoding_cols

Unnamed: 0,education_level,income_level
0,Highschool,Lower-Middle
1,Highschool,Upper-Middle
2,Highschool,Lower-Middle
3,Highschool,Lower-Middle
4,Highschool,Upper-Middle
...,...,...
699995,Postgraduate,Upper-Middle
699996,Graduate,Upper-Middle
699997,Graduate,Middle
699998,Highschool,Lower-Middle


In [50]:
# Categorical columns that will be one-hot encoded further down.
one_hot_encoding_cols = categorical_cols[
    [
        'gender',
        'ethnicity',
        'smoking_status',
        'employment_status',
    ]
]
one_hot_encoding_cols

Unnamed: 0,gender,ethnicity,smoking_status,employment_status
0,Female,Hispanic,Current,Employed
1,Female,White,Never,Employed
2,Male,Hispanic,Never,Retired
3,Female,White,Current,Employed
4,Male,White,Never,Retired
...,...,...,...,...
699995,Female,Hispanic,Former,Employed
699996,Female,Hispanic,Former,Employed
699997,Female,White,Never,Employed
699998,Female,White,Never,Retired


#### 1.1 One-hot encoding categorical columns

In [51]:
from sklearn.preprocessing import OneHotEncoder

# drop="first" omits the first category of every feature;
# sparse_output=False asks for a dense NumPy array instead of a sparse matrix.
ohe = OneHotEncoder(
    drop="first",
    sparse_output=False,
)
one_encoded_array = ohe.fit_transform(one_hot_encoding_cols)
one_encoded_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

* Since the output is a 2-D array & not visually interpretable, let's fetch the column names & make a dataframe out of it

In [52]:
# Ask the fitted encoder for its generated column names, then wrap the array
# in a DataFrame that shares the original row index.
one_hot_encoded_col_names = ohe.get_feature_names_out(
    one_hot_encoding_cols.columns
)
one_hot_encoded_df = pd.DataFrame(
    data=one_encoded_array,
    index=df.index,
    columns=one_hot_encoded_col_names,
)
one_hot_encoded_df

Unnamed: 0,gender_Male,gender_Other,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,smoking_status_Former,smoking_status_Never,employment_status_Retired,employment_status_Student,employment_status_Unemployed
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
699995,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
699996,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
699997,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
699998,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


### 1.2 Ordinal encoding categorical columns

In [53]:
# Display the two columns selected earlier for ordinal encoding.
ordinal_encoding_cols

Unnamed: 0,education_level,income_level
0,Highschool,Lower-Middle
1,Highschool,Upper-Middle
2,Highschool,Lower-Middle
3,Highschool,Lower-Middle
4,Highschool,Upper-Middle
...,...,...
699995,Postgraduate,Upper-Middle
699996,Graduate,Upper-Middle
699997,Graduate,Middle
699998,Highschool,Lower-Middle


* The ordinal encoder needs the category values to be listed explicitly in order; otherwise it will encode them in alphabetical order.

* Let's get the unique values of these columns & explicitly define the order of encoding; from low to high i.e. 0 to max

In [54]:
# Distinct education levels, in order of first appearance.
pd.unique(ordinal_encoding_cols['education_level'])

array(['Highschool', 'Graduate', 'Postgraduate', 'No formal'],
      dtype=object)

In [55]:
# Distinct income levels, in order of first appearance.
pd.unique(ordinal_encoding_cols['income_level'])

array(['Lower-Middle', 'Upper-Middle', 'Low', 'Middle', 'High'],
      dtype=object)

In [56]:
# Explicit low-to-high ordering of each ordinal feature; the position in the
# list becomes the encoded value (0 = lowest).
education_level_order = [
    'No formal',
    'Highschool',
    'Graduate',
    'Postgraduate',
]
income_level_order = [
    'Low',
    'Lower-Middle',
    'Middle',
    'Upper-Middle',
    'High',
]

In [57]:
from sklearn.preprocessing import OrdinalEncoder

# The category lists are passed in the same order as the columns of
# ordinal_encoding_cols (education_level first, income_level second).
oe = OrdinalEncoder(
    categories=[
        education_level_order,
        income_level_order,
    ]
)
ordinal_encoded = oe.fit_transform(ordinal_encoding_cols)
ordinal_encoded

array([[1., 1.],
       [1., 3.],
       [1., 1.],
       ...,
       [2., 2.],
       [1., 1.],
       [2., 0.]])

* Since the output is a 2-D array & not visually interpretable, let's fetch the column names & make a dataframe out of it

In [58]:
# Recover the column names from the fitted encoder and rebuild a DataFrame
# on the original row index.
ordinal_column_names = oe.get_feature_names_out(
    ordinal_encoding_cols.columns
)
ordinal_encoded_df = pd.DataFrame(
    data=ordinal_encoded,
    index=df.index,
    columns=ordinal_column_names,
)
ordinal_encoded_df

Unnamed: 0,education_level,income_level
0,1.0,1.0
1,1.0,3.0
2,1.0,1.0
3,1.0,1.0
4,1.0,3.0
...,...,...
699995,3.0,3.0
699996,2.0,3.0
699997,2.0,2.0
699998,1.0,1.0


### 1.3 Merging the feature engineered categorical columns into one dataframe

In [59]:
# Place the one-hot and ordinal encoded frames side by side (column-wise).
feature_engineered_categorical_df = pd.concat(
    [one_hot_encoded_df, ordinal_encoded_df],
    axis="columns",
)
feature_engineered_categorical_df.head(5)

Unnamed: 0,gender_Male,gender_Other,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,smoking_status_Former,smoking_status_Never,employment_status_Retired,employment_status_Student,employment_status_Unemployed,education_level,income_level
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0


In [60]:
# (rows, columns) of the combined encoded categorical frame.
feature_engineered_categorical_df.shape

(700000, 13)

In [64]:
# Display the non-object columns split off from df earlier.
numerical_cols

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,62,199,58,114,102,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,71,199,50,121,124,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,73,188,59,114,108,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,74,182,54,85,123,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.90,108,60,85,206,49,131,124,0,1,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699995,699995,29,1,59,6.9,5.2,1.5,26.1,0.88,133,57,69,163,58,90,126,0,0,0,0.0
699996,699996,46,2,72,7.7,7.7,3.8,25.5,0.85,106,85,65,188,45,107,119,0,0,1,1.0
699997,699997,35,1,50,5.6,6.1,6.4,26.9,0.88,127,84,63,168,59,77,166,0,0,0,1.0
699998,699998,49,2,70,5.7,6.9,4.7,25.2,0.86,116,67,69,198,55,108,133,0,0,0,1.0


In [65]:
# Print the column names, non-null counts and dtypes of the numerical frame.
numerical_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 20 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   id                                  700000 non-null  int64  
 1   age                                 700000 non-null  int64  
 2   alcohol_consumption_per_week        700000 non-null  int64  
 3   physical_activity_minutes_per_week  700000 non-null  int64  
 4   diet_score                          700000 non-null  float64
 5   sleep_hours_per_day                 700000 non-null  float64
 6   screen_time_hours_per_day           700000 non-null  float64
 7   bmi                                 700000 non-null  float64
 8   waist_to_hip_ratio                  700000 non-null  float64
 9   systolic_bp                         700000 non-null  int64  
 10  diastolic_bp                        700000 non-null  int64  
 11  heart_rate                

In [66]:
# Remove the `id` column from the numerical frame before it is merged into the
# final feature set.
# errors='ignore' keeps this cell idempotent: it reassigns `numerical_cols`, so a
# second run (with `id` already gone) would otherwise raise a KeyError.
numerical_cols = numerical_cols.drop(columns='id', errors='ignore')
numerical_cols

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,62,199,58,114,102,0,0,0,1.0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,71,199,50,121,124,0,0,0,1.0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,73,188,59,114,108,0,0,0,0.0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,74,182,54,85,123,0,1,0,1.0
4,54,1,55,5.7,6.2,5.1,28.8,0.90,108,60,85,206,49,131,124,0,1,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699995,29,1,59,6.9,5.2,1.5,26.1,0.88,133,57,69,163,58,90,126,0,0,0,0.0
699996,46,2,72,7.7,7.7,3.8,25.5,0.85,106,85,65,188,45,107,119,0,0,1,1.0
699997,35,1,50,5.6,6.1,6.4,26.9,0.88,127,84,63,168,59,77,166,0,0,0,1.0
699998,49,2,70,5.7,6.9,4.7,25.2,0.86,116,67,69,198,55,108,133,0,0,0,1.0


#### 1.4 Merging numerical & feature engineered categorical columns into one finalized dataframe

In [68]:
# Join the numerical columns with the encoded categorical columns, column-wise.
final_df = pd.concat(
    [numerical_cols, feature_engineered_categorical_df],
    axis="columns",
)
final_df.head(5)

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,smoking_status_Former,smoking_status_Never,employment_status_Retired,employment_status_Student,employment_status_Unemployed,education_level,income_level
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0


In [69]:
# (rows, columns) of the merged frame.
final_df.shape

(700000, 32)

# 2. Target attribute imbalance check

In [70]:
# Absolute number of rows per target class.
final_df.loc[:, 'diagnosed_diabetes'].value_counts()

diagnosed_diabetes
1.0    436307
0.0    263693
Name: count, dtype: int64

In [71]:
# Relative frequency of each target class (fractions that sum to 1).
target_props = (
    final_df['diagnosed_diabetes']
    .value_counts(normalize=True)
)
print(target_props)


diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64


* (62% / 38%) is a mild imbalance — many models will handle this naturally, especially if:

* I use a stratified train-test split (to preserve the class distribution)


# 3. Stratified Train-Test split for Model Training

In [73]:
# List every column of the merged frame (features plus the target).
final_df.columns

Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes', 'gender_Male',
       'gender_Other', 'ethnicity_Black', 'ethnicity_Hispanic',
       'ethnicity_Other', 'ethnicity_White', 'smoking_status_Former',
       'smoking_status_Never', 'employment_status_Retired',
       'employment_status_Student', 'employment_status_Unemployed',
       'education_level', 'income_level'],
      dtype='object')

In [79]:
# Features: every column except the target.
X = final_df.drop(columns='diagnosed_diabetes')
# Target: the diagnosis label.
y = final_df['diagnosed_diabetes']

In [77]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# of the target the same in both parts, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


* Checking the split proportions

In [80]:
# Print the class proportions of each split, heading first and values below it.
print(
    "Train target attribute distribution:",
    y_train.value_counts(normalize=True),
    sep="\n",
)
print(
    "\nTest target attribute distribution:",
    y_test.value_counts(normalize=True),
    sep="\n",
)


Train target attribute distribution:
diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64

Test target attribute distribution:
diagnosed_diabetes
1.0    0.623293
0.0    0.376707
Name: proportion, dtype: float64


### 3.1 Putting the train/test sets back together & saving as CSV for further workflows

In [81]:
# Re-attach the target as the last column of each split; assign() returns a new
# frame, so X_train / X_test are left untouched.
train_df = X_train.assign(diagnosed_diabetes=y_train)
test_df = X_test.assign(diagnosed_diabetes=y_test)

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; exist_ok=True makes re-running the cell harmless.
os.makedirs('data_for_model_training', exist_ok=True)

# Write both splits without the index column.
train_df.to_csv('data_for_model_training/train_data.csv', index=False)
test_df.to_csv('data_for_model_training/test_data.csv', index=False)

# **Feature engineering complete**
# Next Steps: Moving to Model training & evaluation in a separate notebook