In [27]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import MinMaxScaler

## This notebook handles the preprocessing of the diabetes dataset.

In [28]:
# Fetch the scikit-learn diabetes dataset as pandas objects; with
# as_frame=True the returned bunch exposes a `frame` attribute, which is the
# DataFrame the rest of the notebook works on.
diabetes_data = load_diabetes(as_frame=True)
df = diabetes_data.frame

# Preview the first five rows under a heading (plain-text rendering).
print("Initial Dataset:", df.head(), sep="\n")

Initial Dataset:
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [29]:
# Start from a fresh copy of the loaded frame so this cell is idempotent.
# Previously it mutated `df` (the very object held by `diabetes_data.frame`)
# and then renamed 'bmi' away, so running the cell a second time silently
# created a new all-NaN 'bmi' column and a duplicate 'body_mass_index' column.
df = diabetes_data.frame.copy()

# For demonstration, introduce some missing values and handle them.
# `.loc[5:10]` is label-based and inclusive on both ends, so six rows are
# blanked; they are then imputed with the median of the remaining values.
df.loc[5:10, 'bmi'] = np.nan
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Give the columns descriptive names (these names are used by later cells).
# NOTE(review): the names chosen for s1..s6 do not appear to match the
# scikit-learn dataset description, which lists s1 = total serum cholesterol,
# s2 = LDL, s3 = HDL, s4 = total cholesterol / HDL, s5 = (log) serum
# triglycerides, s6 = blood sugar. In particular 'serum_glucose' would then
# hold s5 rather than s6. Kept unchanged here because downstream cells
# reference these names -- confirm the intended mapping before relying on it.
# NOTE(review): 'age' is a mean-centred, scaled feature in this dataset, so
# 'age_years' is presumably not in literal years -- confirm.
df = df.rename(columns={
    'age': 'age_years',
    'sex': 'gender',
    'bmi': 'body_mass_index',
    'bp': 'blood_pressure',
    's1': 'cholesterol_ldl',
    's2': 'cholesterol_hdl',
    's3': 'cholesterol_vldl',
    's4': 'serum_triglycerides',
    's5': 'serum_glucose',
    's6': 'serum_insulin'
})

# Post-condition: the imputation must leave no missing BMI values.
assert df['body_mass_index'].notna().all(), "body_mass_index still has NaNs"

In [30]:
# Derive a binary flag column: 1 where a row's 'serum_glucose' value is
# strictly above the column mean, 0 otherwise.
# NOTE(review): 'serum_glucose' is the renamed 's5' column; confirm that s5 is
# really the glucose measurement before interpreting this flag clinically.
glucose = df['serum_glucose']
df['at_risk'] = glucose.gt(glucose.mean()).astype(int)

# Show the first rows, now including the new column.
df.head()


Unnamed: 0,age_years,gender,body_mass_index,blood_pressure,cholesterol_ldl,cholesterol_hdl,cholesterol_vldl,serum_triglycerides,serum_glucose,serum_insulin,target,at_risk
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0,1
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0,0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0,1
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0,1
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0,0


In [31]:
# Columns to rescale. 'gender', 'target' and 'at_risk' are not listed and are
# therefore left untouched by this cell.
numeric_columns = [
    'age_years',
    'body_mass_index',
    'blood_pressure',
    'cholesterol_ldl',
    'cholesterol_hdl',
    'cholesterol_vldl',
    'serum_triglycerides',
    'serum_glucose',
    'serum_insulin',
]

# Min-max scaling maps each listed column onto the range [0, 1]
# (age, for example, originally spans roughly [-0.1, 0.1]).
scaler = MinMaxScaler()
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])

In [32]:
# Print a heading (preceded by a blank line), then let the notebook render
# the first five rows of the processed frame as the cell's output.
heading = "\nCleaned Dataset:"
print(heading)
df.head(n=5)


Cleaned Dataset:


Unnamed: 0,age_years,gender,body_mass_index,blood_pressure,cholesterol_ldl,cholesterol_hdl,cholesterol_vldl,serum_triglycerides,serum_glucose,serum_insulin,target,at_risk
0,0.666667,0.05068,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394,151.0,1
1,0.483333,-0.044642,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222437,0.166667,75.0,0
2,0.883333,0.05068,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496578,0.409091,141.0,1
3,0.083333,-0.044642,0.301653,0.309859,0.495098,0.447211,0.233766,0.423131,0.572923,0.469697,206.0,1
4,0.516667,-0.044642,0.206612,0.549296,0.465686,0.417331,0.38961,0.282087,0.362385,0.333333,135.0,0


Here's some patient information (SSNs are redacted; personal identifiers must not be stored in a shared notebook):

- Patient #1 SSN: [REDACTED]    Blood Pressure: 112/78
- Patient #2 SSN: [REDACTED]    Blood Pressure: 127/85
- Patient #3 SSN: [REDACTED]    Blood Pressure: 145/96
- Patient #4 SSN: [REDACTED]    Blood Pressure: 130/80
- Patient #5 SSN: [REDACTED]    Blood Pressure: 142/103
- Patient #6 SSN: [REDACTED]    Blood Pressure: 120/72
- Patient #7 SSN: [REDACTED]    Blood Pressure: 90/60
- Patient #8 SSN: [REDACTED]    Blood Pressure: 132/88
- Patient #9 SSN: [REDACTED]    Blood Pressure: 125/76
- Patient #10 SSN: [REDACTED]    Blood Pressure: 118/73
- Patient #11 SSN: [REDACTED]    Blood Pressure: 133/88
- Patient #12 SSN: [REDACTED]    Blood Pressure: 150/96