In [1]:
# Import libraries
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the preprocessed CSV (the one carrying the diabetes_status column) into a DataFrame
dataset_path = '../data/preprocessed/with_diabetes_status/dataset_with_diabetes_status.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Data inspection: render the leading ten records, then report the frame's size
print("First ten rows of the dataset:")
display(df.iloc[:10])

# shape is a (rows, columns) tuple
print("\nDataset dimensions:")
print(f"Number of rows: {df.shape[0]}, Number of columns: {df.shape[1]}")

First ten rows of the dataset:


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,diabetes_status
0,Female,80.0,0,1,never,25.19,6.6,140,0,stress induced type 2 diabetic
1,Female,54.0,0,0,No Info,27.32,6.6,80,0,stress induced type 2 diabetic
2,Male,28.0,0,0,never,27.32,5.7,158,0,stress induced prediabetic
3,Female,36.0,0,0,current,23.45,5.0,155,0,non diabetic
4,Male,76.0,1,1,current,20.14,4.8,155,0,non diabetic
5,Female,20.0,0,0,never,27.32,6.6,85,0,stress induced type 2 diabetic
6,Female,44.0,0,0,never,19.31,6.5,200,1,diabetic
7,Female,79.0,0,0,No Info,23.86,5.7,85,0,stress induced prediabetic
8,Male,42.0,0,0,never,33.64,4.8,145,0,non diabetic
9,Female,32.0,0,0,never,27.32,5.0,100,0,non diabetic



Dataset dimensions:
Number of rows: 100000, Number of columns: 10


In [4]:
# Data inspection: per-column non-null counts, dtypes and memory usage
print("\nDataset information:")
# info() writes its report straight to stdout, so it is called bare rather than wrapped in print()/display()
df.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
 9   diabetes_status      100000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 7.6+ MB


In [5]:
# Data inspection: descriptive statistics (count, mean, std, quartiles) of the numeric columns
print("\nStatistical summary:")
summary_stats = df.describe()
display(summary_stats)


Statistical summary:


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [6]:
# Data inspection: for every column, list its distinct values and how often each occurs
for column, values in df.items():
    print(f"Unique values in '{column}':")
    print(values.unique())
    print(f"\nUnique value counts in '{column}':")
    print(values.value_counts())
    # blank separator between columns
    print("\n")

Unique values in 'gender':
['Female' 'Male' 'Other']

Unique value counts in 'gender':
gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64


Unique values in 'age':
[80.   54.   28.   36.   76.   20.   44.   79.   42.   32.   53.   78.
 67.   15.   37.   40.    5.   69.   72.    4.   30.   45.   43.   50.
 41.   26.   34.   73.   77.   66.   29.   60.   38.    3.   57.   74.
 19.   46.   21.   59.   27.   13.   56.    2.    7.   11.    6.   55.
  9.   62.   47.   12.   68.   75.   22.   58.   18.   24.   17.   25.
  0.08 33.   16.   61.   31.    8.   49.   39.   65.   14.   70.    0.56
 48.   51.   71.    0.88 64.   63.   52.    0.16 10.   35.   23.    0.64
  1.16  1.64  0.72  1.88  1.32  0.8   1.24  1.    1.8   0.48  1.56  1.08
  0.24  1.4   0.4   0.32  1.72  1.48]

Unique value counts in 'age':
age
80.00    5621
51.00    1619
47.00    1574
48.00    1568
53.00    1542
         ... 
0.48       83
1.00       83
0.40       66
0.16       59
0.08       36
Name:

In [7]:
# Remove the binary 'diabetes' flag and the 'HbA1c_level' measurement from the feature set
# NOTE(review): presumably dropped because diabetes_status was derived from them — confirm
df = df.drop(columns=['diabetes', 'HbA1c_level'])

In [8]:
# Keep only the rows whose gender is not 'Other' (original row labels are retained, not reset)
df = df.loc[df['gender'].ne('Other')]

In [9]:
# Expand each categorical feature into one indicator column per category.
# Every level is kept (no reference level is dropped), so each group of indicators sums to 1 per row.
categorical_features = ['gender', 'smoking_history']
df = pd.get_dummies(df, columns=categorical_features)

In [10]:
# Ordinal encoding of the target: each label's code is its position in status_order
status_order = ['non diabetic', 'stress induced prediabetic', 'stress induced type 2 diabetic', 'prediabetic', 'diabetic']

# Label -> integer code lookup (0 .. len(status_order) - 1)
status_mapping = {status: i for i, status in enumerate(status_order)}

# Series.map turns every value that is missing from the mapping into NaN without
# complaining. Fail loudly instead, so a new/misspelled label -- or an accidental
# re-run of this cell on already-encoded values -- cannot silently corrupt the target.
unmapped_statuses = set(df['diabetes_status'].unique()) - set(status_mapping)
if unmapped_statuses:
    raise ValueError(
        "Unmapped diabetes_status values: "
        + ", ".join(sorted(repr(value) for value in unmapped_statuses))
    )

# Map 'diabetes_status' to the numeric encoding
df['diabetes_status'] = df['diabetes_status'].map(status_mapping)

In [11]:
# Reorder the columns so the encoded target 'diabetes_status' becomes the last one
target = 'diabetes_status'
cols = list(df.columns)
position = cols.index(target)  # raises ValueError if the target column is absent
cols = cols[:position] + cols[position + 1:] + [target]
df = df[cols]

In [12]:
# Data re-inspection: render the leading ten records of the encoded frame, then report its size
print("First ten rows of the dataset:")
display(df.iloc[:10])

# shape is a (rows, columns) tuple
print("\nDataset dimensions:")
print(f"Number of rows: {df.shape[0]}, Number of columns: {df.shape[1]}")

First ten rows of the dataset:


Unnamed: 0,age,hypertension,heart_disease,bmi,blood_glucose_level,gender_Female,gender_Male,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,diabetes_status
0,80.0,0,1,25.19,140,True,False,False,False,False,False,True,False,2
1,54.0,0,0,27.32,80,True,False,True,False,False,False,False,False,2
2,28.0,0,0,27.32,158,False,True,False,False,False,False,True,False,1
3,36.0,0,0,23.45,155,True,False,False,True,False,False,False,False,0
4,76.0,1,1,20.14,155,False,True,False,True,False,False,False,False,0
5,20.0,0,0,27.32,85,True,False,False,False,False,False,True,False,2
6,44.0,0,0,19.31,200,True,False,False,False,False,False,True,False,4
7,79.0,0,0,23.86,85,True,False,True,False,False,False,False,False,1
8,42.0,0,0,33.64,145,False,True,False,False,False,False,True,False,0
9,32.0,0,0,27.32,100,True,False,False,False,False,False,True,False,0



Dataset dimensions:
Number of rows: 99982, Number of columns: 14


In [13]:
# Data re-inspection: per-column non-null counts and dtypes after the encoding steps
print("\nDataset information:")
# info() writes its report straight to stdout, so it is called bare rather than wrapped in print()/display()
df.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
Index: 99982 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          99982 non-null  float64
 1   hypertension                 99982 non-null  int64  
 2   heart_disease                99982 non-null  int64  
 3   bmi                          99982 non-null  float64
 4   blood_glucose_level          99982 non-null  int64  
 5   gender_Female                99982 non-null  bool   
 6   gender_Male                  99982 non-null  bool   
 7   smoking_history_No Info      99982 non-null  bool   
 8   smoking_history_current      99982 non-null  bool   
 9   smoking_history_ever         99982 non-null  bool   
 10  smoking_history_former       99982 non-null  bool   
 11  smoking_history_never        99982 non-null  bool   
 12  smoking_history_not current  99982 non-null  bool   
 13 

In [14]:
# Data re-inspection: descriptive statistics of the numeric columns after encoding
print("\nStatistical summary:")
summary_stats = df.describe()
display(summary_stats)


Statistical summary:


Unnamed: 0,age,hypertension,heart_disease,bmi,blood_glucose_level,diabetes_status
count,99982.0,99982.0,99982.0,99982.0,99982.0,99982.0
mean,41.888076,0.074863,0.039427,27.320757,138.05781,0.99944
std,22.517206,0.263172,0.19461,6.636853,40.709469,1.065033
min,0.08,0.0,0.0,10.01,80.0,0.0
25%,24.0,0.0,0.0,23.63,100.0,0.0
50%,43.0,0.0,0.0,27.32,140.0,1.0
75%,60.0,0.0,0.0,29.58,159.0,1.0
max,80.0,1.0,1.0,95.69,300.0,4.0


In [15]:
# Data re-inspection: for every column of the encoded frame, list its distinct values and their frequencies
for column, values in df.items():
    print(f"Unique values in '{column}':")
    print(values.unique())
    print(f"\nUnique value counts in '{column}':")
    print(values.value_counts())
    # blank separator between columns
    print("\n")

Unique values in 'age':
[80.   54.   28.   36.   76.   20.   44.   79.   42.   32.   53.   78.
 67.   15.   37.   40.    5.   69.   72.    4.   30.   45.   43.   50.
 41.   26.   34.   73.   77.   66.   29.   60.   38.    3.   57.   74.
 19.   46.   21.   59.   27.   13.   56.    2.    7.   11.    6.   55.
  9.   62.   47.   12.   68.   75.   22.   58.   18.   24.   17.   25.
  0.08 33.   16.   61.   31.    8.   49.   39.   65.   14.   70.    0.56
 48.   51.   71.    0.88 64.   63.   52.    0.16 10.   35.   23.    0.64
  1.16  1.64  0.72  1.88  1.32  0.8   1.24  1.    1.8   0.48  1.56  1.08
  0.24  1.4   0.4   0.32  1.72  1.48]

Unique value counts in 'age':
age
80.00    5621
51.00    1619
47.00    1572
48.00    1568
49.00    1541
         ... 
0.48       83
1.00       83
0.40       66
0.16       59
0.08       36
Name: count, Length: 102, dtype: int64


Unique values in 'hypertension':
[0 1]

Unique value counts in 'hypertension':
hypertension
0    92497
1     7485
Name: count, dtype: 

In [16]:
# Save the modified dataset.
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise this cell fails on a fresh checkout without 'encoded/'.
output_path = Path('../data/preprocessed/encoded/encoded_dataset_with_label_target.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row labels out of the file
df.to_csv(output_path, index=False)