In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import os

In [13]:
file_path = "../../Data/diabetes_dataset.csv"

In [14]:
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,Target,Genetic Markers,Autoantibodies,Family History,Environmental Factors,Insulin Levels,Age,BMI,Physical Activity,Dietary Habits,...,Pulmonary Function,Cystic Fibrosis Diagnosis,Steroid Use History,Genetic Testing,Neurological Assessments,Liver Function Tests,Digestive Enzyme Levels,Urine Test,Birth Weight,Early Onset Symptoms
0,Steroid-Induced Diabetes,Positive,Negative,No,Present,40,44,38,High,Healthy,...,76,No,No,Positive,3,Normal,56,Ketones Present,2629,No
1,Neonatal Diabetes Mellitus (NDM),Positive,Negative,No,Present,13,1,17,High,Healthy,...,60,Yes,No,Negative,1,Normal,28,Glucose Present,1881,Yes
2,Prediabetic,Positive,Positive,Yes,Present,27,36,24,High,Unhealthy,...,80,Yes,No,Negative,1,Abnormal,55,Ketones Present,3622,Yes
3,Type 1 Diabetes,Negative,Positive,No,Present,8,7,16,Low,Unhealthy,...,89,Yes,No,Positive,2,Abnormal,60,Ketones Present,3542,No
4,Wolfram Syndrome,Negative,Negative,Yes,Present,17,10,17,High,Healthy,...,41,No,No,Positive,1,Normal,24,Protein Present,1770,No
5,LADA,Positive,Negative,Yes,Present,17,41,26,Moderate,Healthy,...,85,Yes,No,Negative,2,Normal,52,Ketones Present,3835,Yes
6,Type 2 Diabetes,Negative,Negative,No,Absent,29,30,31,Moderate,Healthy,...,64,Yes,Yes,Negative,3,Abnormal,96,Ketones Present,4426,No
7,Wolcott-Rallison Syndrome,Positive,Negative,No,Absent,10,3,18,Low,Unhealthy,...,44,Yes,No,Negative,1,Normal,29,Ketones Present,1644,Yes
8,Secondary Diabetes,Negative,Positive,No,Absent,47,47,25,High,Healthy,...,71,No,Yes,Positive,3,Normal,74,Ketones Present,3721,No
9,Secondary Diabetes,Positive,Negative,Yes,Present,21,72,24,Low,Unhealthy,...,69,Yes,Yes,Positive,2,Abnormal,42,Protein Present,4206,No


In [4]:
list_columns = df.columns.tolist()
print(list_columns)

['Target', 'Genetic Markers', 'Autoantibodies', 'Family History', 'Environmental Factors', 'Insulin Levels', 'Age', 'BMI', 'Physical Activity', 'Dietary Habits', 'Blood Pressure', 'Cholesterol Levels', 'Waist Circumference', 'Blood Glucose Levels', 'Ethnicity', 'Socioeconomic Factors', 'Smoking Status', 'Alcohol Consumption', 'Glucose Tolerance Test', 'History of PCOS', 'Previous Gestational Diabetes', 'Pregnancy History', 'Weight Gain During Pregnancy', 'Pancreatic Health', 'Pulmonary Function', 'Cystic Fibrosis Diagnosis', 'Steroid Use History', 'Genetic Testing', 'Neurological Assessments', 'Liver Function Tests', 'Digestive Enzyme Levels', 'Urine Test', 'Birth Weight', 'Early Onset Symptoms']


In [5]:
# Tìm giá trị số trong các cột và chuyển đổi chúng thành kiểu số
numerical_columns = []

for col in df.columns:
    coerced = pd.to_numeric(df[col], errors='coerce')
    valid_ratio = coerced.notna().mean()
    if valid_ratio >= 1.0:
        numerical_columns.append(col)

print("Các cột số tìm được:", numerical_columns)

Các cột số tìm được: ['Insulin Levels', 'Age', 'BMI', 'Blood Pressure', 'Cholesterol Levels', 'Waist Circumference', 'Blood Glucose Levels', 'Weight Gain During Pregnancy', 'Pancreatic Health', 'Pulmonary Function', 'Neurological Assessments', 'Digestive Enzyme Levels', 'Birth Weight']


In [6]:
category_columns = list(set(list_columns) - set(numerical_columns) - set(['Target']))
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df[category_columns])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(category_columns))

df_processed = pd.concat([df[numerical_columns], encoded_df], axis=1)
df_processed.astype(float)
df_processed['Target'] = df['Target']
df_processed.head(10)

Unnamed: 0,Insulin Levels,Age,BMI,Blood Pressure,Cholesterol Levels,Waist Circumference,Blood Glucose Levels,Weight Gain During Pregnancy,Pancreatic Health,Pulmonary Function,...,Smoking Status_Smoker,History of PCOS_No,History of PCOS_Yes,Cystic Fibrosis Diagnosis_No,Cystic Fibrosis Diagnosis_Yes,Autoantibodies_Negative,Autoantibodies_Positive,Previous Gestational Diabetes_No,Previous Gestational Diabetes_Yes,Target
0,40,44,38,124,201,50,168,18,36,76,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,Steroid-Induced Diabetes
1,13,1,17,73,121,24,178,8,26,60,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,Neonatal Diabetes Mellitus (NDM)
2,27,36,24,121,185,36,105,15,56,80,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,Prediabetic
3,8,7,16,100,151,29,121,12,49,89,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Type 1 Diabetes
4,17,10,17,103,146,33,289,2,10,41,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,Wolfram Syndrome
5,17,41,26,127,208,32,142,11,40,85,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,LADA
6,29,30,31,115,237,43,186,15,62,64,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,Type 2 Diabetes
7,10,3,18,80,157,29,206,4,13,44,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,Wolcott-Rallison Syndrome
8,47,47,25,138,185,40,160,30,91,71,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,Secondary Diabetes
9,21,72,24,136,259,36,192,33,86,69,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Secondary Diabetes


In [None]:
df_processed.to_csv("../../Data/diabetes_dataset_processed.csv", index=False)