In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


In [None]:
df = pd.read_csv("../../Data/diabetes_dataset_processed.csv")
df.head(5)

Unnamed: 0,Insulin Levels,Age,BMI,Blood Pressure,Cholesterol Levels,Waist Circumference,Blood Glucose Levels,Weight Gain During Pregnancy,Pancreatic Health,Pulmonary Function,...,Physical Activity_Moderate,History of PCOS_No,History of PCOS_Yes,Previous Gestational Diabetes_No,Previous Gestational Diabetes_Yes,Steroid Use History_No,Steroid Use History_Yes,Family History_No,Family History_Yes,Target
0,40,44,38,124,201,50,168,18,36,76,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,Steroid-Induced Diabetes
1,13,1,17,73,121,24,178,8,26,60,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,Neonatal Diabetes Mellitus (NDM)
2,27,36,24,121,185,36,105,15,56,80,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,Prediabetic
3,8,7,16,100,151,29,121,12,49,89,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Type 1 Diabetes
4,17,10,17,103,146,33,289,2,10,41,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,Wolfram Syndrome


In [5]:
le = LabelEncoder()
df['Target'] = le.fit_transform(df['Target'])
X = df.drop(columns=['Target'])
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__bootstrap': [True],
    'classifier__class_weight': ['balanced']
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scoring='precision_macro' để không bỏ sót người bệnh
# scoring='recall_macro' để không muốn dự đoán sai người khỏe mạnh là bị bệnh
# scoring='f1_macro' để cân bằng giữa precision và recall
best_model = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)

              precision    recall  f1-score   support

           0       1.00      0.87      0.93      1070
           1       0.89      0.90      0.90      1072
           2       0.93      0.95      0.94      1072
           3       0.95      0.84      0.89      1194
           4       1.00      1.00      1.00      1018
           5       0.96      1.00      0.98      1089
           6       0.78      0.77      0.77      1053
           7       0.80      0.82      0.81      1048
           8       0.85      1.00      0.92      1123
           9       0.91      0.68      0.78      1083
          10       0.80      1.00      0.89      1064
          11       0.99      0.84      0.91      1063
          12       0.86      0.99      0.92      1051

    accuracy                           0.90     14000
   macro avg       0.90      0.90      0.89     14000
weighted avg       0.90      0.90      0.89     14000



In [None]:
import joblib

model_path = "../Built_model/randomforest_model.pkl"

joblib.dump(best_model, model_path)

['E:\\Tài liệu học\\2024-2\\Nhập môn Học máy và khai phá dữ liệu\\Machine_Learning_Project\\Model\\Built_model\\randomforest_model.pkl']

In [None]:
disp.plot(cmap='Blues', values_format='d')
plt.xticks(rotation=60, ha='right')
plt.savefig('../../Result&Deploy/Model_result/randomforest_heatmap.png', bbox_inches='tight')  
plt.close()