In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


In [2]:
df = pd.read_csv("../../Data/diabetes_dataset_processed.csv")
df.head(5)

Unnamed: 0,Insulin Levels,Age,BMI,Blood Pressure,Cholesterol Levels,Waist Circumference,Blood Glucose Levels,Weight Gain During Pregnancy,Pancreatic Health,Pulmonary Function,...,Physical Activity_Moderate,History of PCOS_No,History of PCOS_Yes,Previous Gestational Diabetes_No,Previous Gestational Diabetes_Yes,Steroid Use History_No,Steroid Use History_Yes,Family History_No,Family History_Yes,Target
0,40,44,38,124,201,50,168,18,36,76,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,Steroid-Induced Diabetes
1,13,1,17,73,121,24,178,8,26,60,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,Neonatal Diabetes Mellitus (NDM)
2,27,36,24,121,185,36,105,15,56,80,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,Prediabetic
3,8,7,16,100,151,29,121,12,49,89,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Type 1 Diabetes
4,17,10,17,103,146,33,289,2,10,41,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,Wolfram Syndrome


In [3]:
le = LabelEncoder()
df['Target'] = le.fit_transform(df['Target'])
X = df.drop(columns=['Target'])
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

param_grid = {
    'classifier__n_neighbors': [500, 1000, 2000],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [2],
    'classifier__algorithm': ['auto'],
    'classifier__leaf_size': [20, 30]
}
 
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scoring='precision_macro' để không bỏ sót người bệnh
# scoring='recall_macro' để không muốn dự đoán sai người khỏe mạnh là bị bệnh
# scoring='f1_macro' để cân bằng giữa precision và recall
best_model = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)

              precision    recall  f1-score   support

           0       0.93      0.44      0.60      1087
           1       0.64      0.31      0.42      1089
           2       0.83      0.47      0.60      1055
           3       0.53      0.77      0.63      1121
           4       0.97      0.99      0.98      1091
           5       0.46      0.91      0.61      1089
           6       0.67      0.36      0.47      1104
           7       0.66      0.21      0.32      1047
           8       0.69      0.84      0.76      1046
           9       0.80      0.50      0.61      1067
          10       0.41      0.91      0.56      1097
          11       0.64      0.96      0.76      1054
          12       0.98      0.38      0.55       920

    accuracy                           0.62     13867
   macro avg       0.71      0.62      0.61     13867
weighted avg       0.70      0.62      0.61     13867



In [18]:
import joblib

model_path = "../Built_model/knn_model.pkl"

joblib.dump(best_model, model_path)

['../Built_model/knn_model.pkl']

In [19]:
disp.plot(cmap='Blues', values_format='d')
plt.xticks(rotation=60, ha='right')
plt.savefig("../../Result&Deploy/Model_result/knn_heatmap.png", bbox_inches='tight')  
plt.close()

In [20]:
best_model.best_params_

{'classifier__algorithm': 'auto',
 'classifier__leaf_size': 20,
 'classifier__n_neighbors': 2000,
 'classifier__p': 2,
 'classifier__weights': 'distance'}