In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc
)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
df = pd.read_csv('../data/kidney_disease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [5]:
df['classification'].value_counts()
df['classification'].value_counts(normalize=True)

classification
notckd    0.808975
ckd       0.191025
Name: proportion, dtype: float64

In [6]:
df = df.drop('id', axis=1)

In [7]:
df['classification'] = df['classification'].str.strip()
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']

In [8]:
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().str.replace('\t', '')

In [9]:
df = df.replace(['?', '\t?', ' ?'], np.nan)

In [10]:
df['classification'] = df['classification'].astype(str).str.lower()
df['classification'] = df['classification'].str.strip()
df['classification'] = df['classification'].str.replace('\t', '', regex=False)

print(df['classification'].unique())

['ckd' 'nan' 'notckd']


In [13]:
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                  if col != 'classification']

In [None]:
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [None]:
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)

In [None]:
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

In [None]:
X = df.drop('classification', axis=1)
y = df['classification']

In [None]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.15,
    random_state=42,
    stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.1765,       
    random_state=42,
    stratify=y_temp
)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

In [None]:
def create_ann_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,), kernel_initializer='he_normal', name='hidden_layer_1'),
        Dense(32, activation='relu', kernel_initializer='he_normal', name='hidden_layer_2'),
        Dense(1, activation='sigmoid', name='output_layer')
    ])
    return model

# Tạo mô hình
model = create_ann_model(X_train_scaled.shape[1])

# Xem kiến trúc model
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall')]
)

In [14]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)
model_checkpoint = ModelCheckpoint(
    '../model/best_kidney_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max'
)
callbacks = [early_stopping, model_checkpoint]

In [None]:
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks
)

Epoch 1/50
[1m54/75[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.7090 - loss: 0.6159 - precision: 0.8179 - recall: 0.8358



[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7831 - loss: 0.5364 - precision: 0.8148 - recall: 0.9528 - val_accuracy: 0.8160 - val_loss: 0.4552 - val_precision: 0.8233 - val_recall: 0.9903
Epoch 2/50
[1m54/75[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.8229 - loss: 0.4393 - precision: 0.8337 - recall: 0.9889



[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8205 - loss: 0.4485 - precision: 0.8290 - recall: 0.9865 - val_accuracy: 0.8317 - val_loss: 0.4252 - val_precision: 0.8395 - val_recall: 0.9855
Epoch 3/50
[1m62/75[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.8387 - loss: 0.3981 - precision: 0.8495 - recall: 0.9851



[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8356 - loss: 0.4175 - precision: 0.8425 - recall: 0.9860 - val_accuracy: 0.8356 - val_loss: 0.4119 - val_precision: 0.8444 - val_recall: 0.9831
Epoch 4/50
[1m56/75[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.8498 - loss: 0.3626 - precision: 0.8578 - recall: 0.9884



[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8424 - loss: 0.3938 - precision: 0.8476 - recall: 0.9875 - val_accuracy: 0.8395 - val_loss: 0.4017 - val_precision: 0.8479 - val_recall: 0.9831
Epoch 5/50
[1m52/75[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.8571 - loss: 0.3342 - precision: 0.8616 - recall: 0.9931



[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8457 - loss: 0.3737 - precision: 0.8500 - recall: 0.9886 - val_accuracy: 0.8474 - val_loss: 0.3928 - val_precision: 0.8536 - val_recall: 0.9855
Epoch 6/50
[1m67/75[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step - accuracy: 0.8573 - loss: 0.3203 - precision: 0.8615 - recall: 0.9925



[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8504 - loss: 0.3559 - precision: 0.8532 - recall: 0.9901 - val_accuracy: 0.8513 - val_loss: 0.3865 - val_precision: 0.8556 - val_recall: 0.9879
Epoch 7/50
[1m74/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8621 - loss: 0.3030 - precision: 0.8657 - recall: 0.9923



[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8550 - loss: 0.3393 - precision: 0.8571 - recall: 0.9907 - val_accuracy: 0.8532 - val_loss: 0.3812 - val_precision: 0.8559 - val_recall: 0.9903
Epoch 8/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8579 - loss: 0.3236 - precision: 0.8598 - recall: 0.9907 - val_accuracy: 0.8493 - val_loss: 0.3758 - val_precision: 0.8553 - val_recall: 0.9855
Epoch 9/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8609 - loss: 0.3083 - precision: 0.8622 - recall: 0.9912 - val_accuracy: 0.8474 - val_loss: 0.3708 - val_precision: 0.8565 - val_recall: 0.9807
Epoch 10/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8655 - loss: 0.2926 - precision: 0.8669 - recall: 0.9907 - val_accuracy: 0.8513 - val_loss: 0.3650 - val_precision: 0.8617 - val_recall: 0.9783
Epoch 11/50
[1m75/75[0m [32m━━━━━━━━━━━

In [None]:
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [None]:
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(
    X_test_scaled,
    y_test,
    verbose=0
)
test_f1 = f1_score(y_test, y_pred)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [None]:
print("Unique labels in y:", sorted(y.unique()))
print("Unique labels in y_test:", sorted(y_test.unique()))

Unique labels in y: [np.int64(0), np.int64(1), np.int64(2)]
Unique labels in y_test: [np.int64(0), np.int64(1), np.int64(2)]
