In [1]:
import os
import numpy as np
import pandas as pd
import random
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import (
    classification_report, precision_score, recall_score, f1_score, roc_auc_score
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTENC

In [2]:
seed_value = 50
np.random.seed(seed_value)
random.seed(seed_value)
tf.random.set_seed(seed_value)


In [3]:
data = pd.read_csv('diabetes_prediction_dataset.csv')
data.dropna(inplace=True)
data['gender'] = data['gender'].map({'Male': 1, 'Female': 0})
data = pd.get_dummies(data, columns=['smoking_history'], drop_first=True)
data.fillna(0, inplace=True)

In [5]:
scaler = None
pca = None
def data_preprocessing(X, y, fit):
    # Log1p
    log_transformer = FunctionTransformer(np.log1p, validate=True)
    X = log_transformer.transform(X)
    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)

    # Scale
    global scaler
    if fit:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)
    else:
        X = scaler.transform(X)

    # PCA 99%
    global pca
    if fit:
        pca = PCA(n_components=0.99)
        X = pca.fit_transform(X)
        print(f"Số đặc trưng ban đầu: {X.shape[1]}")
        print(f"Số đặc trưng sau PCA (giữ 99% variance): {pca.n_components_}")
        print(f"Tỉ lệ variance giữ lại: {sum(pca.explained_variance_ratio_):.4f}")
    else:
        X = pca.transform(X)

    return X, y

In [6]:
X = data.drop(['diabetes'], axis=1)
y = data['diabetes']
X_processed, y_processed = data_preprocessing(X, y, fit=True)

Số đặc trưng ban đầu: 11
Số đặc trưng sau PCA (giữ 99% variance): 11
Tỉ lệ variance giữ lại: 0.9924


