In [4]:
# ============================================================
# from kan import KAN
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pennylane as qml
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, RobustScaler,MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, balanced_accuracy_score, average_precision_score,
    matthews_corrcoef, cohen_kappa_score, brier_score_loss, roc_auc_score
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import KFold

import random
import copy


from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# ============================================================
# 1. SEEDING (REPRODUCIBILITY)
# ============================================================
# Notebook-wide constants: the seed handed to every RNG below and the
# mini-batch size.
seed = 42
batch_size = 32

# cuDNN flags: request deterministic kernels and switch off the benchmark
# auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed PyTorch (CPU, then every CUDA device), NumPy's global RNG and the
# Python standard-library RNG with the same value.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

## Preprocessing

In [5]:
def log_transform_skewed(df, except_cols):
    """Return a copy of ``df`` with ``log1p`` applied to its skewed columns.

    Skewness is measured on the numeric columns of ``df`` that are not
    listed in ``except_cols``; every column whose absolute skewness exceeds
    0.75 is replaced by ``np.log1p`` of itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the transformation is applied
        to a copy.
    except_cols : list of str
        Columns to leave out of the skewness check (e.g. the label and
        categorical codes). Every name must exist in ``df``, otherwise
        ``DataFrame.drop`` raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.

    Notes
    -----
    ``np.log1p`` yields NaN for values below -1 and -inf at exactly -1, so
    the skewed columns are expected to hold values greater than -1.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    result = df.copy()

    # numeric_only=True keeps a stray non-numeric column from raising here.
    skewness = result.drop(columns=except_cols).skew(numeric_only=True)
    skewed_cols = skewness[skewness.abs() > 0.75].index.tolist()

    for col in skewed_cols:
        result[col] = np.log1p(result[col])
    return result

def apply_imputation(df):
    """Fill missing values in ``df`` with a multivariate iterative imputer.

    A fresh ``IterativeImputer(random_state=42)`` is fitted on ``df`` and
    immediately used to transform it, so every column passed in takes part
    in the imputation model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
        NOTE(review): presumably all columns must be numeric for the
        imputer to accept them - confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        New frame holding the imputed values, with the same column labels
        and the same row index as ``df``.
    """
    imputer = IterativeImputer(random_state=42)
    imputed = imputer.fit_transform(df)
    # Re-attach the row index along with the column labels so the result
    # stays aligned with ``df`` even when its index is not 0..n-1.
    return pd.DataFrame(imputed, columns=df.columns, index=df.index)


def balance_data(df, label_col="Sickness"):
    """Balance a binary (0/1) dataset by upsampling the smaller class.

    The rows labelled 1 and the rows labelled 0 are separated, the smaller
    group is resampled with replacement until it matches the larger one,
    and the combined rows are shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``label_col``. It is not modified.
    label_col : str, default "Sickness"
        Name of the label column. Rows whose label is neither 0 nor 1 are
        not carried over into the result.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with an equal number of 0 and 1 rows and a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If exactly one of the two classes has no rows, since there is
        nothing to resample from.
    """
    ones = df[df[label_col] == 1]
    zeros = df[df[label_col] == 0]

    # Pick the majority by actual size instead of assuming it is class 1.
    # On a tie class 1 is still treated as the majority, as before.
    if len(ones) >= len(zeros):
        majority, minority = ones, zeros
    else:
        majority, minority = zeros, ones

    if len(minority) == 0 and len(majority) > 0:
        raise ValueError(
            f"balance_data needs rows for both classes 0 and 1 in '{label_col}'"
        )

    # Fixed random_state keeps both the drawn duplicates and the final
    # shuffle reproducible.
    minority_up = resample(minority,
                           replace=True,
                           n_samples=len(majority),
                           random_state=42)

    df_bal = pd.concat([majority, minority_up], axis=0)
    return df_bal.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
# Raw dataset, read from a CSV located in the notebook's working directory.
# Later cells expect at least the 'Gender', 'Sickness' and 'A/G' columns.
df = pd.read_csv(
    "IndianLiverPatientDataset(ILPD).csv"
)

In [7]:
# ===============================================================
# STEP 1: CLEAN AND ENCODE
# ===============================================================

# Start from a copy of the raw frame so this cell can be re-run safely.
df_cleaned = df.copy()

# Encode gender as 0/1 and fold the raw label value 2 into 0; all other
# label values are left as they are.
df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})
df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)

df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())


# Transform skewed excluding label + gender
df_cleaned = log_transform_skewed(df_cleaned, ["Sickness", "Gender"])

# Imputation
# NOTE(review): the A/G mean fill, the skewness check and this imputer are
# all computed on the full dataset (the imputer also sees the label column).
# Consider fitting them on the training split only.
df_cleaned = apply_imputation(df_cleaned)

# ===============================================================
# TRAIN-TEST SPLIT
# ===============================================================
# The split happens BEFORE any oversampling. Balancing the whole dataset
# first would put copies of the same minority rows into train, validation
# and test at once and inflate every held-out metric.

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_test_df = test.drop("Sickness", axis=1)
y_test_df = test["Sickness"]

# Shuffle with a local, explicitly seeded generator so re-running this cell
# reproduces the same split regardless of the global NumPy RNG state.
shuffle_idx = np.random.RandomState(42).permutation(len(train))
train_shuffled = train.iloc[shuffle_idx]

X_train_df = train_shuffled.drop("Sickness", axis=1)
y_train_df = train_shuffled["Sickness"]

# ===============================================================
# STEP 2: CUT 25% FOR VALIDATION (same idea as Keras validation_split)
# ===============================================================

dataset_size = len(X_train_df)
val_size = int(0.25 * dataset_size)
train_size = dataset_size - val_size

# The last `val_size` shuffled rows become the validation split and keep
# the original class distribution, exactly like the test split.
X_val_split_df = X_train_df.iloc[train_size:]
y_val_split_df = y_train_df.iloc[train_size:]

# Balance dataset: oversample the minority class in the training split only.
# After this step the training split holds more rows than `train_size`.
train_split_balanced = balance_data(
    train_shuffled.iloc[:train_size],
    label_col="Sickness"
)

X_train_split_df = train_split_balanced.drop("Sickness", axis=1)
y_train_split_df = train_split_balanced["Sickness"]


# ===============================================================
# STEP 3: FIT PREPROCESSING ONLY ON TRAIN_SPLIT
# ===============================================================
# Each transformer is fitted on the training split and then applied,
# unchanged, to the validation and test splits.

# --- Standard Scaler ---
scaler = StandardScaler()
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.fit_transform(X_train_split_df),
    scaler.transform(X_val_split_df),
    scaler.transform(X_test_df),
)

# --- PCA --- (7 components from the scaled features)
pca = PCA(n_components=7)
X_train_pca, X_val_pca, X_test_pca = (
    pca.fit_transform(X_train_scaled),
    pca.transform(X_val_scaled),
    pca.transform(X_test_scaled),
)

# --- Factor Analysis --- (7 factors from the scaled features)
fa = FactorAnalysis(n_components=7)
X_train_fa, X_val_fa, X_test_fa = (
    fa.fit_transform(X_train_scaled),
    fa.transform(X_val_scaled),
    fa.transform(X_test_scaled),
)

# --- LDA --- (supervised: the training labels are used for fitting)
lda = LinearDiscriminantAnalysis(n_components=1)
X_train_lda, X_val_lda, X_test_lda = (
    lda.fit_transform(X_train_scaled, y_train_split_df),
    lda.transform(X_val_scaled),
    lda.transform(X_test_scaled),
)


# ===============================================================
# STEP 4: CONCATENATE PCA + FA + LDA
# ===============================================================
# Column-wise stack: 7 PCA + 7 FA + 1 LDA columns per sample.

X_train_final = np.hstack((X_train_pca, X_train_fa, X_train_lda))
X_val_final = np.hstack((X_val_pca, X_val_fa, X_val_lda))
X_test_final = np.hstack((X_test_pca, X_test_fa, X_test_lda))


# ===============================================================
# STEP 5: CONVERT TO TORCH TENSORS
# ===============================================================

# Features become float32 tensors and are multiplied by pi/2
# (same scaling as in your code).
X_train, X_val, X_test = (
    torch.tensor(features, dtype=torch.float32) * (np.pi / 2)
    for features in (X_train_final, X_val_final, X_test_final)
)

# Labels become float32 column tensors of shape (n_samples, 1).
y_train, y_val, y_test = (
    torch.tensor(labels.values, dtype=torch.float32).unsqueeze(1)
    for labels in (y_train_split_df, y_val_split_df, y_test_df)
)


# ===============================================================
# STEP 6: DATALOADERS
# ===============================================================

# NOTE: this re-declares the batch size that the seeding cell already set.
batch_size = 32

# Only the training loader reshuffles; validation and test keep a fixed
# order.
train_loader, val_loader, test_loader = (
    DataLoader(
        TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=reshuffle,
    )
    for features, targets, reshuffle in (
        (X_train, y_train, True),
        (X_val, y_val, False),
        (X_test, y_test, False),
    )
)