In [6]:
# ============================================================
# from kan import KAN
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pennylane as qml
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, RobustScaler,MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, balanced_accuracy_score, average_precision_score,
    matthews_corrcoef, cohen_kappa_score, brier_score_loss, roc_auc_score
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import KFold

import random
import copy


from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# ============================================================
# 1. GLOBAL SETTINGS + SEEDING (REPRODUCIBILITY)
# ============================================================
batch_size = 32  # mini-batch size handed to the DataLoaders
seed = 42        # one seed shared by every random number generator below

# Seed Python, NumPy and PyTorch (CPU first, then every CUDA device),
# in this exact order.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(seed)

# cuDNN: request deterministic kernels and switch off the autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Preprocessing

In [7]:
def log_transform_skewed(df, except_cols, threshold=0.75):
    """Apply ``np.log1p`` to every strongly skewed column and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is left untouched; the transform is applied to a copy.
    except_cols : list of str
        Columns excluded from the skewness check; they are never transformed.
    threshold : float, default 0.75
        A column is transformed when the absolute value of its sample
        skewness is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each selected column holds ``np.log1p`` of its
        original values. All other columns are unchanged.
    """
    # Work on a copy so the caller's frame is not silently modified.
    out = df.copy()

    skew = out.drop(columns=except_cols).skew()
    skew_cols = skew[skew.abs() > threshold].index.tolist()

    # NOTE: log1p is NaN for values below -1 and -inf at exactly -1.
    for col in skew_cols:
        out[col] = np.log1p(out[col])
    return out

def apply_imputation(df):
    """Fill missing values in ``df`` with scikit-learn's ``IterativeImputer``.

    A fresh imputer (``random_state=42``) is fitted on ``df`` itself and then
    used to transform it, so this helper must only be given data that may be
    used for fitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing entries should be imputed.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column labels and the same index as
        ``df``.
    """
    imputer = IterativeImputer(random_state=42)
    arr = imputer.fit_transform(df)
    # fit_transform returns a bare ndarray. Re-attach the index as well as the
    # columns so the rows still line up with the input (e.g. with labels that
    # were split off earlier) instead of being renumbered 0..n-1.
    # NOTE(review): by default the imputer discards features that are entirely
    # missing; in that case the column count would not match -- confirm inputs.
    return pd.DataFrame(arr, columns=df.columns, index=df.index)


def balance_data(df, label_col="Sickness"):
    """Return a shuffled frame with equally many label-0 and label-1 rows.

    Rows whose label equals 1 are kept as they are. Rows whose label equals 0
    are drawn with replacement until there are as many of them as label-1
    rows. Rows carrying any other label are not part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the label column.
    label_col : str, default "Sickness"
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        The combined rows, shuffled with a fixed seed and re-indexed from 0.
    """
    labels = df[label_col]
    kept_rows = df[labels == 1]
    rows_to_resample = df[labels == 0]

    resampled_rows = resample(
        rows_to_resample,
        replace=True,
        n_samples=len(kept_rows),
        random_state=42,
    )

    combined = pd.concat([kept_rows, resampled_rows], axis=0)
    shuffled = combined.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

In [8]:
# Load the raw ILPD table from the working directory.
df = pd.read_csv("IndianLiverPatientDataset(ILPD).csv")

In [9]:
# Load the raw ILPD table and recode two columns in one chained step:
#   Gender   : "Male" -> 0, "Female" -> 1
#   Sickness : label 2 is rewritten to 0, every other value is kept
df = pd.read_csv("IndianLiverPatientDataset(ILPD).csv").assign(
    Gender=lambda frame: frame["Gender"].map({"Male": 0, "Female": 1}),
    Sickness=lambda frame: frame["Sickness"].replace(2, 0),
)

In [10]:
import pandas as pd
import numpy as np

# The 18 rows that were just sent, laid out in the same column order as df.
rows_18 = pd.DataFrame(
    [
        [18,0,1.8,0.7,178,35,36,6.8,3.6,1.10,1],
        [17,0,0.9,0.2,224,36,45,6.9,4.2,1.55,1],
        [24,0,1.0,0.2,189,52,31,8.0,4.8,1.50,1],
        [60,0,2.2,1.0,271,45,52,6.1,2.9,0.90,0],
        [60,0,0.8,0.2,215,24,17,6.3,3.0,0.90,0],
        [38,1,2.6,1.2,410,59,57,5.6,3.0,0.80,0],
        [35,0,2.0,1.1,226,33,135,6.0,2.7,0.80,0],
        [11,0,0.7,0.1,592,26,29,7.1,4.2,1.40,0],
        [65,0,0.7,0.2,265,30,28,5.2,1.8,0.52,0],
        [36,0,5.3,2.3,145,32,92,5.1,2.6,1.00,0],
        [48,0,0.7,0.2,208,15,30,4.6,2.1,0.80,0],
        [65,0,1.4,0.6,260,28,24,5.2,2.2,0.70,0],
        [62,0,0.6,0.1,160,42,110,4.9,2.6,1.10,0],
        [65,0,0.8,0.2,201,18,22,5.4,2.9,1.10,0],
        [17,1,0.7,0.2,145,18,36,7.2,3.9,1.18,0],
        [62,0,0.7,0.2,162,12,17,8.2,3.2,0.60,0],
        [65,0,1.9,0.8,170,36,43,3.8,1.4,0.58,0],
        [23,1,2.3,0.8,509,28,44,6.9,2.9,0.7,0],
    ],
    columns=df.columns,
)


def _matching_row_indices(frame, row):
    """Return the index labels of the rows in ``frame`` equal to ``row`` in every column."""
    is_match = (frame == row).all(axis=1)
    return frame.index[is_match].tolist()


# Find where each of those rows sits in the original df:
# one (position in rows_18, [matching df index labels]) pair per row.
matching_indices = [
    (pos, _matching_row_indices(df, rows_18.iloc[pos]))
    for pos in range(len(rows_18))
]

matching_indices


[(0, [134]),
 (1, [102]),
 (2, [496]),
 (3, [367]),
 (4, [69]),
 (5, [33, 34]),
 (6, [474]),
 (7, [417]),
 (8, [493]),
 (9, [105, 106]),
 (10, [413]),
 (11, [414]),
 (12, [41]),
 (13, [145]),
 (14, [36]),
 (15, [532]),
 (16, [182]),
 (17, [411])]

In [14]:
# Reload the raw table so this cell starts from the file, not from whatever
# an earlier cell left in `df`.
df = pd.read_csv("IndianLiverPatientDataset(ILPD).csv")

# Recode: Gender "Male" -> 0 / "Female" -> 1, Sickness label 2 -> 0.
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
df['Sickness'] = df['Sickness'].replace(2, 0)

# Missing 'A/G' values are deliberately left as NaN at this point. Filling
# them with the mean of the whole table would use validation/test rows before
# the split (data leakage) and would leave nothing for the IterativeImputer
# that is fitted on the training split further down.

# NOTE(review): which columns count as "skewed" is still decided on the full
# table, i.e. before the split -- confirm this is acceptable.
df = log_transform_skewed(df, except_cols=["Sickness", "Gender"])

# Index labels (as in the freshly loaded CSV) of the rows to exclude.
# NOTE(review): the row lookup earlier in the notebook also matched index 69
# and index 41, which are not listed here -- confirm the omission is intended.
drop_idx = [134, 102, 496, 367, 33, 34, 474, 417, 493, 105, 106, 413, 414, 145, 36, 532, 182, 411]

# errors="ignore" already skips labels that are not present, so no manual
# membership filter is needed.
df = df.drop(index=drop_idx, errors="ignore")
df = df.drop_duplicates().reset_index(drop=True)


# Hold out 20% of the rows as the test set, stratified on the label.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["Sickness"], random_state=42
)

# Optional oversampling step: keep it if you want it, otherwise leave it
# commented out. The test set must never be oversampled.
# Note that resampling here happens before the train/validation split below,
# so copies of one row could end up on both sides of that split.
# train_df = balance_data(train_df)

# Split the remaining rows 75% / 25% into training and validation sets
# (60% / 20% / 20% of the data overall), again stratified on the label.
train_split_df, val_split_df = train_test_split(
    train_df, test_size=0.25, stratify=train_df["Sickness"], random_state=42
)

# Separate the features from the "Sickness" label for each split.
X_train_df, y_train_df = train_split_df.drop(columns="Sickness"), train_split_df["Sickness"]
X_val_df, y_val_df = val_split_df.drop(columns="Sickness"), val_split_df["Sickness"]
X_test_df, y_test_df = test_df.drop(columns="Sickness"), test_df["Sickness"]


# ==========================================================
# 5. FIT IMPUTER ONLY ON TRAIN_SPLIT
# ==========================================================
# The imputer learns from the training features only; the fitted imputer is
# then applied unchanged to the training, validation and test features.
imputer = IterativeImputer(random_state=42).fit(X_train_df)

X_train_imp, X_val_imp, X_test_imp = (
    imputer.transform(features)
    for features in (X_train_df, X_val_df, X_test_df)
)


# ==========================================================
# 6. FIT SCALER ONLY ON TRAIN_SPLIT
# ==========================================================
# Standardisation statistics come from the imputed training features only and
# are reused as-is for the validation and test features.
scaler = StandardScaler().fit(X_train_imp)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features)
    for features in (X_train_imp, X_val_imp, X_test_imp)
)


# ==========================================================
# 7. FIT PCA, FA, LDA ONLY ON TRAIN_SPLIT
# ==========================================================
# Two unsupervised 7-component reducers ...
pca = PCA(n_components=7, random_state=42).fit(X_train_scaled)
fa = FactorAnalysis(n_components=7, random_state=42).fit(X_train_scaled)

# ... and one supervised 1-component projection, which also needs the labels.
lda = LinearDiscriminantAnalysis(n_components=1).fit(X_train_scaled, y_train_df)


# ==========================================================
# 8. Transform all sets
# ==========================================================
def _project_features(X_scaled):
    """Concatenate the PCA, FA and LDA projections of ``X_scaled`` column-wise."""
    return np.concatenate(
        [reducer.transform(X_scaled) for reducer in (pca, fa, lda)],
        axis=1,
    )


# Same fitted reducers for every split: 7 PCA + 7 FA + 1 LDA columns.
X_train_final = _project_features(X_train_scaled)
X_val_final = _project_features(X_val_scaled)
X_test_final = _project_features(X_test_scaled)


# ==========================================================
# 9. Convert to tensors + π/2 scaling
# ==========================================================
def _scaled_feature_tensor(features):
    """Build a float32 tensor from ``features`` and multiply it in place by pi/2."""
    return torch.tensor(features, dtype=torch.float32).mul_(np.pi / 2)


def _label_column_tensor(labels):
    """Build a float32 tensor of shape (n, 1) from a label Series."""
    return torch.tensor(labels.values, dtype=torch.float32).unsqueeze(1)


X_train = _scaled_feature_tensor(X_train_final)
X_val = _scaled_feature_tensor(X_val_final)
X_test = _scaled_feature_tensor(X_test_final)

y_train = _label_column_tensor(y_train_df)
y_val = _label_column_tensor(y_val_df)
y_test = _label_column_tensor(y_test_df)


# ==========================================================
# 10. DataLoaders
# ==========================================================
batch_size = 32


def _make_loader(features, labels, shuffle):
    """Wrap a feature/label tensor pair in a DataLoader that uses ``batch_size``."""
    return DataLoader(
        TensorDataset(features, labels),
        batch_size=batch_size,
        shuffle=shuffle,
    )


# Only the training batches are shuffled; validation and test keep row order.
train_loader = _make_loader(X_train, y_train, shuffle=True)
val_loader = _make_loader(X_val, y_val, shuffle=False)
test_loader = _make_loader(X_test, y_test, shuffle=False)

# Sanity check: one line per tensor, features then labels for each split.
print(
    X_train.size(),
    y_train.size(),
    X_val.size(),
    y_val.size(),
    X_test.size(),
    y_test.size(),
    sep="\n",
)


torch.Size([332, 15])
torch.Size([332, 1])
torch.Size([111, 15])
torch.Size([111, 1])
torch.Size([111, 15])
torch.Size([111, 1])
