In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample

from sklearn.ensemble import ExtraTreesClassifier

# ===============================================================
# FUNCTIONS
# ===============================================================

def log_transform_skewed(df, except_cols, threshold=0.75):
    """Return a copy of ``df`` with strongly skewed numeric columns log1p-transformed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a transformed copy is returned.
    except_cols : list of str
        Columns that must never be transformed (e.g. the label and encoded
        categoricals). Each name must exist in ``df``.
    threshold : float, default 0.75
        A column is transformed when the absolute value of its sample
        skewness is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where every selected column ``c`` is replaced by
        ``np.log1p(c)``.

    Notes
    -----
    Columns containing a value <= -1 are left unchanged, because ``log1p``
    is undefined there and would otherwise inject NaN / -inf into the data.
    """
    out = df.copy()

    # Skewness is only meaningful for numeric columns.
    candidates = out.drop(columns=except_cols).select_dtypes(include="number")
    skew = candidates.skew()
    skew_cols = skew[skew.abs() > threshold].index

    for col in skew_cols:
        # log1p(x) requires x > -1; skip columns outside that domain.
        if candidates[col].min() > -1:
            out[col] = np.log1p(out[col])
    return out


def apply_imputation(df):
    """Fill missing values in ``df`` with a freshly fitted ``IterativeImputer``.

    The imputer is fitted on ``df`` itself (``random_state=42``) and every
    column of ``df`` takes part in the fit, so callers should pass only the
    rows and columns the imputer is allowed to learn from.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing only numeric columns.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column labels and the same row index
        as ``df``.
    """
    imputer = IterativeImputer(random_state=42)
    arr = imputer.fit_transform(df)
    # Keep the caller's index so the result stays aligned with the input
    # rows even when ``df`` was filtered or shuffled beforehand.
    return pd.DataFrame(arr, columns=df.columns, index=df.index)


def balance_data(df, label_col="Sickness"):
    """Balance a binary 0/1 dataset by oversampling the smaller class.

    Rows whose label equals 1 and rows whose label equals 0 are separated;
    the smaller group is resampled with replacement (``random_state=42``)
    until it has as many rows as the larger one. Rows with any other label
    value are not included in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``label_col``.
    label_col : str, default "Sickness"
        Name of the binary label column.

    Returns
    -------
    pandas.DataFrame
        Shuffled (``random_state=42``) frame with a fresh 0..n-1 index in
        which both classes have the same number of rows.

    Raises
    ------
    ValueError
        If one of the two classes has no rows, since there is nothing to
        resample from.
    """
    positives = df[df[label_col] == 1]
    negatives = df[df[label_col] == 0]

    # Decide by size which class needs upsampling instead of assuming that
    # label 1 is always the larger one. Ties keep label 1 as the reference.
    if len(positives) >= len(negatives):
        majority, minority = positives, negatives
    else:
        majority, minority = negatives, positives

    if minority.empty:
        raise ValueError(
            f"Cannot balance on {label_col!r}: both label values 0 and 1 "
            "must be present."
        )

    minority_up = resample(
        minority,
        replace=True,
        n_samples=len(majority),
        random_state=42
    )

    df_bal = pd.concat([majority, minority_up], axis=0)
    return df_bal.sample(frac=1, random_state=42).reset_index(drop=True)


# ===============================================================
# LOAD & CLEAN ILPD
# ===============================================================

df = pd.read_csv("IndianLiverPatientDataset(ILPD).csv")

df_cleaned = df.copy()

# Encode Gender numerically and fold label value 2 into 0 so the target is 0/1.
df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})
df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)

# Skew transform (uses no label information, so it runs before the split)
df_cleaned = log_transform_skewed(df_cleaned, ["Sickness", "Gender"])


# ===============================================================
# TRAIN-TEST SPLIT
# ===============================================================

# Split BEFORE imputing and oversampling. Oversampling with replacement
# duplicates minority rows; doing it before the split puts copies of the
# same row in both partitions and inflates every held-out score.
# NOTE(review): assumes the label column has no missing values - confirm.
train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

feature_cols = [c for c in df_cleaned.columns if c != "Sickness"]

# Impute missing values: the imputer is fitted on the training features
# only (never on the label or on test rows) and then applied to both parts.
imputer = IterativeImputer(random_state=42)
train = pd.DataFrame(
    imputer.fit_transform(train[feature_cols]),
    columns=feature_cols,
    index=train.index,
).assign(Sickness=train["Sickness"])
test = pd.DataFrame(
    imputer.transform(test[feature_cols]),
    columns=feature_cols,
    index=test.index,
).assign(Sickness=test["Sickness"]).reset_index(drop=True)

# Balance the training partition only; the test set keeps its natural
# class distribution and contains no resampled rows.
train = balance_data(train, label_col="Sickness")

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]


# ===============================================================
# MODEL
# ===============================================================

# Extra-Trees ensemble: 100 trees, entropy split criterion, and a fixed
# seed so repeated runs build the same forest.
model = ExtraTreesClassifier(random_state=42, criterion="entropy", n_estimators=100)


# ===============================================================
# STRATIFIED 10-FOLD CV (PCA + FA + LDA)
# ===============================================================

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One row per fold: [accuracy, precision, recall, f1, auc]
fold_results = []

# NOTE(review): X_train is derived from data that was oversampled with
# replacement before this loop, so copies of the same minority row can fall
# into both the training and the validation part of a fold. The scores
# below are therefore optimistic estimates.
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # Every transformer below is fitted on the training part of the fold
    # only and then applied to the validation part.

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # --- PCA (retain 95% variance) ---
    pca = PCA(n_components=0.95, svd_solver="full", random_state=42)
    X_tr_pca = pca.fit_transform(X_tr_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    # --- Factor Analysis (7 factors) ---
    fa = FactorAnalysis(n_components=7, random_state=42)
    X_tr_fa = fa.fit_transform(X_tr_scaled)
    X_val_fa = fa.transform(X_val_scaled)

    # --- LDA (1 component for binary target) ---
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_tr_lda = lda.fit_transform(X_tr_scaled, y_tr)
    X_val_lda = lda.transform(X_val_scaled)

    # --- CONCATENATE (PCA + FA + LDA) ---
    X_tr_final = np.concatenate([X_tr_pca, X_tr_fa, X_tr_lda], axis=1)
    X_val_final = np.concatenate([X_val_pca, X_val_fa, X_val_lda], axis=1)

    # --- Train classifier ---
    model.fit(X_tr_final, y_tr)

    # --- Predict ---
    y_pred = model.predict(X_val_final)
    # Probability of the class with the greater label (classes_[1]); ROC AUC
    # needs a continuous score, not the thresholded 0/1 prediction.
    y_score = model.predict_proba(X_val_final)[:, 1]

    # --- Metrics ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_score)

    print(f"Fold {fold}: ACC={acc:.4f} PREC={prec:.4f} REC={rec:.4f} F1={f1:.4f} AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])

# ===============================================================
# SHOW MEAN METRICS
# ===============================================================

# Stack the per-fold rows into a 2-D array: one row per fold, one column
# per metric, in the order the rows were appended.
fold_results = np.array(fold_results)

print("\n=== 10-FOLD CV MEAN METRICS ===")
# Labels carry their own padding so the values line up in one column.
for column, label in enumerate(
    ("Accuracy:  ", "Precision: ", "Recall:    ", "F1-score:  ", "AUC:       ")
):
    print(f"{label}{fold_results[:, column].mean():.4f}")


Fold 1: ACC=0.9254 PREC=0.9118 REC=0.9394 F1=0.9254 AUC=0.9256
Fold 2: ACC=0.8358 PREC=0.8929 REC=0.7576 F1=0.8197 AUC=0.8347
Fold 3: ACC=0.8358 PREC=0.9259 REC=0.7353 F1=0.8197 AUC=0.8373
Fold 4: ACC=0.8507 PREC=0.9000 REC=0.7941 F1=0.8438 AUC=0.8516
Fold 5: ACC=0.8209 PREC=0.9231 REC=0.7059 F1=0.8000 AUC=0.8226
Fold 6: ACC=0.9091 PREC=0.9355 REC=0.8788 F1=0.9062 AUC=0.9091
Fold 7: ACC=0.8333 PREC=0.8929 REC=0.7576 F1=0.8197 AUC=0.8333
Fold 8: ACC=0.9091 PREC=0.9655 REC=0.8485 F1=0.9032 AUC=0.9091
Fold 9: ACC=0.8182 PREC=0.9200 REC=0.6970 F1=0.7931 AUC=0.8182
Fold 10: ACC=0.8182 PREC=0.9200 REC=0.6970 F1=0.7931 AUC=0.8182

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.8557
Precision: 0.9187
Recall:    0.7811
F1-score:  0.8424
AUC:       0.8560
