In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, balanced_accuracy_score, average_precision_score,
    matthews_corrcoef, cohen_kappa_score, brier_score_loss, roc_auc_score
)
from sklearn.model_selection import train_test_split
random.seed(42)
np.random.seed(42)


In [2]:
# Load dataset
df=pd.read_csv("IndianLiverPatientDataset(ILPD).csv")

In [3]:
def replace_outliers_with_median(df):
    df_out = df.copy()
    for col in df_out.columns:
        if np.issubdtype(df_out[col].dtype, np.number):

            mean = df_out[col].mean()
            std = df_out[col].std()
            if std == 0:  # 
                continue

            z = (df_out[col] - mean) / std
            median = df_out[col].median()

            df_out.loc[np.abs(z) > 3, col] = median
    return df_out

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier







In [5]:
# model = RandomForestClassifier(n_estimators=100, random_state=42)
model = LogisticRegression(max_iter=2000)
# model = KNeighborsClassifier(n_neighbors=5)
# model = SVC(kernel="rbf", probability=True)
# model = GradientBoostingClassifier()


# model = XGBClassifier(
#     n_estimators=300,
#     learning_rate=0.05,
#     max_depth=5,
#     subsample=0.8,
#     colsample_bytree=0.8
# )
# model = LGBMClassifier(n_estimators=300)

# model = CatBoostClassifier(verbose=0)
# model = VotingClassifier(
#     estimators=[
#         ('lr', LogisticRegression(max_iter=2000)),
#         ('rf', RandomForestClassifier(n_estimators=200)),
#         ('svm', SVC(probability=True))
#     ],
#     voting='soft'
# )
# model = StackingClassifier(
#     estimators=[
#         ('lr', LogisticRegression(max_iter=2000)),
#         ('rf', RandomForestClassifier(n_estimators=200))
#     ],
#     final_estimator=LogisticRegression()
# )

# model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000)


## Basic ML

In [6]:
df_cleaned = df.copy()
df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})

df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)

df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())

# Apply outlier replacement
df_cleaned = replace_outliers_with_median(df_cleaned)

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)    

    # --- CLASSIFIER (Random Forest or Voting) ---
    
    model.fit(X_tr_scaled, y_tr)

    y_pred = model.predict(X_val_scaled)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])

fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.6809, PREC=0.7317, REC=0.8824, F1=0.8000, AUC=0.5181
Fold 2: ACC=0.7447, PREC=0.7500, REC=0.9706, F1=0.8462, AUC=0.5622
Fold 3: ACC=0.7447, PREC=0.7500, REC=0.9706, F1=0.8462, AUC=0.5622
Fold 4: ACC=0.6596, PREC=0.7073, REC=0.8788, F1=0.7838, AUC=0.5108
Fold 5: ACC=0.7021, PREC=0.7317, REC=0.9091, F1=0.8108, AUC=0.5617
Fold 6: ACC=0.7234, PREC=0.7500, REC=0.9091, F1=0.8219, AUC=0.5974
Fold 7: ACC=0.6739, PREC=0.7250, REC=0.8788, F1=0.7945, AUC=0.5163
Fold 8: ACC=0.7174, PREC=0.7381, REC=0.9394, F1=0.8267, AUC=0.5466
Fold 9: ACC=0.7826, PREC=0.7805, REC=0.9697, F1=0.8649, AUC=0.6387
Fold 10: ACC=0.7609, PREC=0.7500, REC=1.0000, F1=0.8571, AUC=0.5769

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.7190
Precision: 0.7414
Recall:    0.9308
F1-score:  0.8252
AUC:       0.5591


## Integrated feautres : PCA, LDA, FA . No Balanced

### PCA

In [7]:
df_cleaned = df.copy()

df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})
df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)
df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())
df_cleaned = replace_outliers_with_median(df_cleaned)

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # --- PCA (retain 95% variance) ---
    pca = PCA(n_components=0.95, svd_solver="full")
    X_tr_pca = pca.fit_transform(X_tr_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    
    
    model.fit(X_tr_pca, y_tr)

    y_pred = model.predict(X_val_pca)

    
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])


fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.7021, PREC=0.7381, REC=0.9118, F1=0.8158, AUC=0.5328
Fold 2: ACC=0.7234, PREC=0.7333, REC=0.9706, F1=0.8354, AUC=0.5238
Fold 3: ACC=0.7660, PREC=0.7556, REC=1.0000, F1=0.8608, AUC=0.5769
Fold 4: ACC=0.6170, PREC=0.6923, REC=0.8182, F1=0.7500, AUC=0.4805
Fold 5: ACC=0.7234, PREC=0.7381, REC=0.9394, F1=0.8267, AUC=0.5768
Fold 6: ACC=0.7447, PREC=0.7561, REC=0.9394, F1=0.8378, AUC=0.6126
Fold 7: ACC=0.6739, PREC=0.7250, REC=0.8788, F1=0.7945, AUC=0.5163
Fold 8: ACC=0.6957, PREC=0.7317, REC=0.9091, F1=0.8108, AUC=0.5315
Fold 9: ACC=0.7609, PREC=0.7750, REC=0.9394, F1=0.8493, AUC=0.6235
Fold 10: ACC=0.7609, PREC=0.7500, REC=1.0000, F1=0.8571, AUC=0.5769

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.7168
Precision: 0.7395
Recall:    0.9307
F1-score:  0.8238
AUC:       0.5552


### FA

In [8]:
df_cleaned = df.copy()

df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})
df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)
df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())
df_cleaned = replace_outliers_with_median(df_cleaned)

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # --- Factor Analysis (paper uses 7 factors) ---
    fa = FactorAnalysis(n_components=7, random_state=42)
    X_tr_fa = fa.fit_transform(X_tr_scaled)
    X_val_fa = fa.transform(X_val_scaled)

    # --- CLASSIFIER (Random Forest or Voting) ---
    
    model.fit(X_tr_fa, y_tr)

    y_pred = model.predict(X_val_fa)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])


fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.7021, PREC=0.7381, REC=0.9118, F1=0.8158, AUC=0.5328
Fold 2: ACC=0.7447, PREC=0.7500, REC=0.9706, F1=0.8462, AUC=0.5622
Fold 3: ACC=0.7234, PREC=0.7561, REC=0.9118, F1=0.8267, AUC=0.5713
Fold 4: ACC=0.7021, PREC=0.7209, REC=0.9394, F1=0.8158, AUC=0.5411
Fold 5: ACC=0.6809, PREC=0.7250, REC=0.8788, F1=0.7945, AUC=0.5465
Fold 6: ACC=0.6809, PREC=0.7250, REC=0.8788, F1=0.7945, AUC=0.5465
Fold 7: ACC=0.6957, PREC=0.7111, REC=0.9697, F1=0.8205, AUC=0.4848
Fold 8: ACC=0.7609, PREC=0.7619, REC=0.9697, F1=0.8533, AUC=0.6002
Fold 9: ACC=0.7391, PREC=0.7442, REC=0.9697, F1=0.8421, AUC=0.5618
Fold 10: ACC=0.7174, PREC=0.7174, REC=1.0000, F1=0.8354, AUC=0.5000

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.7147
Precision: 0.7350
Recall:    0.9400
F1-score:  0.8245
AUC:       0.5447


### LDA

In [9]:
df_cleaned = df.copy()

df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})
df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)
df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())
df_cleaned = replace_outliers_with_median(df_cleaned)

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []


for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)


    # --- LDA (1 component for binary classification) ---
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_tr_lda = lda.fit_transform(X_tr_scaled, y_tr)
    X_val_lda = lda.transform(X_val_scaled)


    # --- CLASSIFIER (Random Forest or Voting) ---
    
    model.fit(X_tr_lda, y_tr)

    y_pred = model.predict(X_val_lda)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])


fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.6383, PREC=0.7179, REC=0.8235, F1=0.7671, AUC=0.4887
Fold 2: ACC=0.7660, PREC=0.7556, REC=1.0000, F1=0.8608, AUC=0.5769
Fold 3: ACC=0.7234, PREC=0.7442, REC=0.9412, F1=0.8312, AUC=0.5475
Fold 4: ACC=0.6383, PREC=0.7000, REC=0.8485, F1=0.7671, AUC=0.4957
Fold 5: ACC=0.7021, PREC=0.7436, REC=0.8788, F1=0.8056, AUC=0.5823
Fold 6: ACC=0.7447, PREC=0.7561, REC=0.9394, F1=0.8378, AUC=0.6126
Fold 7: ACC=0.6739, PREC=0.7250, REC=0.8788, F1=0.7945, AUC=0.5163
Fold 8: ACC=0.7391, PREC=0.7838, REC=0.8788, F1=0.8286, AUC=0.6317
Fold 9: ACC=0.7174, PREC=0.7500, REC=0.9091, F1=0.8219, AUC=0.5699
Fold 10: ACC=0.7609, PREC=0.7500, REC=1.0000, F1=0.8571, AUC=0.5769

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.7104
Precision: 0.7426
Recall:    0.9098
F1-score:  0.8172
AUC:       0.5598


### Integrated features

In [None]:
df_cleaned = df.copy()

df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})
df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)
df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())
df_cleaned = replace_outliers_with_median(df_cleaned)

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # --- PCA (retain 95% variance) ---
    pca = PCA(n_components=0.95, svd_solver="full")
    X_tr_pca = pca.fit_transform(X_tr_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    # --- Factor Analysis (paper uses 7 factors) ---
    fa = FactorAnalysis(n_components=7, random_state=42)
    X_tr_fa = fa.fit_transform(X_tr_scaled)
    X_val_fa = fa.transform(X_val_scaled)

    # --- LDA (1 component for binary classification) ---
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_tr_lda = lda.fit_transform(X_tr_scaled, y_tr)
    X_val_lda = lda.transform(X_val_scaled)

    # --- CONCATENATE FEATURES ---
    X_tr_final = np.concatenate([X_tr_pca, X_tr_fa, X_tr_lda], axis=1)
    X_val_final = np.concatenate([X_val_pca, X_val_fa, X_val_lda], axis=1)

    # --- CLASSIFIER (Random Forest or Voting) ---
    
    model.fit(X_tr_final, y_tr)

    y_pred = model.predict(X_val_final)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])

# =========================
# STEP 4 â€” AVERAGE RESULTS
# =========================
fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.9104 PREC=0.8857 REC=0.9394 F1=0.9118 AUC=0.9109
Fold 2: ACC=0.7910 PREC=0.7879 REC=0.7879 F1=0.7879 AUC=0.7910
Fold 3: ACC=0.8955 PREC=0.9655 REC=0.8235 F1=0.8889 AUC=0.8966
Fold 4: ACC=0.9403 PREC=0.9167 REC=0.9706 F1=0.9429 AUC=0.9398
Fold 5: ACC=0.8358 PREC=0.8966 REC=0.7647 F1=0.8254 AUC=0.8369
Fold 6: ACC=0.9091 PREC=0.9355 REC=0.8788 F1=0.9062 AUC=0.9091
Fold 7: ACC=0.8636 PREC=0.9286 REC=0.7879 F1=0.8525 AUC=0.8636
Fold 8: ACC=0.8788 PREC=0.8571 REC=0.9091 F1=0.8824 AUC=0.8788
Fold 9: ACC=0.8333 PREC=0.8667 REC=0.7879 F1=0.8254 AUC=0.8333
Fold 10: ACC=0.8788 PREC=0.9310 REC=0.8182 F1=0.8710 AUC=0.8788

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.8737
Precision: 0.8971
Recall:    0.8468
F1-score:  0.8694
AUC:       0.8739
