In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, balanced_accuracy_score, average_precision_score,
    matthews_corrcoef, cohen_kappa_score, brier_score_loss, roc_auc_score
)
random.seed(42)
np.random.seed(42)


In [2]:
# Load dataset
df=pd.read_csv("../data/Indian Liver Patient Dataset (ILPD).csv")

## Basic ML

In [None]:
df_cleaned = df.copy()
df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})

df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)

df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

# Reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Separate features and target for train and test sets
X_train = train.drop('Sickness', axis=1)
y_train = train['Sickness']
X_test = test.drop('Sickness', axis=1)
y_test = test['Sickness']

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 10-Fold Cross-Validation on training data
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train), 1):
    X_fold_train, X_fold_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    rf_model.fit(X_fold_train, y_fold_train)
    y_pred = rf_model.predict(X_fold_val)
    
    acc = accuracy_score(y_fold_val, y_pred)
    cv_scores.append(acc)
    print(f"Fold {fold}: Accuracy = {acc:.4f}")

print(f"\nMean CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

# Train final model on entire training set
rf_model.fit(X_train_scaled, y_train)

# Evaluate on test set
y_test_pred = rf_model.predict(X_test_scaled)

print("\n=== Test Set Performance ===")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_test_pred):.4f}")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_test_pred):.4f}")
print(f"ROC:{roc_auc_score(y_test, y_test_pred):.4f}")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")


Fold 1: Accuracy = 0.5957
Fold 2: Accuracy = 0.7660
Fold 3: Accuracy = 0.7447
Fold 4: Accuracy = 0.7021
Fold 5: Accuracy = 0.7447
Fold 6: Accuracy = 0.7872
Fold 7: Accuracy = 0.7174
Fold 8: Accuracy = 0.7174
Fold 9: Accuracy = 0.6739
Fold 10: Accuracy = 0.6957

Mean CV Accuracy: 0.7145 (+/- 0.0510)

=== Test Set Performance ===
Accuracy: 0.7350
Precision: 0.7653
Recall: 0.9036
F1-Score: 0.8287
Balanced Accuracy: 0.6136
ROC:0.6136

Confusion Matrix:
[[11 23]
 [ 8 75]]


## Integrated feautres : PCA, LDA, FA . No Balanced

In [9]:
# Load dataset
df=pd.read_csv("../data/Indian Liver Patient Dataset (ILPD).csv")

In [10]:
df_cleaned = df.copy()
df_cleaned['Gender'] = df_cleaned['Gender'].map({'Male': 0, 'Female': 1})

df_cleaned['Sickness'] = df_cleaned['Sickness'].replace(2, 0)

df_cleaned['A/G'] = df_cleaned['A/G'].fillna(df_cleaned['A/G'].mean())

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

# Reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

### PCA

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

# =========================
# STEP 0 — CLEANING
# =========================

def replace_outliers_with_median(df):
    df_out = df.copy()
    for col in df_out.columns:
        if np.issubdtype(df_out[col].dtype, np.number):

            mean = df_out[col].mean()
            std = df_out[col].std()
            if std == 0:  # avoid division by zero
                continue

            z = (df_out[col] - mean) / std
            median = df_out[col].median()

            df_out.loc[np.abs(z) > 3, col] = median
    return df_out

# Apply outlier replacement
df_cleaned = replace_outliers_with_median(df_cleaned)

# =========================
# STEP 1 — TRAIN TEST SPLIT
# =========================
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]

# =========================
# STEP 2 — CV SETUP
# =========================
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

# =========================
# STEP 3 — CROSS VALIDATION
# =========================
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # --- PCA (retain 95% variance) ---
    pca = PCA(n_components=0.95, svd_solver="full")
    X_tr_pca = pca.fit_transform(X_tr_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    

    # --- CLASSIFIER (Random Forest or Voting) ---
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr_pca, y_tr)

    y_pred = clf.predict(X_val_pca)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])

# =========================
# STEP 4 — AVERAGE RESULTS
# =========================
fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Mean CV Accuracy: 0.6630434782608696

=== Test Set Performance ===
Accuracy: 0.7179
Precision: 0.7604
Recall: 0.8795
F1-Score: 0.8156
Balanced Accuracy: 0.6015
ROC:0.6015

Confusion Matrix:
[[11 23]
 [10 73]]


### FA

In [31]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

# =========================
# STEP 0 — CLEANING
# =========================

def replace_outliers_with_median(df):
    df_out = df.copy()
    for col in df_out.columns:
        if np.issubdtype(df_out[col].dtype, np.number):

            mean = df_out[col].mean()
            std = df_out[col].std()
            if std == 0:  # avoid division by zero
                continue

            z = (df_out[col] - mean) / std
            median = df_out[col].median()

            df_out.loc[np.abs(z) > 3, col] = median
    return df_out

# Apply outlier replacement
df_cleaned = replace_outliers_with_median(df_cleaned)

# =========================
# STEP 1 — TRAIN TEST SPLIT
# =========================
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]

# =========================
# STEP 2 — CV SETUP
# =========================
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

# =========================
# STEP 3 — CROSS VALIDATION
# =========================
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # --- Factor Analysis (paper uses 7 factors) ---
    fa = FactorAnalysis(n_components=7, random_state=42)
    X_tr_fa = fa.fit_transform(X_tr_scaled)
    X_val_fa = fa.transform(X_val_scaled)

    # --- CLASSIFIER (Random Forest or Voting) ---
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr_fa, y_tr)

    y_pred = clf.predict(X_val_fa)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])

# =========================
# STEP 4 — AVERAGE RESULTS
# =========================
fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.7447, PREC=0.7619, REC=0.9412, F1=0.8421, AUC=0.5860
Fold 2: ACC=0.7234, PREC=0.7561, REC=0.9118, F1=0.8267, AUC=0.5713
Fold 3: ACC=0.7447, PREC=0.7619, REC=0.9412, F1=0.8421, AUC=0.5860
Fold 4: ACC=0.5957, PREC=0.6750, REC=0.8182, F1=0.7397, AUC=0.4448
Fold 5: ACC=0.6809, PREC=0.7143, REC=0.9091, F1=0.8000, AUC=0.5260
Fold 6: ACC=0.7660, PREC=0.8056, REC=0.8788, F1=0.8406, AUC=0.6894
Fold 7: ACC=0.6739, PREC=0.7368, REC=0.8485, F1=0.7887, AUC=0.5396
Fold 8: ACC=0.7391, PREC=0.7838, REC=0.8788, F1=0.8286, AUC=0.6317
Fold 9: ACC=0.7609, PREC=0.7619, REC=0.9697, F1=0.8533, AUC=0.6002
Fold 10: ACC=0.7609, PREC=0.7619, REC=0.9697, F1=0.8533, AUC=0.6002

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.7190
Precision: 0.7519
Recall:    0.9067
F1-score:  0.8215
AUC:       0.5775


### LDA

In [29]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

# =========================
# STEP 0 — CLEANING
# =========================

def replace_outliers_with_median(df):
    df_out = df.copy()
    for col in df_out.columns:
        if np.issubdtype(df_out[col].dtype, np.number):

            mean = df_out[col].mean()
            std = df_out[col].std()
            if std == 0:  # avoid division by zero
                continue

            z = (df_out[col] - mean) / std
            median = df_out[col].median()

            df_out.loc[np.abs(z) > 3, col] = median
    return df_out

# Apply outlier replacement
df_cleaned = replace_outliers_with_median(df_cleaned)

# =========================
# STEP 1 — TRAIN TEST SPLIT
# =========================
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]

# =========================
# STEP 2 — CV SETUP
# =========================
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

# =========================
# STEP 3 — CROSS VALIDATION
# =========================
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)


    # --- LDA (1 component for binary classification) ---
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_tr_lda = lda.fit_transform(X_tr_scaled, y_tr)
    X_val_lda = lda.transform(X_val_scaled)


    # --- CLASSIFIER (Random Forest or Voting) ---
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr_lda, y_tr)

    y_pred = clf.predict(X_val_lda)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])

# =========================
# STEP 4 — AVERAGE RESULTS
# =========================
fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.7021, PREC=0.7941, REC=0.7941, F1=0.7941, AUC=0.6278
Fold 2: ACC=0.6809, PREC=0.7568, REC=0.8235, F1=0.7887, AUC=0.5656
Fold 3: ACC=0.6170, PREC=0.7500, REC=0.7059, F1=0.7273, AUC=0.5452
Fold 4: ACC=0.5532, PREC=0.7143, REC=0.6061, F1=0.6557, AUC=0.5173
Fold 5: ACC=0.5745, PREC=0.6857, REC=0.7273, F1=0.7059, AUC=0.4708
Fold 6: ACC=0.6596, PREC=0.7576, REC=0.7576, F1=0.7576, AUC=0.5931
Fold 7: ACC=0.6739, PREC=0.7812, REC=0.7576, F1=0.7692, AUC=0.6096
Fold 8: ACC=0.6087, PREC=0.7419, REC=0.6970, F1=0.7188, AUC=0.5408
Fold 9: ACC=0.7174, PREC=0.8125, REC=0.7879, F1=0.8000, AUC=0.6632
Fold 10: ACC=0.6304, PREC=0.7353, REC=0.7576, F1=0.7463, AUC=0.5326

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.6418
Precision: 0.7529
Recall:    0.7414
F1-score:  0.7464
AUC:       0.5666


### Integrated features

In [30]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

# =========================
# STEP 0 — CLEANING
# =========================

def replace_outliers_with_median(df):
    df_out = df.copy()
    for col in df_out.columns:
        if np.issubdtype(df_out[col].dtype, np.number):

            mean = df_out[col].mean()
            std = df_out[col].std()
            if std == 0:  # avoid division by zero
                continue

            z = (df_out[col] - mean) / std
            median = df_out[col].median()

            df_out.loc[np.abs(z) > 3, col] = median
    return df_out

# Apply outlier replacement
df_cleaned = replace_outliers_with_median(df_cleaned)

# =========================
# STEP 1 — TRAIN TEST SPLIT
# =========================
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=df_cleaned["Sickness"],
    random_state=42
)

X_train = train.drop("Sickness", axis=1)
y_train = train["Sickness"]
X_test = test.drop("Sickness", axis=1)
y_test = test["Sickness"]

# =========================
# STEP 2 — CV SETUP
# =========================
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_results = []

# =========================
# STEP 3 — CROSS VALIDATION
# =========================
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]

    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # --- Standard scaling ---
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # --- PCA (retain 95% variance) ---
    pca = PCA(n_components=0.95, svd_solver="full")
    X_tr_pca = pca.fit_transform(X_tr_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    # --- Factor Analysis (paper uses 7 factors) ---
    fa = FactorAnalysis(n_components=7, random_state=42)
    X_tr_fa = fa.fit_transform(X_tr_scaled)
    X_val_fa = fa.transform(X_val_scaled)

    # --- LDA (1 component for binary classification) ---
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_tr_lda = lda.fit_transform(X_tr_scaled, y_tr)
    X_val_lda = lda.transform(X_val_scaled)

    # --- CONCATENATE FEATURES ---
    X_tr_final = np.concatenate([X_tr_pca, X_tr_fa, X_tr_lda], axis=1)
    X_val_final = np.concatenate([X_val_pca, X_val_fa, X_val_lda], axis=1)

    # --- CLASSIFIER (Random Forest or Voting) ---
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr_final, y_tr)

    y_pred = clf.predict(X_val_final)

    # --- METRICS ---
    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    auc  = roc_auc_score(y_val, y_pred)

    print(f"Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

    fold_results.append([acc, prec, rec, f1, auc])

# =========================
# STEP 4 — AVERAGE RESULTS
# =========================
fold_results = np.array(fold_results)
print("\n=== 10-FOLD CV MEAN METRICS ===")
print(f"Accuracy:  {fold_results[:,0].mean():.4f}")
print(f"Precision: {fold_results[:,1].mean():.4f}")
print(f"Recall:    {fold_results[:,2].mean():.4f}")
print(f"F1-score:  {fold_results[:,3].mean():.4f}")
print(f"AUC:       {fold_results[:,4].mean():.4f}")



Fold 1: ACC=0.7234, PREC=0.7692, REC=0.8824, F1=0.8219, AUC=0.5950
Fold 2: ACC=0.7234, PREC=0.7561, REC=0.9118, F1=0.8267, AUC=0.5713
Fold 3: ACC=0.8085, PREC=0.7907, REC=1.0000, F1=0.8831, AUC=0.6538
Fold 4: ACC=0.6809, PREC=0.7250, REC=0.8788, F1=0.7945, AUC=0.5465
Fold 5: ACC=0.7872, PREC=0.7805, REC=0.9697, F1=0.8649, AUC=0.6634
Fold 6: ACC=0.7447, PREC=0.8182, REC=0.8182, F1=0.8182, AUC=0.6948
Fold 7: ACC=0.6957, PREC=0.7568, REC=0.8485, F1=0.8000, AUC=0.5781
Fold 8: ACC=0.6304, PREC=0.7353, REC=0.7576, F1=0.7463, AUC=0.5326
Fold 9: ACC=0.7826, PREC=0.8108, REC=0.9091, F1=0.8571, AUC=0.6853
Fold 10: ACC=0.5435, PREC=0.6579, REC=0.7576, F1=0.7042, AUC=0.3788

=== 10-FOLD CV MEAN METRICS ===
Accuracy:  0.7120
Precision: 0.7600
Recall:    0.8734
F1-score:  0.8117
AUC:       0.5900
