# In [None]:
# Decision Tree
# ==============================================================
# Lyrebird Optimization Algorithm (LOA)
# for Feature Selection using Decision Tree on the PC4 Dataset (the CSV loaded below)
# Includes: Convergence Curve, Error Plot, ROC & AUC Comparison
# ==============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve
)

# ==============================================================
# Load and Prepare Dataset
# ==============================================================

# ✅ Change this path to your dataset location
data = pd.read_csv(r"/content/PC4_csv.csv")

print("âœ… Dataset Loaded Successfully!")
print(f"Shape: {data.shape}")
print(data.head())

# Separate target column: the last column is the label, everything before it
# is a feature.
y = data.iloc[:, -1]
X = data.iloc[:, :-1]

# Drop 'id' column if it exists.  X is a slice of `data`, so rebind to a new
# frame rather than dropping in place (in-place mutation of a slice can
# trigger pandas' SettingWithCopyWarning).
if 'id' in X.columns:
    X = X.drop(columns='id')

# Split data first so the scaler below never sees the held-out rows.
# The partition depends only on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize data: learn min/max from the training rows only, then apply that
# same transform to every row.  Test values may therefore fall slightly
# outside [0, 1].
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train_raw)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Pick the scaled rows belonging to each side of the split, in split order.
# NOTE: relies on the row index being unique (true for read_csv's default index).
X_train = X_scaled.loc[X_train_raw.index]
X_test = X_scaled.loc[X_test_raw.index]

Dim = X_train.shape[1]

# ==============================================================
# Define Lyrebird Optimization Algorithm (LOA)
# ==============================================================

def fitness_function(features):
    """Score a binary feature mask by Decision Tree misclassification rate.

    Positions where ``features`` equals 1 are the columns kept.  A tree is
    fitted on those columns of the module-level ``X_train``/``y_train`` and
    scored on the module-level ``X_test``/``y_test``.

    Returns ``1 - accuracy`` (lower is better), or ``1.0`` when the mask
    selects no column at all.
    """
    chosen = np.nonzero(features == 1)[0]
    if chosen.size == 0:
        # Nothing to train on: report the worst possible error.
        return 1.0

    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train.iloc[:, chosen], y_train)
    accuracy = accuracy_score(y_test, tree.predict(X_test.iloc[:, chosen]))
    return 1 - accuracy

def LOA_feature_selection(X_train, y_train, X_test, y_test, Dim, MaxIter=200, N=30, seed=None):
    """Lyrebird Optimization Algorithm for Feature Selection.

    Searches over 0/1 feature masks for the subset on which a
    ``DecisionTreeClassifier(random_state=42)`` fitted on ``X_train`` makes
    the fewest mistakes on ``X_test``.  The data passed in is the data that
    is scored; no module-level split is consulted.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; columns are addressed by position via ``.iloc``.
    y_train, y_test : array-like
        Labels matching the rows of ``X_train`` / ``X_test``.
    Dim : int
        Length of every mask (number of candidate features).
    MaxIter : int, default 200
        Number of optimisation iterations.
    N : int, default 30
        Population size.
    seed : int or None, default None
        When given, a private ``numpy.random.RandomState(seed)`` drives the
        search so runs are repeatable.  When None, the module-level
        ``numpy.random`` functions are used, as before.

    Returns
    -------
    Best_P : numpy.ndarray
        Best mask found (1 = feature kept).
    Best_FF : float
        Error (``1 - accuracy``) of ``Best_P``.
    Conv_curve : numpy.ndarray of shape (MaxIter,)
        Best error found so far after each iteration.
    Avg_error_curve : numpy.ndarray of shape (MaxIter,)
        Mean error of the candidates generated in each iteration.

    Notes
    -----
    Masks are scored on ``X_test``/``y_test``.  If that is also the final
    hold-out set, metrics later reported on it for the selected subset are
    optimistic, because the subset was chosen to do well on those rows.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # The tree is deterministic for a given column subset, so each distinct
    # subset only needs to be trained once.
    error_cache = {}

    def _mask_error(mask):
        """Return 1 - accuracy of a Decision Tree restricted to `mask`."""
        selected = np.nonzero(mask == 1)[0]
        if selected.size == 0:
            return 1.0  # maximum error if no feature selected
        key = selected.tobytes()
        if key not in error_cache:
            clf = DecisionTreeClassifier(random_state=42)
            clf.fit(X_train.iloc[:, selected], y_train)
            pred = clf.predict(X_test.iloc[:, selected])
            error_cache[key] = 1 - accuracy_score(y_test, pred)
        return error_cache[key]

    # Initialize population with random 0/1 masks
    X_pop = rng.randint(0, 2, (N, Dim))
    Fitness = np.zeros(N)

    # Scaling factors for the attraction, damping and noise terms
    alpha, beta, gamma = 0.5, 0.8, 0.9

    # Initialize best
    Best_FF = np.inf
    Best_P = np.zeros(Dim)

    # Evaluate initial population
    for i in range(N):
        Fitness[i] = _mask_error(X_pop[i])
        if Fitness[i] < Best_FF:
            Best_FF = Fitness[i]
            Best_P = X_pop[i].copy()

    Conv_curve = np.zeros(MaxIter)
    Avg_error_curve = np.zeros(MaxIter)

    print("\nðŸš€ LOA Optimization Started...\n")
    for t in range(MaxIter):
        errors = []
        for i in range(N):
            A = alpha * rng.uniform(-1, 1, Dim)
            B = beta * rng.uniform(0, 1, Dim)
            C = gamma * rng.uniform(0, 1)

            # Move towards the best mask with Gaussian noise, then squash
            # through a sigmoid and threshold at 0.5 to get a 0/1 mask again.
            new_pos = X_pop[i] + A * (Best_P - B * X_pop[i]) + C * rng.randn(Dim)
            new_pos = 1 / (1 + np.exp(-new_pos))
            new_pos = np.where(new_pos > 0.5, 1, 0)

            new_fit = _mask_error(new_pos)
            errors.append(new_fit)

            # Greedy selection: an individual only moves if it improves
            if new_fit < Fitness[i]:
                X_pop[i] = new_pos.copy()
                Fitness[i] = new_fit

            if new_fit < Best_FF:
                Best_FF = new_fit
                Best_P = new_pos.copy()

        Conv_curve[t] = Best_FF
        Avg_error_curve[t] = np.mean(errors)

        if t % 20 == 0 or t == MaxIter - 1:
            print(f"Iteration {t+1}/{MaxIter} -> Best Fitness: {Best_FF:.4f}")

    print("\nâœ… LOA Optimization Completed!")
    return Best_P, Best_FF, Conv_curve, Avg_error_curve

# ==============================================================
# Run LOA for Feature Selection
# ==============================================================

# Run the search on the prepared split: 150 iterations, 30 individuals.
(Best_features, Best_error,
 Conv_curve, Avg_error_curve) = LOA_feature_selection(
    X_train, y_train, X_test, y_test, Dim=Dim, MaxIter=150, N=30
)

# Column positions whose bit is set in the best mask
selected_idx = np.flatnonzero(Best_features == 1)
print(f"\nðŸŽ¯ Selected {selected_idx.size} features out of {Dim}:")
print(X_train.columns[selected_idx].tolist())

# ==============================================================
# Decision Tree Evaluation: With and Without Feature Selection
# ==============================================================

# Baseline: tree trained on every feature
clf_full = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
pred_full = clf_full.predict(X_test)
# Column 1 of predict_proba is the probability of the second entry of clf_full.classes_
prob_full = clf_full.predict_proba(X_test)[:, 1]

# Same tree settings, restricted to the columns LOA selected
clf_selected = DecisionTreeClassifier(random_state=42).fit(
    X_train.iloc[:, selected_idx], y_train
)
pred_sel = clf_selected.predict(X_test.iloc[:, selected_idx])
prob_sel = clf_selected.predict_proba(X_test.iloc[:, selected_idx])[:, 1]

# Metrics
def evaluate_model(y_true, y_pred, y_prob, pos_label='Y'):
    """Compute accuracy, precision, recall, F1 and ROC AUC for a binary model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like
        Predicted score/probability of the positive class (``pos_label``).
    pos_label : label, default 'Y'
        The label counted as the positive class for every metric.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, auc)``.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)
    # Binarise against pos_label so the AUC is measured for the same positive
    # class as precision/recall/F1, rather than whichever label sorts last.
    auc_score = roc_auc_score(np.asarray(y_true) == pos_label, y_prob)
    return acc, prec, rec, f1, auc_score

# Score both models on the held-out split
acc_full, prec_full, rec_full, f1_full, auc_full = evaluate_model(y_test, pred_full, prob_full)
acc_sel, prec_sel, rec_sel, f1_sel, auc_sel = evaluate_model(y_test, pred_sel, prob_sel)

print("\nðŸ“Š Decision Tree Performance Comparison")
print("----------------------------------------------------")
print(f"Without Feature Selection -> Accuracy: {acc_full:.4f}, AUC: {auc_full:.4f}")
print(f"With LOA Feature Selection -> Accuracy: {acc_sel:.4f}, AUC: {auc_sel:.4f}")
# These two lines report precision; they were previously mislabelled "Prediction".
print(f"Without Feature Selection -> Precision: {prec_full:.4f}, PRE:{prec_full:.4f}")
print(f"With LOA Feature Selection -> Precision : {prec_sel:.4f}, PRE: {prec_sel:.4f}")
print(f"Without Feature Selection -> Recall: {rec_full:.4f}, REC: {rec_full:.4f}")
print(f"With LOA Feature Selection -> Recall: {rec_sel:.4f}, REC: {rec_sel:.4f}")
print(f"Without Feature Selection -> F1-score: {f1_full:.4f}, F1:{f1_full:.4f}")
print(f"With LOA Feature Selection -> F1-score : {f1_sel:.4f}, F1: {f1_sel:.4f}")


# ==============================================================
# Visualization: Convergence, Error Curve, ROC Comparison
# ==============================================================

# One row of three panels: convergence, mean error, ROC comparison
fig, (ax_conv, ax_err, ax_roc) = plt.subplots(1, 3, figsize=(18, 5))

# Panel 1: best error found so far, per iteration
ax_conv.plot(Conv_curve, 'b-', linewidth=2)
ax_conv.set_title("Convergence Curve (Best Fitness)")
ax_conv.set_xlabel("Iteration")
ax_conv.set_ylabel("Fitness (Error)")
ax_conv.grid(True)

# Panel 2: mean error of the candidates generated in each iteration
ax_err.plot(Avg_error_curve, 'r-', linewidth=2)
ax_err.set_title("Average Error per Iteration")
ax_err.set_xlabel("Iteration")
ax_err.set_ylabel("Average Error")
ax_err.grid(True)

# Panel 3: ROC curves of both trees, with 'Y' as the positive class
fpr_full, tpr_full, _ = roc_curve(y_test, prob_full, pos_label='Y')
fpr_sel, tpr_sel, _ = roc_curve(y_test, prob_sel, pos_label='Y')

ax_roc.plot(fpr_full, tpr_full, 'k--', label=f'Without FS (AUC={auc_full:.4f})')
ax_roc.plot(fpr_sel, tpr_sel, 'g-', label=f'With LOA FS (AUC={auc_sel:.4f})')
ax_roc.plot([0, 1], [0, 1], 'r--')  # chance diagonal
ax_roc.set_title("ROC Curve Comparison (Decision Tree)")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.legend()
ax_roc.grid(True)

fig.tight_layout()
plt.show()