In [None]:
#imports
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

from helpers.datasetHelper import get_samples, split_healthy_data
from imblearn.over_sampling import SMOTE
from sklearn.metrics import cohen_kappa_score

from helpers.metaheuristics import run_pso_with_progress, run_ga_with_progress
from models import MyXGboost
import numpy as np

In [None]:

# --- Load the raw samples -------------------------------------------------
directory_path = './datasets'
healthy_csv_path = os.path.join(directory_path, 'DT.Healthy.csv')
data_health = get_samples(healthy_csv_path)

# Split the loaded samples into the three cohorts returned by the helper.
healthy_cases, prebrca_cases, cancer_cases = split_healthy_data(data_health)

# Turn every cohort into a DataFrame and append its class tag as the last column.
healthy_cases = pd.DataFrame(healthy_cases).assign(Tag='HEALTHY')
prebrca_cases = pd.DataFrame(prebrca_cases).assign(Tag='PRE-BRCA')
cancer_cases = pd.DataFrame(cancer_cases).assign(Tag='BRCA')

print("Data loaded successfully.")

# --- Build the feature matrix and the target ------------------------------
# Stack the cohorts into one frame (blood samples); the last column is the tag.
cohort_frames = [healthy_cases, prebrca_cases, cancer_cases]
df_cancer = pd.concat(cohort_frames, ignore_index=True)

# Everything except the tag column is a feature; non-numeric cells become NaN.
X = df_cancer.iloc[:, :-1].apply(pd.to_numeric, errors='coerce')
Y = df_cancer.iloc[:, -1]

# Names come from the first entry of the raw data, minus its last element.
# NOTE(review): assumes data_health[0] is the header row - confirm in get_samples.
feature_names = np.array(data_health[0][:-1])

# Replace each missing value with the minimum observed in the same column (CpG site).
X = X.fillna(X.min())

n_features = X.shape[1]
print(f"Loaded dataset with {n_features} features and {len(Y)} samples")

# Estimator handed to the PSO / GA feature-selection runs below.
estimator = MyXGboost.DecisionTreeMultiClass()

In [None]:
# Particle-swarm feature selection over every column of X.
# The hyper-parameters are grouped so they are easy to spot and tune.
pso_options = {
    'swarmsize': 1000,
    'maxiter': 100,
    'threshold': 0.7,
}
best_weights, best_fitness, progress, X_selected = run_pso_with_progress(
    X, Y, estimator, n_features, **pso_options
)


In [None]:

# Keep only the columns chosen by PSO and look up their names.
X_selected_pso = X.iloc[:, X_selected]
selected_feature_names = feature_names[X_selected]

print(f"Done PSO → best fitness = {best_fitness:.4f}")
print(f"Number of selected features: {len(selected_feature_names)}")
# The values printed here are feature names, not positional indices.
print(f"Selected feature names: {selected_feature_names[:10]}...")  # Show first 10

In [None]:

# Genetic-algorithm feature selection over every column of X.
best_weights_ga, best_fitness_ga, progress_ga, X_selected_proc = run_ga_with_progress(
    X, Y, estimator, X.shape[1],
    pop_size=5, n_generations=40, threshold=0.9
)

# Keep only the columns chosen by the GA and look up their names.
X_selected_ga = X.iloc[:, X_selected_proc]
selected_feature_names_ga = feature_names[X_selected_proc]

print(f"Done GA → best fitness = {best_fitness_ga:.4f}")
print(f"Number of selected features: {len(selected_feature_names_ga)}")
# The values printed here are feature names, not positional indices.
print(f"Selected feature names: {selected_feature_names_ga[:10]}...")  # Show first 10


In [None]:

# Single seed shared by the splits and by SMOTE so the whole cell is reproducible.
RANDOM_STATE = 42

# Encode the string class tags as integers.
# NOTE: LabelEncoder stores classes_ in sorted order, so the resulting codes are
# BRCA=0, HEALTHY=1, PRE-BRCA=2 - the order of the list passed to fit() is ignored.
label_encoder = LabelEncoder()
ordered_labels = ['HEALTHY', 'PRE-BRCA', 'BRCA']
label_encoder.fit(ordered_labels)
Y_encoded = label_encoder.transform(Y)

print(f"Encoded target classes: {label_encoder.classes_}")

# NOTE(review): PSO and GA were run on the full X / Y before this split, so the
# test rows already influenced which features were selected. Scores for the
# GA / PSO feature sets are therefore optimistic.

# The same seed and the same stratification are used for all three splits, so
# every feature set is evaluated on the same train / test rows. Stratifying
# keeps each class represented in both parts of the imbalanced data set.
# 1) evaluate with all features
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X, Y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=Y_encoded
)
# 2) evaluate with GA selected features
X_train_ga, X_test_ga, y_train_ga, y_test_ga = train_test_split(
    X_selected_ga, Y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=Y_encoded
)
# 3) evaluate with PSO selected features
X_train_pso, X_test_pso, y_train_pso, y_test_pso = train_test_split(
    X_selected_pso, Y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=Y_encoded
)

# Oversample the minority classes of the TRAINING parts only (the test parts stay
# untouched). A fixed random_state makes the synthetic samples reproducible.
smote = SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE, k_neighbors=5)
X_train_all, y_train_all = smote.fit_resample(X_train_all, y_train_all)
X_train_ga, y_train_ga = smote.fit_resample(X_train_ga, y_train_ga)
X_train_pso, y_train_pso = smote.fit_resample(X_train_pso, y_train_pso)

# Models to compare; the evaluation loop uses element [0] of each factory's result.
modes = [
    {
        'Name': 'XGBoost',
        'Model': MyXGboost.XGBoostMultiClass()
    },
    {
        'Name': 'LightGBM',
        'Model': MyXGboost.LightGBMMulticlass()
    },
    {
        'Name': 'RandomForest300',
        'Model': MyXGboost.RandomForest300()
    },
    {
        'Name': 'GradientBoosting',
        'Model': MyXGboost.GradientBoosting()
    }
]

# Print dimensions of the (balanced) training datasets
print("Training data dimensions:")
print(f"Original data (X_train_all): {X_train_all.shape}")
print(f"GA selected features (X_train_ga): {X_train_ga.shape}")
print(f"PSO selected features (X_train_pso): {X_train_pso.shape}")


In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import plotly.express as px

def display_confusion_matrix(estimator, X_test, y_test, class_names=None,
                             title="Confusion Matrix (Pink Variants)"):
    """Plot a row-normalised confusion matrix (in percent) for a fitted estimator.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True labels, encoded as integers ``0 .. n_classes - 1``.
    class_names : sequence of str, optional
        Display name of every class, ordered by integer code. Defaults to
        ``label_encoder.classes_`` from the notebook namespace.
    title : str, optional
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure that was shown, for further customisation or saving.
    """
    if class_names is None:
        class_names = label_encoder.classes_

    # Get predictions
    y_pred = estimator.predict(X_test)

    # Pass the full list of class codes explicitly: without it, a class that is
    # absent from both y_test and y_pred is dropped from the matrix and the
    # DataFrame construction below fails on a shape mismatch.
    class_codes = list(range(len(class_names)))
    cm = confusion_matrix(y_test, y_pred, labels=class_codes, normalize='true')

    # Convert the per-row fractions to percentages
    cm = cm * 100

    # Label rows (true class) and columns (predicted class) for plotting
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

    fig = px.imshow(
        df_cm,
        text_auto='.1f',
        color_continuous_scale=[
            "#f8bbd0",  # light pink
            "#f06292",  # medium pink
            "#ad1457"   # dark pink
        ],
        aspect="auto",
        labels=dict(x="Predicted Label", y="True Label", color="Percentage"),
        title=title
    )

    fig.show()

    return fig

In [None]:

# Feature names that match the columns of each evaluated feature set.
# They are looked up from this mapping instead of re-assigning `feature_names`,
# which would clobber the full name array for the 'All' run and for any re-run
# of the earlier cells.
feature_names_by_method = {
    'GA': selected_feature_names_ga,
    'PSO': selected_feature_names,
    'All': feature_names,
}

for feature_set in [('GA', X_train_ga, X_test_ga, y_train_ga, y_test_ga),
                    ('PSO', X_train_pso, X_test_pso, y_train_pso, y_test_pso),
                    ('All', X_train_all, X_test_all, y_train_all, y_test_all)]:

    method, X_train, X_test, y_train, y_test = feature_set
    print(f"\n=== Results for {method} selected features ===")

    feature_names_to_use = feature_names_by_method[method]

    # Hold out 20% of the training data. The split is identical for every model
    # (fixed seed), so it is computed once per feature set.
    # NOTE(review): X_val / y_val are not passed to fit(), so no early stopping
    # actually happens; the models simply train on the remaining 80%.
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    for m in modes:
        # Element [0] of the factory result is the estimator to fit.
        selector = m['Model'][0].fit(X_train_split, y_train_split)

        # Evaluate the model on the untouched test part
        y_pred = selector.predict(X_test)
        y_pred_proba = selector.predict_proba(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        # Handle binary and multiclass cases for ROC AUC
        if y_pred_proba.shape[1] == 2:
            roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
        else:
            roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

        print(f"\nModel: {m['Name']}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"ROC AUC: {roc_auc:.4f}")

        # Compute Kappa index
        kappa = cohen_kappa_score(y_test, y_pred)
        print(f"Kappa index: {kappa:.4f}")

        # Confusion matrix
        display_confusion_matrix(selector, X_test, y_test)

        # Feature importance (if available)
        if hasattr(selector, 'feature_importances_'):
            importances = selector.feature_importances_
            # Positions of the 20 largest importances, most important first
            indices = np.argsort(importances)[::-1][:20]
            print("\nTop 20 important features:")

            for rank, idx in enumerate(indices, 1):
                # Only print a name if idx is within bounds
                if idx < len(feature_names_to_use):
                    print(f"{rank}. {feature_names_to_use[idx]}: {importances[idx]:.4f}")
                else:
                    print(f"{rank}. [Index {idx} out of bounds]")
        else:
            print("\nThis model does not provide feature importances.")
        print("-" * 80)

In [None]:
from sklearn.metrics import roc_curve, auc
import pandas as pd

# Refit one LightGBM model on a single feature set; the ROC cell below plots
# its predictions. To use another feature set, swap in the matching
# (X_train_*, X_test_*, y_train_*, y_test_*) variables, e.g. *_ga or *_pso.
X_train_split = X_train_all
X_test = X_test_all
y_train_split = y_train_all
# Set the test labels explicitly instead of relying on whatever value the
# evaluation loop above happened to leave in `y_test`.
y_test = y_test_all

# Element [0] of the factory result is the estimator to fit.
selector = MyXGboost.LightGBMMulticlass()[0].fit(X_train_split, y_train_split)
# Evaluate the model
y_pred = selector.predict(X_test)
y_pred_proba = selector.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
# Handle binary and multiclass cases for ROC AUC
if y_pred_proba.shape[1] == 2:
    roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
else:
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

print(f"\nModel: {selector.__class__.__name__}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Compute Kappa index
kappa = cohen_kappa_score(y_test, y_pred)
print(f"Kappa index: {kappa:.4f}")

In [None]:
import numpy as np

# Print per-class sample counts; the labels do not depend on the selected feature set (All / GA / PSO), so one set is enough.

def print_class_distribution(y, label_encoder, dataset_name):
    """Print how many samples of each class appear in ``y``.

    Parameters
    ----------
    y : array-like
        Encoded class labels.
    label_encoder : object
        Provides ``inverse_transform`` to map the encoded labels back to
        their display names.
    dataset_name : str
        Name shown in the header line of the report.
    """
    codes, frequencies = np.unique(y, return_counts=True)
    print(f"\nClass distribution in {dataset_name}:")
    class_names = label_encoder.inverse_transform(codes)
    # One indented line per class, in the order returned by np.unique
    for class_name, n_samples in zip(class_names, frequencies):
        print(f"  {class_name}: {n_samples}")

# Y_encoded holds every sample (before the train/test split), so it is reported
# as the full dataset rather than as the training set. y_train_all is the
# training part after SMOTE resampling; y_test_all is the untouched test part.
print_class_distribution(Y_encoded, label_encoder, "Full dataset")
print_class_distribution(y_train_all, label_encoder, "Train (after SMOTE)")
print_class_distribution(y_test_all, label_encoder, "Test")

In [None]:
import plotly.express as px

# One-vs-rest ROC curve and AUC for every class, using the predictions
# (y_test, y_pred_proba) produced by the single-model evaluation cell above.
# NOTE(review): assumes column i of y_pred_proba is the probability of the
# class encoded as i - confirm against selector.classes_.

n_classes = y_pred_proba.shape[1]
fpr = dict()
tpr = dict()
roc_auc_dict = dict()
roc_data = []

for i in range(n_classes):
    # pos_label=i treats class i as positive and every other class as negative
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_proba[:, i], pos_label=i)
    roc_auc_dict[i] = auc(fpr[i], tpr[i])
    class_name = label_encoder.classes_[i]
    n_points = len(fpr[i])
    roc_data.append(pd.DataFrame({
        "fpr": fpr[i],
        "tpr": tpr[i],
        "class": [class_name] * n_points,
        "auc": [roc_auc_dict[i]] * n_points,
        # Legend entry carries the AUC so it is visible without hovering
        "curve": [f"{class_name} (AUC = {roc_auc_dict[i]:.3f})"] * n_points
    }))

roc_df = pd.concat(roc_data, ignore_index=True)

fig = px.line(
    roc_df,
    x="fpr",
    y="tpr",
    color="curve",
    line_dash="curve",
    title="ROC Curve (One-vs-Rest, Plotly)",
    labels={"fpr": "False Positive Rate", "tpr": "True Positive Rate", "curve": "Class"},
    hover_data=["class", "auc"]
)
# Straight segments between the operating points: spline smoothing would draw
# a curve the classifier never achieved and can overshoot the [0, 1] range.
fig.update_traces(mode='lines+markers', line_shape='linear')
# Diagonal = performance of a random classifier
fig.add_shape(
    type="line",
    x0=0, y0=0, x1=1, y1=1,
    line=dict(color="black", dash="dash"),
    name="Random"
)

fig.update_layout(
    legend_title_text="Class",
    width=800,
    height=600
)
fig.show()