In [None]:
import scipy.io as sio
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score, auc
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.utils import resample
from sklearn.model_selection import LeaveOneOut
import numpy as np
import plotly.graph_objects as go
import xgboost as xgb

In [None]:
def ml_classifier(input_table, target, n_iterations):
    """
    Estimate XGBoost classification performance with bootstrapped leave-one-out cross-validation.

    For every bootstrap iteration a stratified sample (drawn with replacement) of 90% of the
    rows is taken. Each distinct source row in that sample is then held out once: a fresh
    XGBoost classifier is fitted on the remaining rows and used to predict the held-out row.
    Because the bootstrap sample contains repeated rows, ALL copies of the held-out row are
    removed from the training fold; otherwise the model would be trained on an exact duplicate
    of the row it is tested on and the AUC would be optimistically biased.

    Args:
        input_table (pd.DataFrame): Input dataset containing features and class labels.
        target (string): Column to predict.
        n_iterations (int): Number of bootstrap iterations to perform.

    Returns:
        dict: ``{'main': {...}}`` where 'auc', 'fpr', 'tpr', 'thresholds' and
        'feature_importances' are lists holding one entry per bootstrap iteration.
    """
    # A dedicated generator seeded with 42 reproduces the same bootstrap draws on every call
    # without reseeding NumPy's global random state as a side effect.
    rng = np.random.RandomState(42)

    # Bootstrap 90% of the sample size each time
    n_size = int(len(input_table) * 0.9)

    # Initialize variables
    metrics = ['auc', 'fpr', 'tpr', 'thresholds', 'feature_importances']
    results = {'main': {m: [] for m in metrics}}

    class_labels = input_table[target].values
    row_positions = np.arange(len(input_table))

    # Loop through subsamples
    for _ in tqdm(range(n_iterations), desc="Bootstrap iterations"):
        # Resample row positions (not rows) so duplicated rows can be traced back to their source
        sampled_rows = resample(row_positions, n_samples=n_size, stratify=class_labels, random_state=rng)
        subsampled_data = input_table.iloc[sampled_rows]
        y = subsampled_data[target].values
        X = subsampled_data.drop(columns=[target]).values

        labels = []
        probabilities = []
        importances = []
        fold_sizes = []

        # Leave one source row out at a time, together with all of its bootstrap copies
        for source_row in np.unique(sampled_rows):
            held_out = sampled_rows == source_row

            clf = xgb.XGBClassifier()
            clf.fit(X[~held_out], y[~held_out])

            labels.append(y[held_out])
            probabilities.append(clf.predict_proba(X[held_out])[:, 1])
            importances.append(clf.feature_importances_)
            fold_sizes.append(held_out.sum())

        # Flatten the per-fold pieces into 1-D arrays, the shape the ROC helpers expect
        labels = np.concatenate(labels)
        probabilities = np.concatenate(probabilities)

        fpr, tpr, thresholds = roc_curve(labels, probabilities)
        results['main']['fpr'].append(fpr)
        results['main']['tpr'].append(tpr)
        results['main']['thresholds'].append(thresholds)
        results['main']['auc'].append(roc_auc_score(labels, probabilities))
        # Weight each fold by its number of held-out rows so every bootstrap row counts once
        results['main']['feature_importances'].append(np.average(importances, axis=0, weights=fold_sizes))

    return results

def plot_roc_all_features(results, n_iterations):
    """
    Plot the mean ROC curve over all bootstrap iterations with a pointwise confidence band.

    Every per-iteration ROC curve is interpolated onto a common, evenly spaced FPR grid; the
    mean TPR and a band of +/- 1.96 standard deviations (clipped to [0, 1]) are then drawn.

    Args:
        results (dict): Output of ml_classifier; ``results['main']`` must provide the lists
            'fpr', 'tpr' and 'auc' (one entry per iteration).
        n_iterations (int): Number of evenly spaced FPR points on [0, 1] used for the
            interpolation grid (at least 2 points are always used).

    Returns:
        None: The figure is displayed with ``fig.show()``; nothing is written to disk.

    Raises:
        ValueError: If ``results`` contains no ROC curves.
    """
    # Set plot parameters
    colors = {
        'filla': 'rgba(52, 152, 219, 0.2)',
        'linea': 'rgba(52, 152, 219, 0.5)',
        'maina': 'rgba(41, 128, 185, 1.0)',
        'grid': 'rgba(189, 195, 199, 0.5)'
    }

    fpr_curves = results['main']['fpr']
    tpr_curves = results['main']['tpr']
    if len(fpr_curves) == 0:
        raise ValueError("results contains no ROC curves to plot")

    # A grid needs both end points (0 and 1), hence the lower bound of 2
    fpr_mean = np.linspace(0, 1, max(int(n_iterations), 2))

    # Interpolate every stored curve, independent of the grid size, so the band is built from
    # the same iterations that enter the mean AUC below
    interp_tprs = []
    for fpr, tpr in zip(fpr_curves, tpr_curves):
        interp_tpr = np.interp(fpr_mean, fpr, tpr)
        interp_tpr[0] = 0.0  # force every curve through the origin
        interp_tprs.append(interp_tpr)

    tpr_mean = np.mean(interp_tprs, axis=0)
    tpr_mean[-1] = 1.0

    # Calculate confidence bands; both bounds are clipped because a TPR lies in [0, 1]
    tpr_ci = np.std(interp_tprs, axis=0) * 1.96
    tpr_upper = np.clip(tpr_mean + tpr_ci, 0, 1)
    tpr_lower = np.clip(tpr_mean - tpr_ci, 0, 1)

    mean_auc = np.mean(results['main']['auc'])

    # NOTE(review): the bracket in the legend is the mean height of the lower/upper TPR band
    # (roughly the area under each band), not a confidence interval of the AUC values.
    plot_data = [
        go.Scatter(x=fpr_mean, y=tpr_upper, line=dict(color=colors['linea'], width=1), hoverinfo="skip", showlegend=False, name='upper'),
        go.Scatter(x=fpr_mean, y=tpr_lower, fill='tonexty', fillcolor=colors['filla'], line=dict(color=colors['linea'], width=1), hoverinfo="skip", showlegend=False, name='lower'),
        go.Scatter(x=fpr_mean, y=tpr_mean, line=dict(color=colors['maina'], width=2), hoverinfo="skip", showlegend=True, name=f'AUC = {mean_auc:.3f} [{tpr_lower.mean():.3f} {tpr_upper.mean():.3f}]')
    ]

    fig = go.Figure(plot_data)
    # Chance-level diagonal
    fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

    fig.update_layout(
        template='plotly_white',
        title_x=0.5,
        xaxis_title="1 - Specificity",
        yaxis_title="Sensitivity",
        width=600,
        height=600,
        legend=dict(yanchor="bottom", xanchor="right", x=0.95, y=0.01),
        font=dict(family="Arial", size=22, color="black")
    )

    fig.update_yaxes(range=[0, 1], gridcolor=colors['grid'], scaleanchor="x", scaleratio=1, linecolor='black')
    fig.update_xaxes(range=[0, 1], gridcolor=colors['grid'], constrain='domain', linecolor='black')

    fig.show()

def plot_feature_importances(results, input_table, target):
    """
    Draw a horizontal bar chart of the mean feature importances returned by ml_classifier.

    Bars are ordered by ascending mean importance and carry error bars equal to the standard
    deviation across iterations divided by the square root of the number of iterations.

    Args:
        results (dict): The results dictionary from ml_classifier.
        input_table (pd.DataFrame): Input dataset; its non-target columns supply the bar labels.
        target (string): Target column.

    Returns:
        None: Shows the feature importance plot.
    """
    per_iteration = results['main']['feature_importances']

    # Mean and standard error across iterations, one value per feature
    mean_importance = np.mean(per_iteration, axis=0)
    sem = np.std(per_iteration, axis=0) / np.sqrt(len(per_iteration))

    # Feature labels: every non-target column, with "_" swapped for "-"
    raw_names = input_table.drop(columns=[target]).columns.tolist()
    labels = [raw.replace("_", "-") for raw in raw_names]

    # Ascending order of mean importance
    order = mean_importance.argsort()

    # Horizontal bars: feature names on the y-axis, importance values on the x-axis
    bars = go.Bar(
        y=[labels[k] for k in order],
        x=mean_importance[order],
        orientation='h',
        error_x=dict(type='data', array=sem[order], visible=True)
    )

    fig = go.Figure()
    fig.add_trace(bars)

    # Layout configuration
    fig.update_layout(
        xaxis_title="Feature Importance",
        yaxis_title="Features",
        template='plotly_white',
        width=600,
        height=600,
        font=dict(family="Arial", size=22, color="black")
    )

    fig.show()

In [None]:
# Read the table of features of interest (expected to be free of NaNs);
# the first CSV column is used as the row index
groups_input = pd.read_csv(
    './Group_features.csv',
    index_col=0,
)
groups_input

In [None]:
# Drop the unnecessary columns.
# errors='ignore' keeps this cell safe to re-run: once 'ID' has been removed, a second
# execution would otherwise fail with a KeyError.
groups_input = groups_input.drop(columns='ID', errors='ignore')
groups_input

In [None]:
# Select the groups to be compared and encode the class names as integers (ADNoEp = 1, HC = 0).
# Only the 'Class' column is mapped, and it is cast to int explicitly, so no other column is
# touched and the label dtype does not depend on pandas' implicit downcasting in `replace`.
class_codes = {'ADNoEp': 1, 'HC': 0}
filtered_df = (
    groups_input
    .loc[groups_input['Class'].isin(list(class_codes))]
    .assign(Class=lambda frame: frame['Class'].map(class_codes).astype(int))
)
filtered_df

In [None]:
# How many bootstrap repetitions to run
n_iterations = 1_000

results_adnoep_hc = ml_classifier(
    input_table=filtered_df,
    target="Class",
    n_iterations=n_iterations,
)

In [None]:
# Mean ROC curve for the results stored in results_adnoep_hc
plot_roc_all_features(results=results_adnoep_hc, n_iterations=n_iterations)

In [None]:
# Feature importances for the results stored in results_adnoep_hc
plot_feature_importances(results=results_adnoep_hc, input_table=filtered_df, target="Class")

In [None]:
# Select the groups to be compared and encode the class names as integers (ADEp = 1, ADNoEp = 0).
# Only the 'Class' column is mapped, and it is cast to int explicitly, so no other column is
# touched and the label dtype does not depend on pandas' implicit downcasting in `replace`.
class_codes = {'ADEp': 1, 'ADNoEp': 0}
filtered_df = (
    groups_input
    .loc[groups_input['Class'].isin(list(class_codes))]
    .assign(Class=lambda frame: frame['Class'].map(class_codes).astype(int))
)
filtered_df

In [None]:
# How many bootstrap repetitions to run
n_iterations = 1_000

results_adnoep_adep = ml_classifier(
    input_table=filtered_df,
    target="Class",
    n_iterations=n_iterations,
)

In [None]:
# Mean ROC curve for the results stored in results_adnoep_adep
plot_roc_all_features(results=results_adnoep_adep, n_iterations=n_iterations)

In [None]:
# Feature importances for the results stored in results_adnoep_adep
plot_feature_importances(results=results_adnoep_adep, input_table=filtered_df, target="Class")