In [None]:
import scipy.io as sio
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score, auc
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.utils import resample
from sklearn.model_selection import LeaveOneOut, GridSearchCV
import numpy as np
import plotly.graph_objects as go
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Mounted at /content/drive


In [None]:
def ml_classifier(input_table, target, n_iterations, classifier_type='xgb'):
    """
    Train a binary classifier (XGBoost, Lasso, ElasticNet, Ridge) and estimate its
    out-of-sample ROC/AUC with leave-one-subject-out cross-validation nested inside
    a stratified bootstrap.

    For every bootstrap iteration a stratified sample (with replacement) of 90% of
    the rows is drawn. Each distinct source row in that sample is held out in turn,
    *together with all of its bootstrap copies*, a model is trained on the remaining
    rows and the held-out row is scored. Features are standardised with statistics
    computed on the training fold only.

    Args:
        input_table (pd.DataFrame): Input dataset containing features and class labels.
        target (string): Column to predict (binary; the probability of the second
            class returned by ``predict_proba`` is used as the score).
        n_iterations (int): Number of bootstrap iterations to perform.
        classifier_type (string): Type of classifier to use ('xgb', 'lasso', 'elasticnet', 'ridge').

    Returns:
        dict: ``{'main': {...}}`` where the inner dict maps 'auc', 'fpr', 'tpr',
        'thresholds' and 'feature_importances' to lists holding one entry per
        bootstrap iteration. 'feature_importances' is the mean over folds of the
        XGBoost importances or of the absolute logistic-regression coefficients.

    Raises:
        ValueError: If ``classifier_type`` is not supported, or if ``input_table``
            does not contain at least two classes with at least two rows each.
        RuntimeError: If no usable bootstrap sample could be drawn (a class is
            too rare to be represented by two distinct rows in the sample).

    Classifier Details:
        'xgb': XGBoost is a gradient boosting algorithm.
               Advantages: Handles missing values, non-linear data, and performs well in many scenarios.
               Disadvantages: Might overfit on noisy data, requires careful tuning.

        'lasso': Lasso classification uses Logistic Regression with L1 penalty.
                 Advantages: Performs feature selection by pushing less important features' coefficients to zero.
                 Disadvantages: Can't select more features than samples.

        'elasticnet': ElasticNet classification uses a combination of L1 and L2 penalties.
                      Advantages: Can learn a sparse model where few features are influential, and can select more features than samples.
                      Disadvantages: Requires setting two hyperparameters.

        'ridge': Ridge classification uses Logistic Regression with L2 penalty.
                 Advantages: Mitigates the effect of multicollinearity between features.
                 Disadvantages: Doesn't perform feature selection like Lasso.

    """
    supported_types = ('xgb', 'lasso', 'elasticnet', 'ridge')
    if classifier_type not in supported_types:
        # Fail fast instead of hitting a NameError on `clf` deep inside the loop.
        raise ValueError(
            f"Unsupported classifier_type {classifier_type!r}; "
            f"expected one of {supported_types}."
        )

    # resample() is called without random_state, so it draws from NumPy's global RNG;
    # seeding it here makes the bootstrap samples reproducible.
    np.random.seed(42)

    # Bootstrap 90% of the sample size each time
    n_size = int(len(input_table) * 0.9)

    metrics = ['auc', 'fpr', 'tpr', 'thresholds', 'feature_importances']
    results = {'main': {m: [] for m in metrics}}

    # Define hyperparameter grid for C (logistic-regression variants only)
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

    # Split features/labels once; bootstrap samples are taken by row position so
    # that copies of the same source row can be recognised later.
    y_all = input_table[target].values
    X_all = input_table.drop(columns=[target]).values
    row_ids = np.arange(len(input_table))

    if np.unique(y_all).size < 2 or _min_distinct_rows_per_class(row_ids, y_all) < 2:
        raise ValueError(
            "input_table must contain at least two classes with at least two rows each."
        )

    # Upper bound on redraws of a single bootstrap sample (see the guard below).
    max_redraws = 100

    for _ in tqdm(range(n_iterations), desc="Bootstrap iterations"):
        # Draw a stratified bootstrap sample. Holding out all copies of a source row
        # must still leave both classes in the training fold, so every class needs
        # at least two distinct source rows; redraw the rare samples that do not.
        for _attempt in range(max_redraws):
            boot_ids = resample(row_ids, n_samples=n_size, stratify=y_all)
            y = y_all[boot_ids]
            if np.unique(y).size >= 2 and _min_distinct_rows_per_class(boot_ids, y) >= 2:
                break
        else:
            raise RuntimeError(
                f"Could not draw a usable bootstrap sample in {max_redraws} attempts; "
                "a class is too rare for leave-one-subject-out evaluation."
            )
        X = X_all[boot_ids]

        labels = []
        probabilities = []
        coefficients = []

        # All copies of a source row share the same training fold and the same
        # features, so the fold is fitted once and its outputs are reused.
        fold_cache = {}

        for test_pos, source_row in enumerate(boot_ids):
            if source_row not in fold_cache:
                # Exclude every copy of the held-out row: plain Leave-One-Out on a
                # bootstrap sample would train on duplicates of the test row.
                train_mask = boot_ids != source_row
                y_train = y[train_mask]

                # Data normalization, fitted on the training fold only so that the
                # held-out row does not influence the scaling statistics.
                scaler = StandardScaler().fit(X[train_mask])
                X_train = scaler.transform(X[train_mask])
                X_test = scaler.transform(X[[test_pos]])

                if classifier_type == 'xgb':
                    clf = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
                    clf.fit(X_train, y_train)
                    importance = clf.feature_importances_
                else:
                    if classifier_type == 'lasso':
                        base_clf = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced', max_iter=10000, tol=1e-6)
                    elif classifier_type == 'elasticnet':
                        base_clf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, class_weight='balanced', max_iter=10000, tol=1e-6)
                    else:  # 'ridge'
                        base_clf = LogisticRegression(penalty='l2', solver='sag', class_weight='balanced', max_iter=10000, tol=1e-6)

                    # Apply grid search on the base classifier
                    grid_clf = GridSearchCV(base_clf, param_grid, cv=3)  # Using 3-fold CV for grid search
                    grid_clf.fit(X_train, y_train)
                    clf = grid_clf.best_estimator_  # Get the best classifier
                    importance = np.abs(clf.coef_[0])

                fold_cache[source_row] = (clf.predict_proba(X_test)[0, 1], importance)

            probability, importance = fold_cache[source_row]
            labels.append(y[test_pos])
            probabilities.append(probability)
            coefficients.append(importance)

        fpr, tpr, thresholds = roc_curve(labels, probabilities)
        results['main']['fpr'].append(fpr)
        results['main']['tpr'].append(tpr)
        results['main']['thresholds'].append(thresholds)
        results['main']['auc'].append(roc_auc_score(labels, probabilities))
        results['main']['feature_importances'].append(np.mean(coefficients, axis=0))

    return results


def _min_distinct_rows_per_class(row_ids, y):
    """Return the smallest number of distinct source rows backing any class in ``y``."""
    return min(np.unique(row_ids[y == cls]).size for cls in np.unique(y))

def plot_roc_all_features(results, n_iterations):
    """
    Plot the mean bootstrap ROC curve of a classifier with a pointwise confidence band.

    Every stored ROC curve is interpolated onto a common false-positive-rate grid;
    the mean curve and a ``mean +/- 1.96 * std`` band (clipped to [0, 1]) are drawn.
    The legend reports the mean AUC together with the 2.5th and 97.5th percentiles
    of the per-iteration AUC values.

    Args:
    results (dict): A dictionary containing the results of the classifier, including 'fpr', 'tpr', and 'auc' keys
        under ``results['main']`` (one entry per bootstrap iteration).
    n_iterations (int): Number of points of the common FPR grid used for interpolation
        (at least two points are always used). All curves stored in ``results`` are plotted.

    Returns:
    None: The function displays the ROC plot.

    Raises:
    ValueError: If ``results`` holds no curves/AUC values or 'fpr' and 'tpr' differ in length.
    """
    fprs = results['main']['fpr']
    tprs = results['main']['tpr']
    aucs = results['main']['auc']
    if len(fprs) == 0 or len(aucs) == 0 or len(fprs) != len(tprs):
        raise ValueError(
            "results['main'] must contain matching, non-empty 'fpr', 'tpr' and 'auc' lists."
        )

    # Set plot parameters
    colors = {
        'filla': 'rgba(52, 152, 219, 0.2)',
        'linea': 'rgba(52, 152, 219, 0.5)',
        'maina': 'rgba(41, 128, 185, 1.0)',
        'grid': 'rgba(189, 195, 199, 0.5)',
        'annot': 'rgba(149, 165, 166, 0.5)',
        'highlight': 'rgba(192, 57, 43, 1.0)'
    }

    # A grid needs both end points (0 and 1) for the anchoring below to make sense.
    fpr_mean = np.linspace(0, 1, max(int(n_iterations), 2))

    # Interpolate every curve onto the common grid, anchoring it at (0, 0)
    interp_tprs = []
    for fpr, tpr in zip(fprs, tprs):
        curve = np.interp(fpr_mean, fpr, tpr)
        curve[0] = 0.0
        interp_tprs.append(curve)

    tpr_mean = np.mean(interp_tprs, axis=0)
    tpr_mean[-1] = 1.0

    # Pointwise confidence band; both edges are clipped because TPR lives in [0, 1]
    tpr_ci = np.std(interp_tprs, axis=0) * 1.96
    tpr_upper = np.clip(tpr_mean + tpr_ci, 0, 1)
    tpr_lower = np.clip(tpr_mean - tpr_ci, 0, 1)

    # Summarise the AUC from the bootstrap AUC distribution itself (percentile
    # interval) rather than from the average height of the TPR band.
    auc_mean = np.mean(aucs)
    auc_low, auc_high = np.percentile(aucs, [2.5, 97.5])

    plot_data = [
        go.Scatter(x=fpr_mean, y=tpr_upper, line=dict(color=colors['linea'], width=1), hoverinfo="skip", showlegend=False, name='upper'),
        go.Scatter(x=fpr_mean, y=tpr_lower, fill='tonexty', fillcolor=colors['filla'], line=dict(color=colors['linea'], width=1), hoverinfo="skip", showlegend=False, name='lower'),
        go.Scatter(x=fpr_mean, y=tpr_mean, line=dict(color=colors['maina'], width=2), hoverinfo="skip", showlegend=True, name=f'AUC = {auc_mean:.3f} [{auc_low:.3f} {auc_high:.3f}]')
    ]

    fig = go.Figure(plot_data)
    # Chance-level diagonal
    fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

    fig.update_layout(
        template='plotly_white',
        title_x=0.5,
        xaxis_title="1 - Specificity",
        yaxis_title="Sensitivity",
        width=600,
        height=600,
        legend=dict(yanchor="bottom", xanchor="right", x=0.95, y=0.01),
        font=dict(family="Arial", size=22, color="black")
    )

    fig.update_yaxes(range=[0, 1], gridcolor=colors['grid'], scaleanchor="x", scaleratio=1, linecolor='black')
    fig.update_xaxes(range=[0, 1], gridcolor=colors['grid'], constrain='domain', linecolor='black')

    fig.show()

def plot_feature_importances(results, input_table, target, top_n=10):
    """
    Plot the feature importances based on the output of ml_classifier using a horizontal bar plot.

    The importances stored for each bootstrap iteration are averaged, the ``top_n``
    features with the largest mean are kept, and they are drawn as horizontal bars
    with error bars equal to ``std / sqrt(number of iterations)``.

    Args:
        results (dict): The results dictionary from ml_classifier.
        input_table (pd.DataFrame): Input dataset (its non-target columns, in order, name the features).
        target (string): Target column.
        top_n (int): Number of most important features to display (default 10).

    Returns:
        None: Shows the feature importance plot.

    Raises:
        ValueError: If ``top_n`` is smaller than 1, if ``results`` holds no feature
            importances, or if the number of features in ``results`` does not match
            the number of non-target columns of ``input_table``.
    """
    if top_n < 1:
        # A slice of [-0:] would silently select *all* features instead of none.
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}.")

    # Access 'feature_importances' from the nested dictionary structure
    importances = results['main']['feature_importances']
    n = len(importances)
    if n == 0:
        raise ValueError("results['main']['feature_importances'] is empty.")

    feature_importance_mean = np.mean(importances, axis=0)
    feature_importance_std = np.std(importances, axis=0)

    # Calculate SEM
    # NOTE(review): n is the number of bootstrap iterations, so these error bars
    # shrink as more iterations are run - confirm this is the intended uncertainty.
    sem = feature_importance_std / np.sqrt(n)

    # Get feature names
    feature_names = input_table.drop(columns=[target]).columns.tolist()
    if len(feature_names) != len(feature_importance_mean):
        raise ValueError(
            f"results describe {len(feature_importance_mean)} features but "
            f"input_table has {len(feature_names)} non-target columns."
        )

    # Match feature name convention from the group classifier
    def rearrange_name(name):
        parts = str(name).split()
        if len(parts) < 3:
            # Name does not follow the '<prefix> <type> <feature>' pattern; keep it as is.
            return name
        type_ = parts[1]   # 'wPLI', 'AEC-c'
        feature = parts[2] # 'N2-delta', 'N2-theta'
        return f"{feature}-{type_}"

    # Rearrange names
    feature_names = [rearrange_name(name) for name in feature_names]

    # Sort ascending by importance and keep the last top_n entries, so the most
    # important feature ends up as the top bar of the horizontal plot.
    top_idx = feature_importance_mean.argsort()[-top_n:]
    feature_importance_mean = feature_importance_mean[top_idx]
    sem = sem[top_idx]
    feature_names_sorted = [feature_names[i] for i in top_idx]

    # Create a horizontal bar plot
    fig = go.Figure()

    # We use y-axis for feature names and x-axis for feature importance values in horizontal bar plots
    fig.add_trace(
        go.Bar(
            y=feature_names_sorted,
            x=feature_importance_mean,
            orientation='h',
            error_x=dict(type='data', array=sem, visible=True)
        )
    )

    # Layout configuration
    fig.update_layout(
        xaxis_title="Feature Importance",
        yaxis_title="GE features",
        template='plotly_white',
        width=600,
        height=600,
        font=dict(family="Arial", size=22, color="black")
    )

    fig.show()

In [None]:
### Read the longitudinal MoCA decline spreadsheet into a DataFrame
moca_xlsx_path = './MoCA_change_2.xlsx'
decline_df = pd.read_excel(moca_xlsx_path)
decline_df

Unnamed: 0,ID,Sbj_ID_Date,Categorization,MoCA,MoCA_DurationFollowup_Years,MoCA_Time_Coded_FirstLast,MoCA_Score_FirstVisit,MoCA_Score_LastVisit,MoCA_AnnualChange,MoCA_AnnualChange_Tertile
0,ADEX_048,ADE001_150910,ADNoEp-,25,6.411301,0.0,25,24,-0.155975,2
1,ADEX_048,ADE001_160524,ADNoEp-,21,6.411301,,25,24,-0.155975,2
2,ADEX_048,ADE001_170831,ADNoEp-,25,6.411301,,25,24,-0.155975,2
3,ADEX_048,ADE001_180607,ADNoEp-,25,6.411301,,25,24,-0.155975,2
4,ADEX_048,ADE001_190709,ADNoEp-,24,6.411301,,25,24,-0.155975,2
...,...,...,...,...,...,...,...,...,...,...
97,ADEX_031,MDU_002_180827,ADNoEp-,25,2.025000,,24,23,-0.572826,2
98,ADEX_031,MDU_002_190211,ADNoEp-,24,2.025000,,24,23,-0.572826,2
99,ADEX_031,MDU_002_190826,ADNoEp-,23,2.025000,1.0,24,23,-0.572826,2
100,ADEX_066,MDU_007_170925,ADNoEp+,12,0.750000,0.0,12,8,-5.071220,1


In [None]:
# Visit-level columns that are not needed once a single row per subject is kept
visit_level_columns = [
    'Sbj_ID_Date',
    'Categorization',
    'MoCA',
    'MoCA_DurationFollowup_Years',
    'MoCA_Time_Coded_FirstLast',
    'MoCA_Score_FirstVisit',
    'MoCA_Score_LastVisit',
    'MoCA_AnnualChange',
]

decline_df = (
    decline_df
    # Keep the first row of every subject ID
    .drop_duplicates(subset='ID', keep='first')
    # Keep only ADNoEp- and ADNoEp+ subjects (ADEp subjects are not considered)
    .loc[lambda frame: frame['Categorization'].isin(['ADNoEp-', 'ADNoEp+'])]
    .drop(columns=visit_level_columns)
)
decline_df

Unnamed: 0,ID,MoCA_AnnualChange_Tertile
0,ADEX_048,2
7,ADEX_018,2
10,ADEX_026,1
15,ADEX_005,3
20,ADEX_042,1
23,ADEX_103,3
28,ADEX_058,3
35,ADEX_107,1
40,ADEX_102,1
43,ADEX_078,2


In [None]:
## How many subjects (rows) remain for the analysis
decline_df.shape[0]

21

In [None]:
# Read the per-subject GE feature table (AEC-c and wPLI columns).
# NOTE(review): the displayed table shows empty cells (e.g. Awake columns of row 0),
# so NaNs are presumably present here and are only dropped in later cells - confirm.
ge_features_path = './all_GE_AEC_wPLI_features.csv'
decline_features = pd.read_csv(ge_features_path)
decline_features

Unnamed: 0,ID,Class,GE AEC-c N2-delta,GE wPLI N2-delta,GE AEC-c N2-theta,GE wPLI N2-theta,GE AEC-c N2-alpha,GE wPLI N2-alpha,GE AEC-c N2-beta,GE wPLI N2-beta,...,GE AEC-c REM-beta,GE wPLI REM-beta,GE AEC-c REM-gamma,GE wPLI REM-gamma,GE AEC-c Awake-delta,GE wPLI Awake-delta,GE AEC-c Awake-theta,GE wPLI Awake-theta,GE AEC-c Awake-alpha,GE wPLI Awake-alpha
0,ADEX_026,ADNoEp,0.128770,0.210655,0.083121,0.365524,0.079391,0.263201,0.080570,0.143701,...,0.059421,0.130546,0.042475,0.129062,,,,,,
1,ADEX_138,ADNoEp,0.157520,0.225476,0.088631,0.308320,0.085964,0.304171,0.062625,0.112467,...,0.037588,0.143366,0.028960,0.170873,0.267162,0.425184,0.143330,0.431266,0.100214,0.414539
2,ADEX_019,ADNoEp,0.170782,0.212405,0.084221,0.260896,0.091610,0.190766,0.064935,0.120135,...,0.047967,0.151273,0.023069,0.140361,0.098903,0.349207,0.062445,0.381927,0.067062,0.387860
3,ADEX_073,ADNoEp,0.098449,0.139083,0.077275,0.227344,0.100617,0.189084,0.129735,0.158784,...,0.078969,0.169277,0.038257,0.196619,0.205633,0.487886,0.129054,0.460271,0.085699,0.443449
4,ADEX_102,ADNoEp,0.150391,0.181301,0.085158,0.235557,0.093279,0.225444,0.084884,0.130535,...,0.066647,0.150543,0.049337,0.141984,0.078963,0.425201,0.085181,0.427421,0.099795,0.552120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,ADEX_087,HC,0.135703,0.181405,0.105062,0.251805,0.089587,0.212933,0.058670,0.115248,...,0.054493,0.109224,0.034859,0.111385,0.085306,0.329474,0.070009,0.385339,0.069846,0.401191
85,ADEX_101,HC,0.100032,0.194700,0.090840,0.249370,0.104158,0.400345,0.062218,0.132396,...,0.062848,0.171623,0.036318,0.174285,0.154294,0.398091,0.136786,0.433232,0.103932,0.416147
86,ADEX_070,HC,0.141576,0.197467,0.085023,0.212583,0.112732,0.246422,0.099114,0.140721,...,0.108359,0.173262,0.035506,0.197693,0.119004,0.435142,0.064960,0.468404,0.089017,0.443442
87,ADEX_020,HC,0.113023,0.165595,0.083541,0.234830,0.136780,0.194487,0.149657,0.152425,...,0.106216,0.162013,0.104614,0.258386,0.254564,0.320868,0.059974,0.270267,0.107796,0.328306


In [None]:
# IDs of the subjects excluded from the feature table
ids_to_remove = [
    'ADEX_025', 'ADEX_103', 'ADEX_084', 'ADEX_080',
    'ADEX_079', 'ADEX_068', 'ADEX_048',
]

# Keep every row whose ID is not in the exclusion list
decline_features = decline_features.loc[
    lambda frame: ~frame['ID'].isin(ids_to_remove)
]

decline_features

Unnamed: 0,ID,Class,GE AEC-c N2-delta,GE wPLI N2-delta,GE AEC-c N2-theta,GE wPLI N2-theta,GE AEC-c N2-alpha,GE wPLI N2-alpha,GE AEC-c N2-beta,GE wPLI N2-beta,...,GE AEC-c REM-beta,GE wPLI REM-beta,GE AEC-c REM-gamma,GE wPLI REM-gamma,GE AEC-c Awake-delta,GE wPLI Awake-delta,GE AEC-c Awake-theta,GE wPLI Awake-theta,GE AEC-c Awake-alpha,GE wPLI Awake-alpha
0,ADEX_026,ADNoEp,0.128770,0.210655,0.083121,0.365524,0.079391,0.263201,0.080570,0.143701,...,0.059421,0.130546,0.042475,0.129062,,,,,,
1,ADEX_138,ADNoEp,0.157520,0.225476,0.088631,0.308320,0.085964,0.304171,0.062625,0.112467,...,0.037588,0.143366,0.028960,0.170873,0.267162,0.425184,0.143330,0.431266,0.100214,0.414539
2,ADEX_019,ADNoEp,0.170782,0.212405,0.084221,0.260896,0.091610,0.190766,0.064935,0.120135,...,0.047967,0.151273,0.023069,0.140361,0.098903,0.349207,0.062445,0.381927,0.067062,0.387860
3,ADEX_073,ADNoEp,0.098449,0.139083,0.077275,0.227344,0.100617,0.189084,0.129735,0.158784,...,0.078969,0.169277,0.038257,0.196619,0.205633,0.487886,0.129054,0.460271,0.085699,0.443449
4,ADEX_102,ADNoEp,0.150391,0.181301,0.085158,0.235557,0.093279,0.225444,0.084884,0.130535,...,0.066647,0.150543,0.049337,0.141984,0.078963,0.425201,0.085181,0.427421,0.099795,0.552120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,ADEX_087,HC,0.135703,0.181405,0.105062,0.251805,0.089587,0.212933,0.058670,0.115248,...,0.054493,0.109224,0.034859,0.111385,0.085306,0.329474,0.070009,0.385339,0.069846,0.401191
85,ADEX_101,HC,0.100032,0.194700,0.090840,0.249370,0.104158,0.400345,0.062218,0.132396,...,0.062848,0.171623,0.036318,0.174285,0.154294,0.398091,0.136786,0.433232,0.103932,0.416147
86,ADEX_070,HC,0.141576,0.197467,0.085023,0.212583,0.112732,0.246422,0.099114,0.140721,...,0.108359,0.173262,0.035506,0.197693,0.119004,0.435142,0.064960,0.468404,0.089017,0.443442
87,ADEX_020,HC,0.113023,0.165595,0.083541,0.234830,0.136780,0.194487,0.149657,0.152425,...,0.106216,0.162013,0.104614,0.258386,0.254564,0.320868,0.059974,0.270267,0.107796,0.328306


In [None]:
## Join the decline labels with the GE features on subject ID, then drop the
## identifier and group columns so only the target and the features remain
df = (
    decline_df
    .merge(decline_features, on='ID')
    .drop(columns=['ID', 'Class'])
)
df

Unnamed: 0,MoCA_AnnualChange_Tertile,GE AEC-c N2-delta,GE wPLI N2-delta,GE AEC-c N2-theta,GE wPLI N2-theta,GE AEC-c N2-alpha,GE wPLI N2-alpha,GE AEC-c N2-beta,GE wPLI N2-beta,GE AEC-c N2-gamma,...,GE AEC-c REM-beta,GE wPLI REM-beta,GE AEC-c REM-gamma,GE wPLI REM-gamma,GE AEC-c Awake-delta,GE wPLI Awake-delta,GE AEC-c Awake-theta,GE wPLI Awake-theta,GE AEC-c Awake-alpha,GE wPLI Awake-alpha
0,2,0.20762,0.155513,0.100038,0.221858,0.096494,0.222866,0.06444,0.094509,0.028792,...,,,,,0.098278,0.315736,0.08512,0.298031,0.094722,0.278329
1,1,0.12877,0.210655,0.083121,0.365524,0.079391,0.263201,0.08057,0.143701,0.052161,...,0.059421,0.130546,0.042475,0.129062,,,,,,
2,3,0.227991,0.155559,0.138373,0.190773,0.107806,0.188122,0.068478,0.150932,0.032448,...,0.059307,0.154892,0.026183,0.129226,0.116579,0.291098,0.101767,0.270707,0.098217,0.238577
3,1,0.194273,0.188231,0.09898,0.299543,0.083024,0.237033,0.098423,0.144047,0.035543,...,0.05436,0.134696,0.026734,0.140236,0.107297,0.439166,0.06855,0.391113,0.066035,0.34719
4,3,0.173757,0.158966,0.10073,0.17965,0.102717,0.2457,0.10468,0.144103,0.024937,...,0.070824,0.142433,0.023357,0.137102,0.102202,0.404068,0.074456,0.346571,0.066657,0.314617
5,1,0.215074,0.159961,0.116182,0.292427,0.132091,0.370027,0.115282,0.177892,0.071857,...,0.088837,0.261732,0.080259,0.236546,0.11954,0.435446,0.117992,0.456517,0.121484,0.448574
6,1,0.150391,0.181301,0.085158,0.235557,0.093279,0.225444,0.084884,0.130535,0.060643,...,0.066647,0.150543,0.049337,0.141984,0.078963,0.425201,0.085181,0.427421,0.099795,0.55212
7,2,0.157613,0.147565,0.081649,0.212746,0.086191,0.143866,0.05527,0.104003,0.039948,...,0.053174,0.126396,0.038066,0.125855,0.157087,0.416778,0.083642,0.343416,0.073599,0.406087
8,3,0.126746,0.151326,0.09286,0.242458,0.111497,0.342533,0.10382,0.265208,0.027969,...,0.080984,0.230606,0.045485,0.23206,0.156746,0.403881,0.070278,0.468951,0.067265,0.52087
9,1,0.149688,0.189038,0.086898,0.208071,0.077906,0.194005,0.069564,0.128986,0.032788,...,0.045411,0.156811,0.028432,0.133799,0.206457,0.484646,0.205283,0.54406,0.183624,0.499091


In [None]:
# Binarise the target: tertile 1 stays as class 1, tertiles 2 and 3 become class 0
tertile_column = 'MoCA_AnnualChange_Tertile'
df[tertile_column] = df[tertile_column].replace(to_replace=[2, 3], value=0)
df

Unnamed: 0,MoCA_AnnualChange_Tertile,GE AEC-c N2-delta,GE wPLI N2-delta,GE AEC-c N2-theta,GE wPLI N2-theta,GE AEC-c N2-alpha,GE wPLI N2-alpha,GE AEC-c N2-beta,GE wPLI N2-beta,GE AEC-c N2-gamma,...,GE AEC-c REM-beta,GE wPLI REM-beta,GE AEC-c REM-gamma,GE wPLI REM-gamma,GE AEC-c Awake-delta,GE wPLI Awake-delta,GE AEC-c Awake-theta,GE wPLI Awake-theta,GE AEC-c Awake-alpha,GE wPLI Awake-alpha
0,0,0.20762,0.155513,0.100038,0.221858,0.096494,0.222866,0.06444,0.094509,0.028792,...,,,,,0.098278,0.315736,0.08512,0.298031,0.094722,0.278329
1,1,0.12877,0.210655,0.083121,0.365524,0.079391,0.263201,0.08057,0.143701,0.052161,...,0.059421,0.130546,0.042475,0.129062,,,,,,
2,0,0.227991,0.155559,0.138373,0.190773,0.107806,0.188122,0.068478,0.150932,0.032448,...,0.059307,0.154892,0.026183,0.129226,0.116579,0.291098,0.101767,0.270707,0.098217,0.238577
3,1,0.194273,0.188231,0.09898,0.299543,0.083024,0.237033,0.098423,0.144047,0.035543,...,0.05436,0.134696,0.026734,0.140236,0.107297,0.439166,0.06855,0.391113,0.066035,0.34719
4,0,0.173757,0.158966,0.10073,0.17965,0.102717,0.2457,0.10468,0.144103,0.024937,...,0.070824,0.142433,0.023357,0.137102,0.102202,0.404068,0.074456,0.346571,0.066657,0.314617
5,1,0.215074,0.159961,0.116182,0.292427,0.132091,0.370027,0.115282,0.177892,0.071857,...,0.088837,0.261732,0.080259,0.236546,0.11954,0.435446,0.117992,0.456517,0.121484,0.448574
6,1,0.150391,0.181301,0.085158,0.235557,0.093279,0.225444,0.084884,0.130535,0.060643,...,0.066647,0.150543,0.049337,0.141984,0.078963,0.425201,0.085181,0.427421,0.099795,0.55212
7,0,0.157613,0.147565,0.081649,0.212746,0.086191,0.143866,0.05527,0.104003,0.039948,...,0.053174,0.126396,0.038066,0.125855,0.157087,0.416778,0.083642,0.343416,0.073599,0.406087
8,0,0.126746,0.151326,0.09286,0.242458,0.111497,0.342533,0.10382,0.265208,0.027969,...,0.080984,0.230606,0.045485,0.23206,0.156746,0.403881,0.070278,0.468951,0.067265,0.52087
9,1,0.149688,0.189038,0.086898,0.208071,0.077906,0.194005,0.069564,0.128986,0.032788,...,0.045411,0.156811,0.028432,0.133799,0.206457,0.484646,0.205283,0.54406,0.183624,0.499091


In [None]:
# Bootstrap iterations used for the all-features model
n_iterations = 1000

results = ml_classifier(
    input_table=df,
    target="MoCA_AnnualChange_Tertile",
    n_iterations=n_iterations,
    classifier_type='xgb',
)

Bootstrap iterations: 100%|██████████| 1000/1000 [06:32<00:00,  2.55it/s]


In [None]:
# ROC curve of the all-features model
plot_roc_all_features(results=results, n_iterations=n_iterations)

In [None]:
# Feature importances of the all-features model
plot_feature_importances(
    results=results,
    input_table=df,
    target="MoCA_AnnualChange_Tertile",
)

In [None]:
## Sleep vs Awake Analysis

# Feature columns whose name mentions "Awake"
awake_columns = list(filter(lambda name: "Awake" in name, df.columns))

# Every remaining column except the identifier and the target counts as a sleep feature
not_sleep = set(awake_columns) | {"ID", "MoCA_AnnualChange_Tertile"}
non_awake_columns = [name for name in df.columns if name not in not_sleep]

# Target + awake features, keeping only rows without missing values
awake_features = df.loc[:, ["MoCA_AnnualChange_Tertile", *awake_columns]].dropna()

# Target + sleep features, keeping only rows without missing values
sleep_features = df.loc[:, ["MoCA_AnnualChange_Tertile", *non_awake_columns]].dropna()

In [None]:
## Sleep features only: train, then plot ROC and feature importances
n_iterations = 1000
results = ml_classifier(
    input_table=sleep_features,
    target="MoCA_AnnualChange_Tertile",
    n_iterations=n_iterations,
    classifier_type='xgb',
)
plot_roc_all_features(results=results, n_iterations=n_iterations)
plot_feature_importances(
    results=results,
    input_table=sleep_features,
    target="MoCA_AnnualChange_Tertile",
)

Bootstrap iterations: 100%|██████████| 1000/1000 [06:01<00:00,  2.76it/s]


In [None]:
## Awake features only: train, then plot ROC and feature importances
n_iterations = 1000
results = ml_classifier(
    input_table=awake_features,
    target="MoCA_AnnualChange_Tertile",
    n_iterations=n_iterations,
    classifier_type='xgb',
)
plot_roc_all_features(results=results, n_iterations=n_iterations)
plot_feature_importances(
    results=results,
    input_table=awake_features,
    target="MoCA_AnnualChange_Tertile",
)

Bootstrap iterations: 100%|██████████| 1000/1000 [06:08<00:00,  2.71it/s]
