# Script content

- We retrieve the total matrix of standardized data: `total_list_stand`.
- We generate matrices by pairing EEG channels and concatenate them.
- We apply XGBoost with different hyperparameters (eta, gamma, max_depth) using Cross-Validation (CV).
- We sort and store the results in a DataFrame for better visualization.
- We take the best hyperparameter combination, train an XGBoost model using the training set, and test it on the test set.
- We compute and display the confusion matrix to evaluate the model’s performance.

In [1]:
import numpy as np  
import pandas as pd
import xgboost as xgb
import pickle
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [8]:
# Restore the previously pickled list of standardized EEG data from disk
with open("total_list_stand.pkl", "rb") as pkl_file:
    total_list_stand = pickle.load(pkl_file)

Now, `total_list_stand` contains 4 lists, each corresponding to one of the preprocessing steps.
Each of the 4 lists contains 35 sublists, representing the different segments (chunks) we have generated.
Furthermore, each of the 35 sublists contains 19 matrices of size 121x54, one for each EEG channel.


Now, we need to take these final matrices and combine them in pairs of channels.

In [10]:
# total_list_stand: nested lists of per-channel EEG DataFrames,
# organised as preprocessing method -> segment -> channel.
# preprocess_list mirrors that nesting, but holds one DataFrame per channel pair.
preprocess_list = []
for method_segments in total_list_stand:
    paired_segments = []

    for channels in method_segments:
        paired_channels = []

        # Walk every unordered pair exactly once: each channel is combined
        # only with the channels that come after it in the list.
        for position, first in enumerate(channels):
            for second in channels[position + 1:]:
                # Drop the trailing label column of both channels and
                # place their feature columns side by side.
                combined = pd.concat(
                    [first.iloc[:, :-1], second.iloc[:, :-1]], axis=1
                )

                # Re-attach the label, taken from the first channel of the pair
                combined['Label'] = first['Label']

                paired_channels.append(combined)

        # One list of paired matrices per segment
        paired_segments.append(paired_channels)

    # One list of segments per preprocessing method
    preprocess_list.append(paired_segments)


Now, in the list `preprocess_list`, we have all the matrices paired by channels.

In [None]:
# Grid search over XGBoost hyperparameters for every
# (preprocessing method, segment, channel pair) data set.
# Each entry appended to results_list is the tuple
# (prep, segment, chan, eta, gamma, max_depth, accuracy, std_error).
results_list = []

# Hyperparameter grid, iterated in the order eta -> gamma -> max_depth
param_grid = [
    (eta, gamma, max_depth)
    for eta in [0.01, 0.1, 0.5, 1]  # Learning rates
    for gamma in [0, 1]  # Gamma values
    for max_depth in [2, 3, 6, 12, 24]  # Max tree depth
]

# Settings shared by every cross-validation run
num_round = 30  # Number of boosting rounds
k_folds = StratifiedKFold(n_splits=5)  # 5-fold stratified cross-validation

# Iterate over whatever preprocess_list actually contains instead of
# hard-coding its dimensions (4 methods x 35 segments x 171 pairs).
for prep, segments in enumerate(preprocess_list):
    for segment, channel_pairs in enumerate(segments):
        for chan, df_data in enumerate(channel_pairs):
            # The data does not depend on the hyperparameters, so the
            # DMatrix is built once per channel pair and reused below.
            data_dm = df_data.iloc[:, :-1].to_numpy()  # Features
            label_dm = df_data['Label'].to_numpy()  # Labels
            dtrain = xgb.DMatrix(data_dm, label=label_dm)

            for eta, gamma, max_depth in param_grid:
                print(f'prep: {prep}, segment: {segment}, chan: {chan}, eta: {eta}, gamma: {gamma}, max_depth: {max_depth}')

                # Define XGBoost parameters
                param = {"max_depth": max_depth, "eta": eta, "gamma": gamma, "objective": "binary:logistic"}

                # Perform cross-validation
                res = xgb.cv(
                    param, dtrain, num_round,
                    stratified=True, folds=k_folds,
                    metrics={"error"}, seed=0, verbose_eval=0
                )

                # Boosting round with the lowest mean test error
                # (idxmin returns the first such round on ties).
                # NOTE(review): picking the best round on the CV test folds
                # gives an optimistic accuracy estimate - confirm this is
                # acceptable for the intended comparison.
                min_row = res.loc[res['test-error-mean'].idxmin()]
                min_error = round(min_row['test-error-mean'], 4)
                std_error = round(min_row['test-error-std'], 4)

                # Store the results
                results_list.append((prep, segment, chan, eta, gamma, max_depth, 1 - min_error, std_error))
                print('Maximum accuracy obtained:', 1 - min_error, '+-', std_error)


In [5]:
# Position of the accuracy value inside each result tuple
accuracy_position = 6

# Column names, in the same order as the fields of each result tuple
result_columns = [
    'preprocess', 'segment', 'channel', 'eta',
    'gamma', 'max_depth', 'accuracy', 'std_error',
]

# Rank the results in place, highest accuracy first
results_list.sort(key=lambda entry: entry[accuracy_position], reverse=True)

# Tabulate the ranked results
df_results_xgboost = pd.DataFrame(results_list, columns=result_columns)

In [4]:
def num_to_channels(num):
    """
    Translate a channel-pair index into the names of its two EEG channels.

    Pairs are numbered row by row: first every pair that starts at channel 0,
    then every pair that starts at channel 1, and so on, with the second
    channel always having the larger index.

    Returns a tuple (first_name, second_name), or the string
    'Invalid number' when num is outside the range of valid pair indices.
    """
    n_channels = 19  # Total number of channels
    total_pairs = n_channels * (n_channels - 1) // 2  # Total unique channel pairs

    if num < 0 or total_pairs <= num:
        return 'Invalid number'

    # Peel off one row of pairs at a time until the index falls inside a row
    remaining = num
    for first in range(n_channels - 1):
        row_length = n_channels - first - 1  # pairs that start at `first`
        if remaining < row_length:
            second = first + remaining + 1
            return num_to_channel(first), num_to_channel(second)
        remaining -= row_length

    return 'Invalid number'

def num_to_channel(index):
    """
    Return the EEG channel name for a channel index.

    Indices 0 through 18 map to a channel name; any other value yields
    the string 'Unknown channel'.
    """
    # Channel names listed in index order (position == channel index)
    channel_names = (
        'Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4',
        'P3', 'P4', 'O1', 'O2', 'F7', 'F8',
        'T7', 'T8', 'P7', 'P8', 'Fz', 'Cz', 'Pz',
    )
    name_by_index = dict(enumerate(channel_names))
    return name_by_index.get(index, 'Unknown channel')


In [7]:
# Replace each numeric pair index in the 'channel' column with the
# corresponding pair of channel names returned by num_to_channels.
# This overwrites the column, so the cell expects numeric indices on entry.
pair_indices = list(df_results_xgboost['channel'])
df_results_xgboost['channel'] = [
    num_to_channels(pair_index) for pair_index in pair_indices
]

In [None]:
# Preview the leading 50 rows of the results DataFrame
df_results_xgboost.iloc[:50]

In [2]:
# # Save the DataFrame df_results_xgboost to a pickle file
# with open("xgboost_2channels_df_results.pkl", "wb") as fp:
#     pickle.dump(df_results_xgboost, fp)

# # Load the DataFrame df_results_xgboost from the pickle file
# with open("xgboost_2channels_df_results.pkl", "rb") as df_res_xgboost:
#     df_results_xgboost = pickle.load(df_res_xgboost)

In [None]:
# Sanity check: pair index 61 corresponds to the channel pair (F4, P7)
num_to_channels(61)

In [14]:
# Data set of the chosen configuration (preprocessing=0, segment=31, channel=61)
best_prep, best_segment, best_chan = 0, 31, 61
df_best_xgb = preprocess_list[best_prep][best_segment][best_chan]

# Every column except the last is a feature; 'Label' holds the targets
features = df_best_xgb.iloc[:, :-1].to_numpy()
targets = df_best_xgb['Label'].to_numpy()

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features, targets,
    train_size=0.8, random_state=4, stratify=targets,
)

# Hyperparameters of the chosen configuration
# NOTE(review): n_estimators is not set, so the library default applies,
# whereas the cross-validation search ran 30 boosting rounds - confirm
# that this difference is intended.
best_params = {
    'max_depth': 2,
    'eta': 0.1,
    'gamma': 0,
    'objective': 'binary:logistic',
}
bst = XGBClassifier(**best_params)

# Fit on the training portion only
bst.fit(X_train, y_train)

# Predict labels for the held-out test portion
preds = bst.predict(X_test)

In [None]:
# Confusion matrix of the true test labels against the model predictions
cm = confusion_matrix(y_true=y_test, y_pred=preds)

# Wrap the matrix in a display object and draw it
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()