# Script content

- We retrieve the total matrix of standardized data: `total_list_stand`.
- We apply KNN with GridSearch and Cross-Validation (CV) to the matrices.
- We generate a DataFrame with the results for better visualization.
- We take the best hyperparameter combination, train a KNN model with the winning hyperparameters on the training set, and then evaluate it on the test set, also generating the confusion matrix.

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
import math

In [2]:
# Restore the previously pickled nested list of standardized matrices.
# NOTE(review): pickle executes arbitrary code on load; only open files
# produced by a trusted run of the preprocessing step.
with open("total_list_stand.pkl", "rb") as pkl_file:
    total_list_stand = pickle.load(pkl_file)

Now, `total_list_stand` contains 4 lists, each corresponding to one of the preprocessing steps.

Each of the 4 lists contains 35 sublists, representing the different segments (chunks) we have generated.

Furthermore, each of the 35 sublists contains 19 matrices of size 121x54, one for each EEG channel.

### Cross-Validation KNN

In [None]:
# Search space shared by every grid search run below.
grid = dict(
    n_neighbors=[2, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
    algorithm=['auto', 'ball_tree', 'kd_tree'],
)

list_results = []  # Fitted GridSearchCV objects, one per matrix visited
cv_results = []  # The cv_results_ dictionary of each search, in the same order

# Visit every matrix: preprocessing type -> chunk -> channel.
# The append order (channel varies fastest, then chunk, then preprocessing)
# is what conversor_num_mat later relies on to decode a flat position.
for prep in range(4):
    for chunk in range(35):
        for chan in range(19):
            print(f'prep: {prep}, chunk: {chunk}, chan: {chan}')

            # Features are every column except the last one; the target is
            # read from the 'Label' column.
            frame = total_list_stand[prep][chunk][chan]
            features = frame.iloc[:, :-1].to_numpy()
            labels = frame['Label'].to_numpy()

            # 5-fold cross-validated grid search, fitted on the whole matrix.
            clf = GridSearchCV(
                estimator=KNeighborsClassifier(),
                param_grid=grid,
                cv=5,
                return_train_score=True,
                verbose=2,
            )
            clf.fit(features, labels)

            list_results.append(clf)
            cv_results.append(clf.cv_results_)

In [4]:
# One tuple per grid search:
# (best mean CV test score, row of that score, position in cv_results).
list_accuracies = []

for cont, dic_res in enumerate(cv_results):
    df = pd.DataFrame(dic_res)  # Tabular view of this search's CV results
    ind_max = df['mean_test_score'].idxmax()  # Row holding the top mean test score
    val_max = df.iloc[ind_max]['mean_test_score']  # The top mean test score itself
    list_accuracies.append((val_max, ind_max, cont))

In [5]:
# Winning hyperparameter dictionary of every grid search, in search order.
list_best_params = [search.best_params_ for search in list_results]

# Rank the searches from highest to lowest best score (in-place sort of the
# (score, row, position) tuples).
list_accuracies.sort(reverse=True)

In [6]:
def conversor_num_mat(num):
    """
    Convert a flat position into its preprocessing, chunk, and channel index.

    The position is decoded as a mixed-radix number in which the channel
    (19 values) varies fastest, then the chunk (35 values), then the
    preprocessing type (4 values, wrapped with modulo 4).

    Parameters
    ----------
    num : int
        Flat position, e.g. the index of a result in the results list.

    Returns
    -------
    tuple
        ``(prep, chunk, chan)``.
    """
    # divmod keeps everything in exact integer arithmetic; true division
    # followed by floor would go through a float and lose precision for
    # very large positions.
    n_chunk, chan = divmod(num, 19)
    n_prep, chunk = divmod(n_chunk, 35)
    prep = n_prep % 4
    return prep, chunk, chan

# Build one summary row per grid search, following the order of
# list_accuracies (expected to be sorted best-first by this point).
list_good_res = []

for tup in list_accuracies:
    acc = tup[0]  # Best mean CV test score of this search
    pos = tup[2]  # Position of the search in list_results / cv_results

    # Decode the flat position into preprocessing, chunk, and channel
    prep, chunk, chan = conversor_num_mat(pos)

    # Work on a copy: best_params_ belongs to the fitted GridSearchCV object
    # (and is also referenced from list_best_params), so adding keys to it
    # directly would corrupt the stored hyperparameters.
    row = dict(list_results[pos].best_params_)
    row['Preprocessing'] = prep
    row['Segment'] = chunk
    row['Channel'] = chan
    row['Accuracy'] = acc

    list_good_res.append(row)

# Convert the results list into a DataFrame for better visualization
df_results_knn = pd.DataFrame(list_good_res)

In [7]:
def conversor_num_to_chan(num):
    """
    Translate a channel index into its EEG channel name.

    Indices 0 through 18 map to the 19 names listed below. Any other value
    returns the string 'error de canales' instead of a channel name.
    """
    # Channel names, listed in index order (position 0 -> 'Fp1', ...).
    names = (
        'Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4',
        'P3', 'P4', 'O1', 'O2', 'F7', 'F8',
        'T7', 'T8', 'P7', 'P8', 'Fz', 'Cz', 'Pz',
    )
    lookup = dict(enumerate(names))

    # Unknown indices fall back to the sentinel string.
    return lookup.get(num, 'error de canales')

In [8]:
# Replace each numeric channel index with its EEG channel name.
# NOTE(review): re-running this cell on already-converted names yields
# 'error de canales' for every row.
df_results_knn['Channel'] = list(map(conversor_num_to_chan, df_results_knn['Channel']))

In [None]:
# Show the 20 best-ranked rows of the KNN results table.
# Each row describes a KNN search run on a single segment and a single
# channel, using all feature columns and the full matrix for cross-validation.
df_results_knn.iloc[:20]

In [10]:
# Persist the KNN results table with pickle (extension-less file name).
# NOTE(review): the following cell writes the same object to
# "knn_1channel_df_results.pkl"; this copy may be a leftover - confirm.
with open("knn_1channel_df_results", "wb") as out_file:
    pickle.dump(df_results_knn, out_file)

In [11]:
# Persist the KNN results table with pickle under a .pkl file name.
with open("knn_1channel_df_results.pkl", "wb") as out_file:
    pickle.dump(df_results_knn, out_file)

In [12]:
# KNN classifier with fixed hyperparameters (hard-coded here, not read from
# df_results_knn).
clas = KNeighborsClassifier(
    n_neighbors=5,
    p=1,
    weights='uniform',
    algorithm='auto',
)

# Matrix used for the final fit: preprocessing 0, chunk 12, channel 6.
# NOTE(review): the earlier comment named chunk=26 / channel=16, which does
# not match the indices actually used - confirm which selection is intended.
f_data = total_list_stand[0][12][6]

# Target comes from the 'Label' column; features are every column but the last.
label_arr = f_data['Label'].to_numpy()
data_arr = f_data.iloc[:, :-1].to_numpy()

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data_arr,
    label_arr,
    train_size=0.8,
    random_state=124,
    stratify=label_arr,
)

# Fit on the training portion, then predict the held-out portion.
clas.fit(X_train, y_train)
preds = clas.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of true test labels versus predicted labels.
cm = confusion_matrix(y_test, preds)

# Render the matrix as a plot.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()