## Train model for Clusters

In [1]:
model_name = 'DWTs'

### Import libraries

In [2]:
import numpy as np
from numpy.matlib import repmat
import scipy
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import mutual_info_classif


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, log_loss
from sklearn.model_selection import cross_validate

from matplotlib.colors import LinearSegmentedColormap
from mne import create_info
from mne.viz import plot_topomap

### Import data

In [3]:
filename = '/mnt/d/sharegit/MasterThesis_data/DWTs/pp01_t16.csv'
df = pd.read_csv(filename, header=None)
filename = '/mnt/d/sharegit/MasterThesis_data/DWTs/clustered_dataframe.csv'
df_scores = pd.read_csv(filename)
clusters = list(df_scores['Cluster'])

In [4]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

channels = ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 'F8', 'T7', 'T8', 'P7', 'P8', 'Fz', 'Cz', 'Pz', 'AFz', 'CPz', 'POz']
patterns = ['_d1', '_d2', '_d3', '_d4', '_d5', '_a1']
column_names = ['channels', 'ID', 'Class', 'Epoch'] + [f'{ch}{pt}' for ch in channels for pt in patterns ]

if len(column_names) != len(df.columns):
    raise ValueError(f"Number of column names ({len(column_names)}) does not match number of columns in DataFrame ({len(df.columns)})")

df.columns = column_names
df.head()

Unnamed: 0,channels,ID,Class,Epoch,Fp1_d1,Fp1_d2,Fp1_d3,Fp1_d4,Fp1_d5,Fp1_a1,...,CPz_d3,CPz_d4,CPz_d5,CPz_a1,POz_d1,POz_d2,POz_d3,POz_d4,POz_d5,POz_a1
0,5,1,1,1,0.575075,0.136032,0.123487,0.09356,0.057901,0.013945,...,0.140594,0.042888,0.025799,0.006609,0.336483,0.220715,0.231677,0.101612,0.089528,0.019985
1,5,1,1,2,0.509184,0.262941,0.106593,0.072962,0.037227,0.011093,...,0.163608,0.041396,0.009745,0.001428,0.244889,0.384025,0.221724,0.070184,0.060243,0.018934
2,5,1,1,3,0.49034,0.191802,0.204283,0.070682,0.033491,0.009402,...,0.218914,0.044268,0.010597,0.001778,0.273779,0.215197,0.39548,0.070791,0.03308,0.011673
3,5,1,1,4,0.436777,0.235945,0.218578,0.060795,0.038248,0.009657,...,0.273868,0.036003,0.010619,0.001746,0.369812,0.265958,0.274095,0.053359,0.028603,0.008174
4,5,1,1,5,0.398432,0.307515,0.197477,0.053886,0.033593,0.009097,...,0.30322,0.038299,0.007286,0.001249,0.214284,0.327662,0.322558,0.071483,0.049477,0.014537


### Load data into data_array

In [5]:
def create_all_intensity_observator(df):
    # Create an empty list to hold the concatenated data for each ID
    classes = []
    concatenated_data = []

    # Loop through each unique ID
    for ID in df['ID'].unique():
        # Filter the dataframe for the current ID
        df_temp = df[df['ID'] == ID]
        
        # List to hold the data arrays for all classes
        class_arrays = []
        
        # Loop through each unique class for the current ID
        max_c = df_temp['Class'].value_counts().max()
        min_c = df_temp['Class'].value_counts().min()
        rep_factor = max_c//min_c+1 
        for cls in df_temp['Class'].unique():
            # Filter the dataframe for the current class
            df_class = df_temp[df_temp['Class'] == cls].reset_index(drop=True)
            
            # Drop unnecessary columns
            df_class = df_class.drop(columns=['channels', 'ID', 'Class', 'Epoch'])
        
            # Convert the dataframe to a numpy array
            class_array = df_class.to_numpy()
            class_array = repmat(class_array, rep_factor, 1)
            class_array = class_array[:max_c,:]
            class_arrays.append(class_array)
        print(f'ID{ID:02d}: Repetition factor {rep_factor:02d} - Max is: {max_c:02d} - Min is: {min_c:02d}') 
        classes = classes+ [clusters[ID-1]]*max_c
        concatenated_data.append(np.concatenate((class_arrays),axis=1).tolist())
    final_array = np.concatenate((concatenated_data))

    channels = ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 'F8', 'T7', 'T8', 'P7', 'P8', 'Fz', 'Cz', 'Pz', 'AFz', 'CPz', 'POz']
    patterns = ['_d1', '_d2', '_d3', '_d4', '_d5', '_a1']
    mod_intensity = ['_A1','_A2','_A3','_A4','_V1','_V2','_V3','_V4','_C1','_C2','_C3','_C4']
    column_names = [f'{ch}_{pt}{mi}' for ch in channels for mi in mod_intensity for pt in patterns ]
    
    # Convert the final array to a dataframe
    final_df = pd.DataFrame(final_array, columns = column_names)

    return final_df, classes

# Example usage
final_df, classes = create_all_intensity_observator(df)

ID01: Repetition factor 03 - Max is: 16 - Min is: 06
ID02: Repetition factor 02 - Max is: 20 - Min is: 17
ID03: Repetition factor 02 - Max is: 20 - Min is: 17
ID04: Repetition factor 02 - Max is: 20 - Min is: 17
ID05: Repetition factor 02 - Max is: 20 - Min is: 15
ID06: Repetition factor 02 - Max is: 20 - Min is: 17
ID07: Repetition factor 02 - Max is: 20 - Min is: 17
ID08: Repetition factor 02 - Max is: 20 - Min is: 17
ID09: Repetition factor 02 - Max is: 20 - Min is: 17
ID10: Repetition factor 02 - Max is: 20 - Min is: 17
ID11: Repetition factor 02 - Max is: 20 - Min is: 17
ID12: Repetition factor 02 - Max is: 20 - Min is: 17
ID13: Repetition factor 02 - Max is: 19 - Min is: 14
ID14: Repetition factor 02 - Max is: 20 - Min is: 17
ID15: Repetition factor 02 - Max is: 20 - Min is: 17
ID16: Repetition factor 02 - Max is: 20 - Min is: 17
ID17: Repetition factor 02 - Max is: 20 - Min is: 17
ID18: Repetition factor 02 - Max is: 20 - Min is: 13
ID19: Repetition factor 02 - Max is: 20 - Min 

In [6]:
final_df

Unnamed: 0,Fp1__d1_A1,Fp1__d2_A1,Fp1__d3_A1,Fp1__d4_A1,Fp1__d5_A1,Fp1__a1_A1,Fp1__d1_A2,Fp1__d2_A2,Fp1__d3_A2,Fp1__d4_A2,...,POz__d3_C3,POz__d4_C3,POz__d5_C3,POz__a1_C3,POz__d1_C4,POz__d2_C4,POz__d3_C4,POz__d4_C4,POz__d5_C4,POz__a1_C4
0,0.575075,0.136032,0.123487,0.093560,0.057901,0.013945,0.547423,0.179707,0.128436,0.079774,...,0.120006,0.058866,0.018769,0.004650,0.310979,0.280219,0.148740,0.145800,0.084847,0.029415
1,0.509184,0.262941,0.106593,0.072962,0.037227,0.011093,0.454395,0.294668,0.132037,0.068668,...,0.055985,0.024101,0.008343,0.001890,0.374326,0.261388,0.189394,0.089160,0.064557,0.021176
2,0.490340,0.191802,0.204283,0.070682,0.033491,0.009402,0.449821,0.230215,0.206960,0.072723,...,0.108032,0.034976,0.009704,0.002487,0.310311,0.371701,0.153067,0.090099,0.057843,0.016979
3,0.436777,0.235945,0.218578,0.060795,0.038248,0.009657,0.404348,0.271724,0.228696,0.058293,...,0.092514,0.037553,0.011085,0.002544,0.415283,0.204239,0.225839,0.088631,0.047929,0.018078
4,0.398432,0.307515,0.197477,0.053886,0.033593,0.009097,0.347098,0.329096,0.221465,0.060126,...,0.085023,0.024561,0.011938,0.003414,0.363550,0.392284,0.128789,0.063558,0.035549,0.016269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,0.520923,0.146249,0.227325,0.077945,0.023838,0.003721,0.511574,0.197145,0.186010,0.082555,...,0.221002,0.097901,0.021570,0.002329,0.465205,0.157267,0.255658,0.082686,0.033793,0.005391
671,0.440654,0.148458,0.309111,0.078313,0.019596,0.003868,0.348667,0.201904,0.304249,0.123013,...,0.199921,0.169995,0.028542,0.004070,0.304721,0.153519,0.279493,0.200074,0.053543,0.008649
672,0.301289,0.144256,0.421904,0.102509,0.026651,0.003391,0.348306,0.121300,0.403601,0.092343,...,0.202985,0.164066,0.021850,0.002545,0.430492,0.186828,0.221463,0.127088,0.029180,0.004949
673,0.405805,0.160670,0.293036,0.117079,0.020803,0.002607,0.401348,0.154556,0.297756,0.113953,...,0.193004,0.116954,0.023412,0.002485,0.412937,0.182140,0.255691,0.107749,0.036119,0.005365


In [7]:
print(len(classes))
data_array = final_df.values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_array, classes, test_size=0.2, random_state=42)

675


### Train models

In [8]:
def print_scores(name, scores):
    mean_test_accuracy = np.mean(scores['test_accuracy']) * 100
    std_test_accuracy = np.std(scores['test_accuracy']) * 100
    mean_test_precision = np.mean(scores['test_precision'])
    mean_test_recall = np.mean(scores['test_recall'])
    mean_test_f1 = np.mean(scores['test_f1'])
    mean_test_roc_auc = np.mean(scores['test_roc_auc'])
    
    print(f"------------------------------------------------------------")
    print(f"Average K-Fold Test Accuracy of {name}: {mean_test_accuracy:.2f}%")
    print(f"Std Dev of K-Fold Test Accuracy of {name}: {std_test_accuracy:.2f}%")
    print(f"Average K-Fold Test Precision of {name}: {mean_test_precision:.2f}")
    print(f"Average K-Fold Test Recall of {name}: {mean_test_recall:.2f}")
    print(f"Average K-Fold Test F1 Score of {name}: {mean_test_f1:.2f}")
    print(f"Average K-Fold Test ROC AUC of {name}: {mean_test_roc_auc:.2f}")
    print(f"------------------------------------------------------------")
    
def RForest_train(X_train, y_train, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', validation='standard', cv=5, echo=True):
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                   max_features=max_features, random_state=42)
    
    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring, return_train_score=True)

    model.fit(X_train, y_train)
    
    if echo:
        print_scores('Random Forest', scores)
    return scores, model


def KNN_train(X_train, y_train, hot_encode=False, n_neighbors=20, p=1, weights='distance', leaf_size=20, algorithm='auto', validation='standard', cv=5, echo=True):
    model = KNeighborsClassifier(n_neighbors=n_neighbors, p=p, weights=weights, leaf_size=leaf_size, algorithm=algorithm)

    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring, return_train_score=True)
    
    model.fit(X_train, y_train)
    
    if echo:
        print_scores('KNN', scores)
    return scores, model


def MLP_train(X_train, y_train,  hot_encode=False, hidden_layer_sizes=(100,), activation='relu', solver='adam', validation='standard', alpha=0.001, cv=5, max_iter=200, echo=True):
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size='auto', learning_rate='constant', 
                              learning_rate_init=0.001, power_t=0.5, max_iter=max_iter, shuffle=True, random_state=None, tol=0.0001, verbose=False, 
                              warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, 
                              beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)

    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    scores = cross_validate(model,X_train, y_train, cv=cv, scoring=scoring, return_train_score=True)
    
    model.fit(X_train, y_train)

    if echo:
        print_scores('MLP', scores)
    return scores, model

def train_all(X_train, y_train):

    models = []
    scores = []
    
    score, model = KNN_train(X_train, y_train, cv = 10)
    models.append(model)
    scores.append(score)
    
    score, model =RForest_train(X_train, y_train,  cv = 10)
    models.append(model)
    scores.append(score)
    
    score, model = MLP_train(X_train, y_train, hidden_layer_sizes=(200, 100, 50), max_iter=10000, alpha=0.001, cv=10)
    models.append(model)
    scores.append(score)
    
    return models, scores

models,scores = train_all(X_train, y_train)

------------------------------------------------------------
Average K-Fold Test Accuracy of KNN: 97.96%
Std Dev of K-Fold Test Accuracy of KNN: 1.93%
Average K-Fold Test Precision of KNN: 0.96
Average K-Fold Test Recall of KNN: 1.00
Average K-Fold Test F1 Score of KNN: 0.98
Average K-Fold Test ROC AUC of KNN: 1.00
------------------------------------------------------------
------------------------------------------------------------
Average K-Fold Test Accuracy of Random Forest: 99.26%
Std Dev of K-Fold Test Accuracy of Random Forest: 1.23%
Average K-Fold Test Precision of Random Forest: 0.99
Average K-Fold Test Recall of Random Forest: 0.99
Average K-Fold Test F1 Score of Random Forest: 0.99
Average K-Fold Test ROC AUC of Random Forest: 1.00
------------------------------------------------------------
------------------------------------------------------------
Average K-Fold Test Accuracy of MLP: 99.26%
Std Dev of K-Fold Test Accuracy of MLP: 1.23%
Average K-Fold Test Precision of 

In [9]:
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode

def calculate_acc(X_test, y_test_encoded, models):
    ensemble_preds = np.zeros((len(y_test_encoded), len(models)))
    accs = []
    # Loop over each model to get predictions and calculate individual accuracies
    for i, (model, name) in enumerate(zip(models, ['KNN', 'Random Forest','MLP'])):
        pred = model.predict(X_test)
        
        # Convert string predictions to numerical form
        pred_encoded = le.transform(pred)
        
        acc = np.mean(pred_encoded == y_test_encoded)
        accs.append(acc)
        print(f"{name} Classification Accuracy: {acc}")
        
        # Store the numerical predictions for ensemble averaging
        ensemble_preds[:, i] = pred_encoded

    # Average the predictions using mode for classification|
    ensemble_final_preds_encoded = mode(ensemble_preds, axis=1)[0].flatten()
    
    # Convert numerical predictions back to string labels
    ensemble_final_preds = le.inverse_transform(ensemble_final_preds_encoded.astype(int))
    
    # Calculate the ensemble accuracy
    ensemble_acc = np.mean(ensemble_final_preds == y_test)
    print(f"Ensemble Classification Accuracy: {ensemble_acc}")
    return accs, ensemble_preds
le = LabelEncoder()
y_test_encoded = le.fit_transform(y_test)
accs,ensemble_preds = calculate_acc(X_test, y_test_encoded, models)

KNN Classification Accuracy: 0.9925925925925926
Random Forest Classification Accuracy: 0.9925925925925926
MLP Classification Accuracy: 0.9851851851851852
Ensemble Classification Accuracy: 1.0


In [10]:
import numpy as np
from sklearn.feature_selection import mutual_info_classif

def get_mutualInfo(X_train, X_test, y_train, number_of_f=100):
    # Calculate mutual information scores
    mutual_info_scores = mutual_info_classif(X_train, y_train)
    
    if number_of_f > 0:
        # Sort the mutual information scores in descending order and select the top n features
        sorted_indices = np.argsort(mutual_info_scores)[::-1]
        selected_features = sorted_indices[:number_of_f]
    else:
        raise ValueError("Number of features must be greater than 0.")

    # Filter the training and test sets to include only the selected features
    X_train_sf = X_train[:, selected_features]
    X_test_sf = X_test[:, selected_features]

    # Define the channels and patterns
    channels = ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 'F8', 'T7', 'T8', 'P7', 'P8', 'Fz', 'Cz', 'Pz', 'AFz', 'CPz', 'POz']
    patterns = ['d1', 'd2', 'd3', 'd4', 'd5', 'a1']
    mod_intensity = ['A1','A2','A3','A4','V1','V2','V3','V4','C1','C2','C3','C4']
    feature_names = [f'{ch}_{pt}_{mi}' for ch in channels for pt in patterns for mi in mod_intensity]
    
    # Ensure that feature_names has the same length as mutual_info_scores
    if len(feature_names) != len(mutual_info_scores):
        raise ValueError(f"The number of generated feature names ({len(feature_names)}) does not match the number of features ({len(mutual_info_scores)}) in the data.")
    
    # Print the mutual information scores of the top n features
    print(f"Top {number_of_f} most important features:")
    for i in selected_features:
        print(f"{feature_names[i]}: Mutual Information Score: {mutual_info_scores[i]:.5f}")
    
    # Print the number of selected features
    print(f"Number of features selected: {len(selected_features)}")
    
    return X_train_sf, X_test_sf

X_train_sf, X_test_sf = get_mutualInfo(X_train, X_test, y_train, number_of_f = 10 )

Top 10 most important features:
Fp1_d1_C1: Mutual Information Score: 0.15642
F4_d5_A3: Mutual Information Score: 0.14828
F4_d5_C1: Mutual Information Score: 0.14157
P3_d3_V1: Mutual Information Score: 0.14123
C4_d2_C1: Mutual Information Score: 0.14109
C3_d5_C1: Mutual Information Score: 0.14104
Fp2_a1_A3: Mutual Information Score: 0.14065
C4_d4_A3: Mutual Information Score: 0.14013
Fp2_d1_V2: Mutual Information Score: 0.14010
Fp1_d2_C4: Mutual Information Score: 0.13630
Number of features selected: 10


In [11]:
models, scores = train_all(X_train_sf, y_train)

------------------------------------------------------------
Average K-Fold Test Accuracy of KNN: 82.78%
Std Dev of K-Fold Test Accuracy of KNN: 3.32%
Average K-Fold Test Precision of KNN: 0.84
Average K-Fold Test Recall of KNN: 0.84
Average K-Fold Test F1 Score of KNN: 0.84
Average K-Fold Test ROC AUC of KNN: 0.92
------------------------------------------------------------
------------------------------------------------------------
Average K-Fold Test Accuracy of Random Forest: 87.59%
Std Dev of K-Fold Test Accuracy of Random Forest: 4.46%
Average K-Fold Test Precision of Random Forest: 0.89
Average K-Fold Test Recall of Random Forest: 0.88
Average K-Fold Test F1 Score of Random Forest: 0.88
Average K-Fold Test ROC AUC of Random Forest: 0.96
------------------------------------------------------------
------------------------------------------------------------
Average K-Fold Test Accuracy of MLP: 82.78%
Std Dev of K-Fold Test Accuracy of MLP: 3.32%
Average K-Fold Test Precision of 

In [12]:
accs, ensemble_preds = calculate_acc(X_test_sf, y_test_encoded, models)

KNN Classification Accuracy: 0.8444444444444444
Random Forest Classification Accuracy: 0.9703703703703703
MLP Classification Accuracy: 0.8592592592592593
Ensemble Classification Accuracy: 0.9333333333333333
