# Notebook
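This notebook runs a grid search over the `EEGDataExtractor` parameters (`lfreq`, `hfreq`, `tmin`, `tmax`, `notch_filter`): for every combination a model is trained and its verification metrics are recorded.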

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import json

In [4]:
import torch

# Report the available CUDA hardware and pick the device used for training.
print("Number of GPU: ", torch.cuda.device_count())
# get_device_name() raises when no CUDA device is present, so only query it
# on machines that actually have one; CPU-only machines fall through cleanly.
if torch.cuda.is_available():
    print("GPU Name: ", torch.cuda.get_device_name())

# Fall back to the CPU when CUDA is unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 2060
Using device: cuda


In [5]:
import pandas as pd
import numpy as np
import mne
import os
import matplotlib.pyplot as plt

In [6]:
from eeg_lib.commons.constant import DATASETS_FOLDER
from eeg_lib.data.data_loader.EEGDataExtractor import EEGDataExtractor

In [7]:
from eeg_lib.utils.engine import create_user_profiles

In [8]:
from eeg_lib.utils.helpers import compute_genuine_imposter_distances, compute_threshold_metrics, compute_f1_vs_threshold, split_test_data_for_verification

In [9]:
from eeg_lib.utils.visualisations import plot_distance_distribution_on_ax, plot_threshold_metrics, plot_f1_vs_threshold, plot_distance_distribution_return, plot_f1_vs_threshold_return, plot_threshold_metrics_return

In [10]:
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from matplotlib.colors import ListedColormap

In [11]:
from sklearn.model_selection import ParameterGrid

In [12]:
import torch.nn as nn

In [13]:
from eeg_lib.data.data_loader.TDNNFeatures import extract_features, extract_psd_features
from eeg_lib.data.TDNNDataset import TDNNDataset, get_dataset
from eeg_lib.models.verification.XVector import XVectorEmbeddingModel
from eeg_lib.losses.ProxyNCALoss import ProxyNCALoss
from eeg_lib.utils.visualisations import plot_tsne
from eeg_lib.utils.visualisations import create_handles
from eeg_lib.utils.helpers import split_train_test
from eeg_lib.models.similarity.centroids import SimilarityCentroidsVerifier, get_accuracy

In [14]:
from torch.utils.tensorboard import SummaryWriter

In [15]:
from torch.optim.lr_scheduler import StepLR, ExponentialLR, ReduceLROnPlateau

In [16]:
from eeg_lib.models.verification.XVector import get_ecapa_model, get_standard_model, pretrain, fine_tune, create_embeddings, fine_tune_arcface

Grid to search over

In [22]:
# Search space: each key is forwarded to the EEGDataExtractor keyword argument
# of the same name, and ParameterGrid expands the lists into every combination
# (3 x 1 x 1 x 3 x 1 = 9 runs).
grid = dict(
    lfreq=[3.0, 7.0, 10.0],
    hfreq=[100.0],
    tmin=[0.0],
    tmax=[2.0, 5.0, 7.0],
    notch_filter=[[50]],
)

In [23]:
# One metrics record (dict) is appended per grid-search run; the list is dumped to JSON_FILE_NAME by the grid-search cell.
results = []

In [24]:
# File the grid-search results are written to (a relative path, so it is created in the current working directory).
JSON_FILE_NAME = "eeg_extractor_grid_search_v2.json"

In [25]:
# Dataset directory handed to EEGDataExtractor as `data_dir`.
DATA_DIR = f"{DATASETS_FOLDER}/Kolory/"

Model hyperparameters, kept fixed for every run of the grid search; these values were pre-tested and found to be reasonable.

In [26]:
# Fixed model/training hyperparameters shared by every grid-search run.
# The dict is passed as-is to get_dataset, pretrain, fine_tune_arcface and
# create_embeddings, so key names (including the "dilatation" spelling) must
# stay exactly as they are.
hparams = {
    # batch size, learning rates, epochs and decay for the two training stages
    "batch_size": 64,
    "softmax_learning_rate": 0.001,
    "proxy_learning_rate": 0.001,
    "softmax_epochs": 70,
    "proxy_epochs": 100,
    "softmax_learning_rate_decay": 0.95,
    "proxy_learning_rate_decay": 0.95,
    # augmentation settings
    "augmentation": True,
    "std": 0.02,
    # embedding / regularisation / loss settings
    "embedding_dim": 256,
    "dropout_rate": 0.25,
    "scale": 10,
    "margin": 0.1,
    # per-layer filter counts
    "layer1_filters": 512,
    "layer2_filters": 512,
    "layer3_filters": 1024,
    "layer4_filters": 1024,
    "layer5_filters": 1500,
    # per-layer dilation and stride
    "layer_1_dilatation": 1,
    "layer_2_dilatation": 2,
    "layer_3_dilatation": 3,
    "layer_1_stride": 1,
    "layer_2_stride": 1,
    "layer_3_stride": 2,
    "no_norm": True,
}

In [27]:
def preprocess_data(eeg_df, test_size=0.2, random_state=42, fs=250, trunc_multiple=50):
    """Split the EEG dataframe, extract per-epoch features and standardise them.

    Steps:
      1. ``split_train_test`` partitions ``eeg_df`` into train/test epochs and labels.
      2. The epoch length (``shape[1]`` of the first training epoch) is rounded down
         to a multiple of ``trunc_multiple``; every epoch is passed through
         ``extract_features`` with that truncation and the result is transposed.
      3. Labels are integer-encoded, with a separate ``LabelEncoder`` per split.
      4. For each index along axis 1 of the stacked feature array a
         ``StandardScaler`` is fitted on the training data only and applied to
         both splits, so no test statistics leak into the normalisation.

    Args:
        eeg_df: Dataframe forwarded to ``split_train_test``.
        test_size: Forwarded to ``split_train_test``.
        random_state: Forwarded to ``split_train_test``.
        fs: Sampling frequency forwarded to ``extract_features`` (default 250,
            the value that was previously hard-coded).
        trunc_multiple: Epochs are truncated to the largest multiple of this
            number of samples (default 50, the value that was previously
            hard-coded). NOTE(review): ``extract_features`` may assume the same
            value internally - confirm before changing it.

    Returns:
        Tuple ``(X_train_norm, y_train_encoded, X_test_norm, y_test_encoded)``.

    Note:
        Train and test labels are encoded by independent encoders, so an integer
        code refers to the same class in both splits only when both splits
        contain the same set of labels.
        NOTE(review): confirm this matches how ``split_train_test`` splits.
    """
    X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = split_train_test(
        eeg_df=eeg_df, test_size=test_size, random_state=random_state
    )

    # Round the epoch length down to a whole number of `trunc_multiple` blocks.
    epoch_length = X_train_tmp[0].shape[1]
    truncated_length = (epoch_length // trunc_multiple) * trunc_multiple
    print(
        f"Epoch length: {epoch_length} samples -> truncated to {truncated_length} "
        f"({epoch_length - truncated_length} dropped)"
    )

    def _featurise(epochs):
        # One transposed feature matrix per epoch, stacked into a single array.
        return np.array(
            [extract_features(epoch, fs=fs, trunc=truncated_length).T for epoch in epochs]
        )

    X_train = _featurise(X_train_tmp)
    X_test = _featurise(X_test_tmp)

    # Independent encoders per split (see the docstring note).
    y_train_encoded = LabelEncoder().fit_transform(y_train_tmp)
    y_test_encoded = LabelEncoder().fit_transform(y_test_tmp)

    # Scale each slice along axis 1 separately, using training statistics only.
    X_train_norm = np.empty_like(X_train)
    X_test_norm = np.empty_like(X_test)
    for f in range(X_train.shape[1]):
        scaler = StandardScaler().fit(X_train[:, f, :])
        X_train_norm[:, f, :] = scaler.transform(X_train[:, f, :])
        X_test_norm[:, f, :] = scaler.transform(X_test[:, f, :])

    return X_train_norm, y_train_encoded, X_test_norm, y_test_encoded

For each run we track:
- the parameters used
- the model's final loss (`final_loss`)
- `centroid_train_acc`
- `centroid_test_acc`
- `final_train_acc`
- `final_train_eer`
- `final_train_f1`
- `final_test_acc`
- `final_test_f1`
- `final_test_eer`

with the most important ones being:
- final loss
- final test acc
- final test f1
- final test eer

In [None]:
# Grid search: for every parameter combination, re-extract the data, train the
# model, compute verification metrics on the train and test embeddings, and
# record them in `results`.
for run_idx, params in enumerate(ParameterGrid(grid), start=1):
    print(f"RUN NUMBER: {run_idx}")
    print("Parameters: ", params)

    # --- Data extraction with this run's parameters ---
    extractor = EEGDataExtractor(
        data_dir=DATA_DIR,
        lfreq=params['lfreq'],
        hfreq=params['hfreq'],
        tmin=params['tmin'],
        tmax=params['tmax'],
        notch_filter=params['notch_filter']
    )

    eeg_df, participant_info = extractor.extract_dataframe()
    X_train_norm, y_train_encoded, X_test_norm, y_test_encoded = preprocess_data(eeg_df=eeg_df)

    num_train_classes = len(np.unique(y_train_encoded))

    # --- Training: pretraining followed by ArcFace fine-tuning ---
    train_loader = get_dataset(hparams, X_train_norm, y_train_encoded)
    model = pretrain(hparams, device, X_train_norm.shape[1], num_train_classes, train_loader, None, "ECAPA2").to(device)
    model, final_loss = fine_tune_arcface(model, hparams, device, train_loader, num_train_classes, None, return_final_loss=True)
    # Embeddings are computed with the model moved to the CPU.
    model = model.to("cpu")

    # --- Embeddings and centroid accuracy ---
    embd_train, embd_test = create_embeddings(model, X_train_norm, X_test_norm, hparams)
    centroid_train_acc, centroid_test_acc = get_accuracy(embd_train, embd_test, y_train_encoded, y_test_encoded)

    # --- Train-set verification metrics ---
    user_profiles = create_user_profiles(embd_train, np.array(y_train_encoded))
    genuine_dists, imposter_dists = compute_genuine_imposter_distances(
        embeddings=embd_train,
        ids=np.array(y_train_encoded),
        user_profiles=user_profiles,
        distance_metric="cosine"
    )

    (
        thresholds, fnr_list, fpr_list, acc_list,
        best_T, best_fnr, best_fpr, best_acc
    ) = compute_threshold_metrics(genuine_dists, imposter_dists, num_thresholds=200)

    # Note: `thresholds` and `best_T` from the call above are overwritten here;
    # only best_fnr / best_acc (above) and best_f1 (below) are recorded.
    thresholds, f1_list, best_T, best_f1 = compute_f1_vs_threshold(
        genuine_dists, imposter_dists, num_thresholds=300
    )

    # --- Test-set verification metrics ---
    # Part of the test embeddings builds the user profiles (profile_ratio=0.6);
    # the remainder is verified against them.
    profile_embd, profile_ids, verify_embd, verify_ids = split_test_data_for_verification(
        embd_test, np.array(y_test_encoded), profile_ratio=0.6
    )

    test_user_profiles = create_user_profiles(profile_embd, profile_ids)

    # `genuine_dists` / `imposter_dists` now hold the TEST distances.
    genuine_dists, imposter_dists = compute_genuine_imposter_distances(
        embeddings=verify_embd,
        ids=verify_ids,
        user_profiles=test_user_profiles,
        distance_metric="cosine"
    )

    (
        test_thresholds, test_fnr_list, test_fpr_list, test_acc_list,
        test_best_T, test_best_fnr, test_best_fpr, test_best_acc
    ) = compute_threshold_metrics(genuine_dists, imposter_dists, num_thresholds=200)

    test_thresholds_f1, test_f1_list, test_best_T_f1, test_best_f1 = compute_f1_vs_threshold(
        genuine_dists, imposter_dists, num_thresholds=300
    )

    # NOTE(review): the "*_eer" entries store the FNR at the threshold selected
    # by compute_threshold_metrics; that equals the EER only if the function
    # selects the FNR == FPR operating point - confirm.
    results.append({
        "params": params,
        "final_loss": final_loss,
        "centroid_train_acc": centroid_train_acc,
        "centroid_test_acc": centroid_test_acc,
        "final_train_acc": best_acc,
        "final_train_eer": best_fnr,
        "final_train_f1": best_f1,
        "final_test_acc": test_best_acc,
        "final_test_f1": test_best_f1,
        "final_test_eer": test_best_fnr
    })

    # Persist after every run so a crash or interrupt later in the grid does
    # not discard the runs that already finished. `default=float` converts
    # scalar values json cannot serialise natively (for example NumPy float32
    # or 0-d tensor metrics) instead of raising after training has completed.
    with open(JSON_FILE_NAME, "w") as f:
        json.dump(results, f, indent=2, default=float)