# Notebook

Naive approach to the real data: exactly the same pipeline as for the "Kolory" dataset, training the model on the real data one epoch at a time, with no epoch concatenation and no information about the event — just the EEG signal.


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from eeg_lib.utils.helpers import get_device

# Report the CUDA hardware torch can see (nothing is printed on CPU-only hosts).
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    gpu_name = torch.cuda.get_device_name()
    print("Number of GPU: ", gpu_count)
    print("GPU Name: ", gpu_name)


# The project helper picks the device used by the rest of the notebook.
device = get_device()
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 2060
Using device: cuda


In [3]:
import pandas as pd
import numpy as np
import mne
import os
import matplotlib.pyplot as plt

In [4]:
from eeg_lib.commons.constant import DATASETS_FOLDER
from eeg_lib.data.data_loader.EEGDataExtractor import EEGDataExtractor

In [5]:
from eeg_lib.utils.engine import create_user_profiles

In [6]:
from eeg_lib.utils.helpers import compute_genuine_imposter_distances, compute_threshold_metrics, compute_f1_vs_threshold, split_test_data_for_verification

In [7]:
from eeg_lib.utils.visualisations import plot_distance_distribution_on_ax, plot_threshold_metrics, plot_f1_vs_threshold, plot_distance_distribution_return, plot_f1_vs_threshold_return, plot_threshold_metrics_return

In [None]:
# Location of the "Real" recordings inside the shared datasets folder.
DATA_DIR = f"{DATASETS_FOLDER}/Real/"

# Build the extractor once, then pull out the epoch table together with the
# participant metadata it returns alongside.
# NOTE(review): lfreq=7.0 is presumably the low cut-off of a band-pass filter
# in Hz — confirm against EEGDataExtractor.
extractor = EEGDataExtractor(data_dir=DATA_DIR, lfreq=7.0)
extraction_result = extractor.extract_dataframe()
eeg_df, participants_info = extraction_result

In [9]:
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from matplotlib.colors import ListedColormap

In [10]:
from sklearn.model_selection import ParameterGrid, GroupKFold

In [11]:
import torch.nn as nn

In [12]:
from eeg_lib.data.data_loader.TDNNFeatures import extract_features, extract_psd_features
from eeg_lib.data.TDNNDataset import TDNNDataset, get_dataset
from eeg_lib.models.verification.XVector import XVectorEmbeddingModel
from eeg_lib.losses.ProxyNCALoss import ProxyNCALoss
from eeg_lib.utils.visualisations import plot_tsne
from eeg_lib.utils.visualisations import create_handles
from eeg_lib.utils.helpers import split_train_test
from eeg_lib.models.similarity.centroids import SimilarityCentroidsVerifier, get_accuracy

In [13]:
from torch.utils.tensorboard import SummaryWriter

In [14]:
from torch.optim.lr_scheduler import StepLR, ExponentialLR, ReduceLROnPlateau

In [15]:
from eeg_lib.models.verification.XVector import get_ecapa_model, get_standard_model, pretrain, fine_tune, create_embeddings, fine_tune_arcface

In [16]:
# Experiment configuration for the hyper-parameter sweep.
K = 5  # NOTE(review): not referenced in the visible cells — confirm it is still needed
RUNS_FOLDER = "runs_ECAPA_real"  # root directory of the TensorBoard logs
APPENDIX = "ECAPAv0"  # suffix appended to each run's log directory name
# Every value is a list because the dict is expanded with ParameterGrid into
# one run per combination; with single-element lists exactly one run is made.
# NOTE(review): the meaning of individual keys is defined by the eeg_lib
# helpers that receive `hparams`; the grouping below is by name only.
grid = {
    # batching / optimisation schedule
    "batch_size": [64],
    "softmax_learning_rate": [0.001],
    "proxy_learning_rate": [0.001],
    "softmax_epochs": [10],
    "proxy_epochs": [10],
    "softmax_learning_rate_decay": [0.95],
    "proxy_learning_rate_decay": [0.95],
    # augmentation (presumably `std` is the noise level — TODO confirm)
    "augmentation": [True],
    "std": [0.02],
    # embedding size, regularisation and loss settings
    "embedding_dim": [256],
    "dropout_rate": [0.25],
    "scale": [10],
    "margin": [0.1],
    # network layout: filters, dilations and strides per layer
    "layer1_filters": [512],
    "layer2_filters": [512],
    "layer3_filters": [1024],
    "layer4_filters": [1024],
    "layer5_filters": [1500],
    "layer_1_dilatation": [1],
    "layer_2_dilatation": [2],
    "layer_3_dilatation": [3],
    "layer_1_stride": [1],
    "layer_2_stride": [1],
    "layer_3_stride": [2],
    "no_norm": [True]
}

In [17]:
# Recordings "11_raw"/"12_raw" and "21_raw"/"22_raw" are collapsed onto the
# participant labels "1" and "2" respectively; every other id is left as is.
session_to_participant = {
    "11_raw": "1",
    '12_raw': '1',
    '21_raw': '2',
    '22_raw': '2',
}
eeg_df['participant_id'] = eeg_df['participant_id'].replace(session_to_participant)

In [18]:
# Split the epochs into train and test parts; the helper prints which
# participants and labels ended up on each side.
X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = split_train_test(
    eeg_df=eeg_df,
    test_size=0.33,
    random_state=42,
)

Training set participants: ['2' 'AB-2025-07-26_raw']
Test set participants: ['1']
Training labels: ['2' 'AB-2025-07-26_raw']
Test labels: ['1']


In [19]:
# Round the epoch length down to the nearest multiple of 50 samples.
epoch_length = X_train_tmp[0].shape[1]
div = epoch_length // 50
mod = epoch_length % 50
truncated_length = epoch_length - mod

In [20]:
# One transposed feature matrix per training epoch, in the original order.
extracted_X_train = [
    extract_features(raw_epoch, fs=250, trunc=truncated_length).T
    for raw_epoch in X_train_tmp
]

In [21]:
# One transposed feature matrix per test epoch, in the original order.
extracted_X_test = [
    extract_features(raw_epoch, fs=250, trunc=truncated_length).T
    for raw_epoch in X_test_tmp
]

In [22]:
# Integer-encode the participant labels. Each split gets its own encoder, so
# train and test codes are independent of one another.
le_train = LabelEncoder()
y_train_encoded = le_train.fit_transform(y_train_tmp)

le_test = LabelEncoder()
y_test_encoded = le_test.fit_transform(y_test_tmp)

In [23]:
# Stack the per-epoch feature matrices into one array per split.
X_train = np.asarray(extracted_X_train)
X_test = np.asarray(extracted_X_test)

In [24]:
# Fitted scalers keyed by index along axis 1, plus uninitialised output
# buffers matching the shape and dtype of the raw arrays.
scalers = dict()
X_train_norm, X_test_norm = np.empty_like(X_train), np.empty_like(X_test)

In [25]:
# Scaling per feature: one StandardScaler per index along axis 1, fitted on
# the training split only and then applied to both splits.
# NOTE(review): StandardScaler works column-wise on the 2-D slice, so every
# position along the last axis gets its own mean/std rather than one pair per
# feature — confirm this is the intended normalisation.
for feat_idx in range(X_train.shape[1]):
    train_slice = X_train[:, feat_idx, :]
    feat_scaler = StandardScaler().fit(train_slice)
    scalers[feat_idx] = feat_scaler
    X_train_norm[:, feat_idx, :] = feat_scaler.transform(train_slice)
    X_test_norm[:, feat_idx, :] = feat_scaler.transform(X_test[:, feat_idx, :])

In [26]:
# Number of distinct encoded participants in the training split.
num_train_classes = np.unique(y_train_encoded).size

In [27]:
# Extended 30-colour palette; only its first two entries are used for `cmap`.
custom_colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
    '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
    '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000',
    '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080',
    '#ffffff', '#000000', '#a9a9a9', '#ff69b4', '#b0e0e6',
    '#32cd32', '#ff4500', '#da70d6', '#ff1493', '#7fffd4',
]
cmap = ListedColormap(custom_colors[:2])

# Short palette: red, green, blue, yellow, purple, orange. Its first two
# entries match the extended palette, so `cmap2` holds the same two colours
# as `cmap`.
custom_colors = ['#e6194b', '#3cb44b', '#4363d8', '#ffe119', '#911eb4', '#f58231']

cmap2 = ListedColormap(custom_colors[:2])

In [28]:
import matplotlib.patches as mpatches


In [29]:
# Legend patches for the test-set t-SNE plot: one patch per distinct test
# participant, coloured by sampling the colormap at i / num_ids.
# The test scatter is drawn with `cmap2`, so the legend samples `cmap2` as
# well; it previously sampled `cmap`, which only matched because both maps
# currently hold the same two colours.
unique_ids = np.unique(y_test_tmp)
num_ids = len(unique_ids)
test_handles = [
    mpatches.Patch(color=cmap2(float(i) / num_ids), label=str(participant_id))
    for i, participant_id in enumerate(unique_ids)
]

In [30]:
# Legend handles for the training-set t-SNE plot, built by the project helper
# from the raw (un-encoded) training labels and the `cmap` colormap.
train_handles = create_handles(y_train_tmp, cmap)

In [31]:
# Hyper-parameter sweep: every combination in `grid` is trained, embedded,
# visualised and evaluated, and logged to its own TensorBoard run directory.
for run_idx, hparams in enumerate(ParameterGrid(grid), start=1):
    print(f"RUN NUMBER: {run_idx}")
    print("Parameters: ", hparams)
    run_name = f"run_{run_idx}"
    writer = SummaryWriter(log_dir=f"{RUNS_FOLDER}/{run_name}_{APPENDIX}")
    # try/finally guarantees the event file is flushed and closed even when
    # training or evaluation raises part-way through a run.
    try:
        writer.add_hparams(hparams, {})

        # --- training: pretraining followed by ArcFace fine-tuning ---------
        train_loader = get_dataset(hparams, X_train_norm, y_train_encoded)
        model = pretrain(hparams, device, X_train_norm.shape[1], num_train_classes, train_loader, writer, "ECAPA2").to(device)
        model, final_loss = fine_tune_arcface(model, hparams, device, train_loader, num_train_classes, writer, return_final_loss=True)
        # The model is moved to the CPU before the embeddings are created.
        model = model.to("cpu")

        # --- embeddings and centroid accuracy ------------------------------
        embd_train, embd_test = create_embeddings(model, X_train_norm, X_test_norm, hparams)
        centroid_train_acc, centroid_test_acc = get_accuracy(embd_train, embd_test, y_train_encoded, y_test_encoded)

        writer.add_scalar("Final_Centroid_Accuracy_/train", centroid_train_acc)
        writer.add_scalar("Final_Centroid_Accuracy_/test", centroid_test_acc)

        # --- t-SNE visualisation -------------------------------------------
        tsne_train = TSNE(n_components=2, random_state=42).fit_transform(embd_train)
        tsne_test = TSNE(n_components=2, random_state=42).fit_transform(embd_test)

        # The two projections come from independent t-SNE fits and therefore
        # live in unrelated coordinate systems, so each one is min-max scaled
        # with its own scaler. Re-using the train-fitted scaler on the test
        # projection could push test points outside the [0, 1] range.
        scaler = MinMaxScaler(feature_range=(0, 1))
        tsne_train_scaled = scaler.fit_transform(tsne_train)
        test_scaler = MinMaxScaler(feature_range=(0, 1))
        tsne_test_scaled = test_scaler.fit_transform(tsne_test)

        fig_train = plot_tsne(
            tsne_train_scaled,
            cmap,
            y_train_encoded,
            handles=train_handles,
            alpha=0.5,
            title="TSNE Visualization training data XVector",
            xlabel="TSNE embedding dimension 1",
            ylabel="TSNE embedding dimension 2",
            centroids=None,
            return_fig=True,
        )
        writer.add_figure("tsne_train", fig_train)

        fig_test = plot_tsne(
            tsne_test_scaled,
            cmap2,
            y_test_encoded,
            handles=test_handles,
            alpha=0.5,
            title="TSNE Visualization test data XVector",
            xlabel="TSNE embedding dimension 1",
            ylabel="TSNE embedding dimension 2",
            centroids=None,
            return_fig=True,
        )
        # NOTE(review): the L2-normalised training embeddings are not used
        # anywhere in this cell; kept in case a later cell reads `embd_unit`.
        embd_unit = embd_train / np.linalg.norm(embd_train, axis=1, keepdims=True)
        writer.add_figure("tsne_test", fig_test)

        # --- verification metrics on the training participants -------------
        # Profiles are built from, and scored against, the same training
        # embeddings.
        user_profiles = create_user_profiles(embd_train, y_train_encoded)
        for metric in ("euclidean", "cosine"):
            genuine_dists, imposter_dists = compute_genuine_imposter_distances(
                embeddings=embd_train,
                ids=y_train_encoded,
                user_profiles=user_profiles,
                distance_metric=metric
            )

            (
                thresholds, fnr_list, fpr_list, acc_list,
                best_T, best_fnr, best_fpr, best_acc
            ) = compute_threshold_metrics(genuine_dists, imposter_dists, num_thresholds=200)

            writer.add_scalar(f"{metric}_best_threshold_train", best_T)
            writer.add_scalar(f"{metric}_best_acc_train", best_acc)
            # NOTE(review): only best_fnr is written under this "FNR/FPR" tag;
            # best_fpr is plotted below but never logged as a scalar.
            writer.add_scalar(f"{metric}_FNR/FPR_threshold_train", best_fnr)

            fig = plot_threshold_metrics_return(
                thresholds, fnr_list, fpr_list, acc_list,
                best_T, best_fnr, best_fpr, best_acc
            )
            writer.add_figure(f"threshold_metrics_train_{metric}", fig)

            # The F1 sweep rebinds `thresholds` and `best_T` from the
            # threshold-metrics sweep above.
            thresholds, f1_list, best_T, best_f1 = compute_f1_vs_threshold(
                genuine_dists, imposter_dists, num_thresholds=300
            )
            writer.add_scalar(f"{metric}_best_f1_train_fold", best_f1)

            fig = plot_f1_vs_threshold_return(thresholds, f1_list, best_T, best_f1)
            writer.add_figure(f"f1_vs_threshold_train_{metric}_fold", fig)

        # --- verification metrics on the held-out test participants --------
        # The test embeddings are split into a profile part, used to build
        # the user profiles, and a verification part that is scored against
        # them.
        profile_embd, profile_ids, verify_embd, verify_ids = split_test_data_for_verification(
            embd_test, y_test_encoded, profile_ratio=0.6
        )

        val_user_profiles = create_user_profiles(profile_embd, profile_ids)
        for metric in ("euclidean", "cosine"):
            genuine_dists, imposter_dists = compute_genuine_imposter_distances(
                embeddings=verify_embd,
                ids=verify_ids,
                user_profiles=val_user_profiles,
                distance_metric=metric
            )

            (
                test_thresholds, test_fnr_list, test_fpr_list, test_acc_list,
                test_best_T, test_best_fnr, test_best_fpr, test_best_acc
            ) = compute_threshold_metrics(genuine_dists, imposter_dists, num_thresholds=200)
            writer.add_scalar(f"{metric}_best_threshold_test", test_best_T)
            writer.add_scalar(f"{metric}_best_acc_test_fold", test_best_acc)
            # NOTE(review): as for the training split, only the FNR is logged
            # under this "FNR/FPR" tag.
            writer.add_scalar(f"{metric}_FNR/FPR_threshold_test", test_best_fnr)

            fig = plot_threshold_metrics_return(
                test_thresholds, test_fnr_list, test_fpr_list, test_acc_list,
                test_best_T, test_best_fnr, test_best_fpr, test_best_acc
            )
            writer.add_figure(f"threshold_metrics_{metric}_test", fig)

            test_thresholds_f1, test_f1_list, test_best_T_f1, test_best_f1 = compute_f1_vs_threshold(
                genuine_dists, imposter_dists, num_thresholds=300
            )
            writer.add_scalar(f"{metric}_best_f1_test", test_best_f1)

            fig = plot_f1_vs_threshold_return(test_thresholds_f1, test_f1_list, test_best_T_f1, test_best_f1)
            writer.add_figure(f"f1_vs_threshold_test_{metric}", fig)
    finally:
        writer.close()

RUN NUMBER: 1
Parameters:  {'augmentation': True, 'batch_size': 64, 'dropout_rate': 0.25, 'embedding_dim': 256, 'layer1_filters': 512, 'layer2_filters': 512, 'layer3_filters': 1024, 'layer4_filters': 1024, 'layer5_filters': 1500, 'layer_1_dilatation': 1, 'layer_1_stride': 1, 'layer_2_dilatation': 2, 'layer_2_stride': 1, 'layer_3_dilatation': 3, 'layer_3_stride': 2, 'margin': 0.1, 'no_norm': True, 'proxy_epochs': 10, 'proxy_learning_rate': 0.001, 'proxy_learning_rate_decay': 0.95, 'scale': 10, 'softmax_epochs': 10, 'softmax_learning_rate': 0.001, 'softmax_learning_rate_decay': 0.95, 'std': 0.02}
[Pretrain] Epoch 1/10  Loss=0.6535  Acc=0.6106
[Pretrain] Epoch 2/10  Loss=0.5370  Acc=0.7789
[Pretrain] Epoch 3/10  Loss=0.3556  Acc=1.0000
[Pretrain] Epoch 4/10  Loss=0.3087  Acc=1.0000
[Pretrain] Epoch 5/10  Loss=0.2780  Acc=1.0000
[Pretrain] Epoch 6/10  Loss=0.2512  Acc=1.0000
[Pretrain] Epoch 7/10  Loss=0.2279  Acc=1.0000
[Pretrain] Epoch 8/10  Loss=0.2080  Acc=1.0000
[Pretrain] Epoch 9/10 