In [None]:
#| default_exp cross_validation

# Validating the results

> Often overlooked, preparing a good validation pipeline is crucial to getting a good model.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| exports
import os
from glob import glob
from collections import Counter
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
from fastcore.basics import patch
from fastcore.foundation import L

from sleepstagingidal.data import *
from sleepstagingidal.dataa import *
from sleepstagingidal.dataa import swap_dict
from sleepstagingidal.feature_extraction import *

In [None]:
import matplotlib.pyplot as plt
import mne
import yasa
from rich.progress import track

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier

In [None]:
#| hide
# Folder holding the `.edf` recordings. Set the `PSG_DATA_PATH` environment variable
# to run the notebook on another machine; the original location is kept as the default.
path_data = os.environ.get("PSG_DATA_PATH", "/media/2tbraid/antonia/PSG/")

In [None]:
# `glob` returns matches in arbitrary order; sort them so that slices such as
# `path_files[:2]` always select the same recordings.
path_files = sorted(glob(os.path.join(path_data, "*.edf")))

In [None]:
# Channels used to build the data (read by `PatientFold.fit` further below).
channels = [
    "C3", "C4",
    "A1", "A2",
    "O1", "O2",
    "LOC", "ROC",
    "LAT1", "LAT2",
    "ECGL", "ECGR",
    "CHIN1", "CHIN2",
]

## Patient-Fold

Before trying a lot of different configurations for the models or different feature extraction techniques, it's crucial to set up a truthful way of knowing how these changes are affecting our results. Because of that, we're going to lay out the foundation of our validation pipeline: the Patient-Fold.

By analogy with traditional K-Fold, we are going to separate all the recordings we have and, iteratively, train with some of them while testing with a different set. This way of performing cross-validation will give us a good estimate of the inter-patient generalization capability of the model.

In [None]:
from sklearn.model_selection import KFold

In [None]:
#| export

class PatientFold():
    """Manager to perform the so-called PatientFold.

    Stores the recording paths and the fold splitter; the recordings themselves
    are loaded on demand and cached in `_patients`.
    """
    def __init__(self,
                 path_files: List[str], # Path to the `.edf` files we want to use.
                 n_splits: int, # Number of folds to use.
                 random_state: int, # Random seed for reproducibility
                 ): 
        self.path_files = path_files
        self.n_splits = n_splits
        self.random_state = random_state
        # Build the splitter from `n_splits` and `random_state`. It used to be
        # `KFold(len(path_files))`, which silently ignored both arguments.
        # `shuffle=True` is required because `KFold` rejects a `random_state`
        # when shuffling is disabled.
        self.folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        # Cache for the loaded recordings; `None` means "not loaded yet".
        self._patients = None

Loading and preprocessing the raw `.edf` files takes quite a lot of time, so it can be very convenient to separate that part from the cross-validation part. Keep in mind that we can do this without exhausting the server's memory because the loaded files themselves load the data in a lazy way. The best way to ensure that the loading and preprocessing is done only once is to use a `property`:

In [None]:
#| export

@patch(as_prop=True)
def patients(self: PatientFold):
    """Load and pre-process the `.edf` files on first access and reuse the cached result afterwards."""
    # Already loaded: hand back the cached collection.
    if self._patients is not None:
        return self._patients
    recordings = []
    for path in track(self.path_files, description="Pre-processing recordings"):
        recordings.append(read_clean_edf(path, resample=100, bandpass=(0.3, 49)))
    self._patients = L(recordings)
    return self._patients

We know that different recordings may have different encodings for the same sleep stage, so we should be unifying them before joining data from different recordings. The easiest way to do it is turning them into their human-readable representation, and encode all of them together to ensure that all of them are encoded in the same way.

In [None]:
#| exporti feature_extraction

def unify_labels(events: List[np.array], # List of events corresponding to different recordings encoded.
                 mappings: List[Dict], # List of mappings to turn the encoded labels into human-readable labels.
                 ) -> List[List[str]]: # List of labels arrays corresponding to different recordings in human-readable form.
    """Map each recording's events through `swap_dict` of its own mapping, one output entry per recording."""
    readable = []
    for recording_events, recording_mapping in zip(events, mappings):
        decoder = swap_dict(recording_mapping)
        readable.append(map_events(recording_events, decoder))
    return readable

In [None]:
#| exporti feature_extraction

def unify_labels_from_epochs(epochs: List[mne.epochs.Epochs], # List of `mne.epochs.Epochs`.
                             ) -> List[List[str]]: # List of labels arrays corresponding to different recordings in human-readable form.
    """Read `events` and `event_id` from every `Epochs` object and map the events through `swap_dict(event_id)`."""
    per_recording_events = [recording.events for recording in epochs]
    per_recording_ids = [recording.event_id for recording in epochs]
    readable = []
    for recording_events, recording_ids in zip(per_recording_events, per_recording_ids):
        readable.append(map_events(recording_events, swap_dict(recording_ids)))
    return readable

And finally, we can write a simple function that builds the appropriate input data and its labels from a set of loaded patients:

In [None]:
#| exporti feature_extraction

def get_trainable_from_patients(patients: List[mne.io.edf.edf.RawEDF], # List of loaded Raw `.edf` files.
                                channels: List[str], # Channels to be used.
                                feature_extraction_fn, # Function to be applied to the `Epochs` to extract features.
                                ) -> Tuple[np.array, np.array]: # X and Y data ready to be used to train a model.
    """
    Extract epochs and features from `patients` and concatenate all of them 
    so that the output can be used to directly train a model.

    Raises a `ValueError` when `patients` is empty, because there is nothing to concatenate.
    """
    features_all, labels_all = [], []
    for patient in track(patients, description="Building data from recordings..."):
        # `get_epochs` returns a second value that is not needed here.
        epochs, _ = get_epochs(patient, channels=channels)
        features_all.append(feature_extraction_fn(epochs))
        # Labels are stored via `swap_dict(event_id)` rather than as raw event codes.
        labels_all.append(map_events(epochs.events, swap_dict(epochs.event_id)))
    if not features_all:
        # Same exception type `np.concatenate` would raise, with an actionable message.
        raise ValueError("`patients` is empty: at least one recording is needed to build the data.")
    return np.concatenate(features_all), np.concatenate(labels_all)

We want the process to be as streamlined as possible, so we can implement a `.fit()` method to quickly perform the Patient-Fold with any estimator:

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
#|export

@patch
def fit(self: PatientFold,
        estimator, # Any object implementing a `.fit()` method to be crossvalidated. Must not be instantiated.
        **kwargs, # Key-word arguments to be passed to the estimator at instance time.
        ): # Results from the Patient-Fold.
    """
    Performs the cross-validation loop by training the `estimator` on the different folds
    and returns the results.

    The returned dict holds one entry per fold under each key: `"train"` and `"test"` are the
    values of `model.score(...)` on the training and held-out data, `"model"` is the fitted estimator.
    """
    # The manager's `random_state` is the default, but an explicit `random_state` in `kwargs`
    # takes precedence instead of raising a duplicate-keyword `TypeError`.
    estimator_kwargs = {"random_state": self.random_state, **kwargs}

    results = {"train":[], "test":[], "model":[]}
    for train_idx, test_idx in self.folds.split(self.patients):
        ## Separate according to the indexes
        train_patients = self.patients[train_idx]
        test_patients = self.patients[test_idx]
        
        ## Build data
        # NOTE(review): `channels` is read from the surrounding module/notebook namespace and the
        # feature extractor is fixed to `calculate_bandpower`; neither is configurable from here.
        X_train, Y_train = get_trainable_from_patients(train_patients, channels=channels, feature_extraction_fn=calculate_bandpower)
        X_test, Y_test = get_trainable_from_patients(test_patients, channels=channels, feature_extraction_fn=calculate_bandpower)

        ## Encode labels
        # Fit the encoder on the labels of BOTH splits: the held-out recordings can contain a
        # label that never appears in the training recordings, and an encoder fitted on
        # `Y_train` alone then fails with "y contains previously unseen labels".
        le = LabelEncoder()
        le.fit(np.concatenate([Y_train, Y_test]))
        Y_train, Y_test = le.transform(Y_train), le.transform(Y_test)
        
        ## Train the model
        model = estimator(**estimator_kwargs)
        model.fit(X_train, Y_train)

        ## Obtain the metrics of interest
        results["train"].append(model.score(X_train, Y_train))
        results["test"].append(model.score(X_test, Y_test))
        results["model"].append(model)

    return results

In [None]:
# Quick check on the first two recordings only, asking for as many folds as recordings.
pf = PatientFold(path_files=path_files[:2],
                 n_splits=len(path_files[:2]),
                 random_state=42)

In [None]:
# Cross-validate a random forest; `fit` returns the per-fold train/test scores and the fitted models.
pf.fit(RandomForestClassifier)

Output()

Output()

Output()

Output()

Output()

ValueError: y contains previously unseen labels: 'Sleep stage N3'