
Add Thielen2015 c-VEP dataset #557

Merged: 15 commits, Apr 11, 2024
5 changes: 4 additions & 1 deletion docs/source/dataset_summary.rst
@@ -100,7 +100,10 @@ DOI: https://doi.org/10.1088/1741-2552/ac38cf
:header: Dataset, #Subj, #Chan, #Classes, #Trials / class, Trials length, #Epochs / class, Sampling rate, #Sessions, Codes, Presentation rate
:class: sortable

:class:`Thielen2021`,30,8,20,5,31.5s,18900 NT / 18900 T,512Hz,1,Gold codes,60Hz
:class:`Thielen2015`,12,64,36,3,4.2s,27216 NT / 27216 T,2048Hz,1,Gold codes,120Hz
:class:`Thielen2021`,30,8,20,5,31.5s,94500 NT / 94500 T,512Hz,1,Gold codes,60Hz




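The NT/T epoch counts in these rows follow from the trial counts, trial length, and presentation rate: the modulated Gold codes are balanced, so the bits split evenly between ones (targets) and zeros (non-targets). A quick arithmetic cross-check (a sketch; `epochs_per_half` is a hypothetical helper, not part of MOABB):

```python
# Sanity-check the "#Epochs / class" column: with balanced codes, half of
# all presented bits are ones (T) and half zeros (NT), so
# NT = T = #classes * #trials/class * round(trial_len * rate) / 2.
def epochs_per_half(n_classes, trials_per_class, trial_len_s, rate_hz):
    n_trials = n_classes * trials_per_class
    bits_per_trial = round(trial_len_s * rate_hz)
    return n_trials * bits_per_trial // 2

print(epochs_per_half(36, 3, 4.2, 120))   # Thielen2015 -> 27216
print(epochs_per_half(20, 5, 31.5, 60))   # Thielen2021 -> 94500
```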
Resting States
1 change: 1 addition & 0 deletions docs/source/datasets.rst
@@ -83,6 +83,7 @@ c-VEP Datasets
:toctree: generated/
:template: class.rst

Thielen2015
Thielen2021


5 changes: 4 additions & 1 deletion docs/source/whats_new.rst
@@ -78,7 +78,8 @@ Enhancements
- Add match_all method in paradigm to support CompoundDataset evaluation with MNE epochs (:gh:`473` by `Gregoire Cattan`_)
- Automate setting of event_id in compound dataset and add `data_origin` information to the data (:gh:`475` by `Gregoire Cattan`_)
- Add possibility of not saving the model (:gh:`489` by `Igor Carrara`_)
- Add CVEP and BurstVEP dataset from Castillos from Toulouse lab (by `Seabstien Velut`_)
- Add CVEP and BurstVEP dataset from Castillos from Toulouse lab (:gh:`531` by `Seabstien Velut`_)
- Add c-VEP dataset from Thielen et al. 2015 (:gh:`557` by `Jordy Thielen`_)

Bugs
~~~~
@@ -108,6 +109,8 @@ Bugs
- Fix case when events specified via ``raw.annotations`` but no events (:gh:`491` by `Pierre Guetschel`_)
- Fix bug in downloading Shin2017A dataset (:gh:`493` by `Igor Carrara`_)
- Fix the cropped option in the dataset preprocessing (:gh:`502` by `Bruno Aristimunha`_)
- Fix bug in :func:`moabb.datasets.utils.dataset_search` with missing cvep paradigm (:gh:`557` by `Jordy Thielen`_)
- Fix mistakes in :func:`moabb.datasets.thielen2021`: wrong documentation and a hardcoded trial stim channel name (:gh:`557` by `Jordy Thielen`_)

API changes
~~~~~~~~~~~
1 change: 1 addition & 0 deletions moabb/datasets/__init__.py
@@ -66,6 +66,7 @@
from .ssvep_mamem import MAMEM1, MAMEM2, MAMEM3
from .ssvep_nakanishi import Nakanishi2015
from .ssvep_wang import Wang2016
from .thielen2015 import Thielen2015
from .thielen2021 import Thielen2021
from .upper_limb import Ofner2017
from .utils import _init_dataset_list
274 changes: 274 additions & 0 deletions moabb/datasets/thielen2015.py
@@ -0,0 +1,274 @@
import mne
import numpy as np
from mne import create_info
from mne.io import RawArray
from scipy.io import loadmat

from moabb.datasets import download as dl
from moabb.datasets.base import BaseDataset


Thielen2015_URL = "https://public.data.ru.nl/dcc/DSC_2018.00047_553_v3"

# Each session consisted of 3 runs of fixed-length trials
NR_RUNS = 3

# Each trial contained 4 cycles of a 1.05 second code
NR_CYCLES_PER_TRIAL = 4

# Codes were presented at a 120 Hz monitor refresh rate
PRESENTATION_RATE = 120


class Thielen2015(BaseDataset):
"""c-VEP dataset from Thielen et al. (2015)

Dataset [1]_ from the study on reconvolution for c-VEP [2]_.

.. admonition:: Dataset summary

============= ======= ======= ================== =============== =============== ===========
Name #Subj #Chan #Trials / class Trials length Sampling rate #Sessions
============= ======= ======= ================== =============== =============== ===========
Thielen2015 12 64 27216 NT / 27216 T 0.3s 2048Hz 1
============= ======= ======= ================== =============== =============== ===========

**Dataset description**

EEG recordings were obtained with a sampling rate of 2048 Hz, using a setup comprising 64 Ag/AgCl electrodes, and
amplified by a Biosemi ActiveTwo EEG amplifier. Electrode placement followed the international 10-10 system.

During the experimental sessions, participants actively operated a 6 x 6 visual speller brain-computer interface
(BCI) with real-time feedback, encompassing 36 distinct classes. Each cell within the symbol grid underwent
luminance modulation at full contrast, achieved through the application of pseudo-random noise-codes derived from a
set of modulated Gold codes. These binary codes have a balanced distribution of ones and zeros while adhering to a
limited run-length pattern, with a maximum run-length of 2 bits. Codes were presented at a rate of 120 Hz. Given
that one cycle of these modulated Gold codes comprises 126 bits, the duration of a complete cycle spans 1.05
seconds.

Throughout the experiment, participants underwent four distinct blocks: an initial practice block consisting of two
runs, followed by a training block of one run. Subsequently, they engaged in a copy-spelling block comprising six
runs, and finally, a free-spelling block consisting of one run. Between the training and copy-spelling block, a
classifier was calibrated using data from the training block. This calibrated classifier was then applied during
both the copy-spelling and free-spelling runs. Additionally, during calibration, the stimulation codes were
tailored and optimized specifically for each individual participant.

Among the six copy-spelling runs, there were three fixed-length runs. Trials in these runs started with a cueing
phase, where the target symbol was highlighted in a green hue for 1 second. Participants maintained their gaze
fixated on the target symbol as all symbols flashed in sync with their corresponding pseudo-random noise-codes for a
duration of 4.2 seconds (equivalent to 4 code cycles). Immediately following this stimulation, the output of the
classifier was shown by coloring the cell blue for 1 second. Each run consisted of 36 trials, presented in a
randomized order.

Here, our focus is solely on the three copy-spelling runs characterized by fixed-length trials lasting 4.2 seconds
(equivalent to four code cycles). The other three runs utilized a dynamic stopping procedure, resulting in trials of
varying durations, rendering them unsuitable for benchmarking purposes. Similarly, the practice and free-spelling
runs included dynamic stopping and are ignored in this dataset. The training dataset, comprising 36 trials, used a
different noise-code set, and is therefore also ignored in this dataset. In total, this dataset should contain 108
trials of 4.2 seconds each, with 3 repetitions for each of the 36 codes.

References
----------

.. [1] Thielen, J. (Jordy), Jason Farquhar, Desain, P.W.M. (Peter) (2023): Broad-Band Visually Evoked Potentials:
Re(con)volution in Brain-Computer Interfacing. Version 2. Radboud University. (dataset).
DOI: https://doi.org/10.34973/1ecz-1232

.. [2] Thielen, J., Van Den Broek, P., Farquhar, J., & Desain, P. (2015). Broad-Band visually evoked potentials:
re(con)volution in brain-computer interfacing. PLOS ONE, 10(7), e0133797.
DOI: https://doi.org/10.1371/journal.pone.0133797

Notes
-----

.. versionadded:: 1.0.0

"""

def __init__(self):
super().__init__(
subjects=list(range(1, 12 + 1)),
sessions_per_subject=1,
events={"1.0": 101, "0.0": 100},
code="Thielen2015",
interval=(0, 0.3),
paradigm="cvep",
doi="10.34973/1ecz-1232",
)

def _add_stim_channel_trial(
self, raw, onsets, labels, offset=200, ch_name="stim_trial"
):
"""
Add a stimulus channel with trial onsets and their labels.

Parameters
----------
raw: mne.Raw
The raw object to add the stimulus channel to.
onsets: List | np.ndarray
The onsets of the trials in sample numbers.
labels: List | np.ndarray
The labels of the trials.
offset: int (default: 200)
The integer value to start markers with. For instance, if 200, then label 0 will be marker 200, label 1
will be marker 201, etc.
ch_name: str (default: "stim_trial")
The name of the added stimulus channel.
Returns
-------
mne.Raw
The raw object with the added stimulus channel.
"""
stim_chan = np.zeros((1, len(raw)))
for onset, label in zip(onsets, labels):
stim_chan[0, onset] = offset + label
info = create_info(
ch_names=[ch_name],
ch_types=["stim"],
sfreq=raw.info["sfreq"],
verbose=False,
)
raw = raw.add_channels([RawArray(data=stim_chan, info=info, verbose=False)])
return raw

def _add_stim_channel_epoch(
self,
raw,
onsets,
labels,
codes,
presentation_rate=60,
offset=100,
ch_name="stim_epoch",
):
"""
Add a stimulus channel with epoch onsets and their labels, which are the values of the presented code for each
of the trials.

Parameters
----------
raw: mne.Raw
The raw object to add the stimulus channel to.
onsets: List | np.ndarray
The onsets of the trials in sample numbers.
labels: List | np.ndarray
The labels of the trials.
codes: np.ndarray
The codebook containing each presented code of shape (nr_bits, nr_codes), sampled at the presentation rate.
presentation_rate: int (default: 60):
The presentation rate (e.g., frame rate) at which the codes were presented in Hz.
offset: int (default: 100)
The integer value to start markers with. For instance, if 100, then label 0 will be marker 100, label 1
will be marker 101, etc.
ch_name: str (default: "stim_epoch")
The name of the added stimulus channel.
Returns
-------
mne.Raw
The raw object with the added stimulus channel.
"""
stim_chan = np.zeros((1, len(raw)))
for onset, label in zip(onsets, labels):
idx = np.round(
onset + np.arange(codes.shape[0]) / presentation_rate * raw.info["sfreq"]
).astype("int")
stim_chan[0, idx] = offset + codes[:, label]
info = create_info(
ch_names=[ch_name],
ch_types=["stim"],
sfreq=raw.info["sfreq"],
verbose=False,
)
raw = raw.add_channels([RawArray(data=stim_chan, info=info, verbose=False)])
return raw

def _get_single_subject_data(self, subject):
"""Return the data of a single subject."""
file_path_list = self.data_path(subject)

# Channels
montage = mne.channels.read_custom_montage(file_path_list[-1])

# There is only one session, each of 3 runs
sessions = {"0": {}}
for i_b in range(NR_RUNS):
# EEG
raw = mne.io.read_raw_gdf(
file_path_list[2 * i_b],
stim_channel="status",
preload=True,
verbose=False,
)

# Drop redundant ANA and EXG channels
ana = [f"ANA{1 + i}" for i in range(32)]
exg = [f"EXG{1 + i}" for i in range(8)]
raw.drop_channels(ana + exg)

# Set electrode positions
raw.set_montage(montage)

# Read info file
tmp = loadmat(file_path_list[2 * i_b + 1])

# Labels at trial level (i.e., symbols)
trial_labels = tmp["labels"].astype("uint8").flatten() - 1

# Codes (select optimized subset and layout, and repeat to trial length)
subset = (
tmp["subset"].astype("uint8").flatten() - 1
) # the optimized subset of 36 codes from a set of 65
layout = (
tmp["layout"].astype("uint8").flatten() - 1
) # the optimized position of the 36 codes in the grid
codes = tmp["codes"][:, subset[layout]]
codes = np.tile(codes, (NR_CYCLES_PER_TRIAL, 1))

# Find onsets of trials
events = mne.find_events(raw, verbose=False)
trial_onsets = events[:, 0]

# Create stim channel with trial information (i.e., symbols)
# Specifically: 200 = symbol-0, 201 = symbol-1, 202 = symbol-2, etc.
raw = self._add_stim_channel_trial(
raw, trial_onsets, trial_labels, offset=200
)

# Create stim channel with epoch information (i.e., 1 / 0, or on / off)
# Specifically: 100 = "0", 101 = "1"
raw = self._add_stim_channel_epoch(
raw, trial_onsets, trial_labels, codes, PRESENTATION_RATE, offset=100
)

# Add data as a new run
run_name = str(i_b)
sessions["0"][run_name] = raw

return sessions

def data_path(
self, subject, path=None, force_update=False, update_path=None, verbose=None
):
"""Return the data paths of a single subject."""
if subject not in self.subject_list:
raise ValueError("Invalid subject number")

sub = f"sub-{subject:02d}"
subject_paths = []
for i_b in range(NR_RUNS):
blk = f"test_sync_{1 + i_b:d}"

# EEG
url = f"{Thielen2015_URL:s}/sourcedata/{sub}/{blk}/{sub}_{blk}.gdf"
subject_paths.append(dl.data_dl(url, self.code, path, force_update, verbose))

# Labels at trial level (i.e., symbols)
url = f"{Thielen2015_URL:s}/sourcedata/{sub}/{blk}/{sub}_{blk}.mat"
subject_paths.append(dl.data_dl(url, self.code, path, force_update, verbose))

# Channel locations
url = f"{Thielen2015_URL:s}/resources/biosemi64.loc"
subject_paths.append(dl.data_dl(url, self.code, path, force_update, verbose))

return subject_paths
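The code properties stated in the `Thielen2015` docstring (cycles of 126 bits at 120 Hz lasting 1.05 s, balanced bits, run-lengths of at most 2) can be sanity-checked in a few lines. This sketch uses a toy balanced sequence, not the actual Gold codes shipped with the dataset:

```python
# Verify the docstring's code properties on a toy 126-bit balanced sequence
# (hypothetical stand-in for one cycle of a modulated Gold code).
def max_run_length(bits):
    """Length of the longest run of identical consecutive bits."""
    longest = run = 1
    for prev, cur in zip(bits, bits[1:]):
        run = run + 1 if cur == prev else 1
        longest = max(longest, run)
    return longest

toy_code = [0, 1, 1, 0, 0, 1] * 21          # 126 bits, balanced
cycle_s = len(toy_code) / 120                # presented at 120 Hz

print(len(toy_code), cycle_s)                # 126 1.05
assert max_run_length(toy_code) <= 2         # run-length limited to 2 bits
assert sum(toy_code) * 2 == len(toy_code)    # equal ones and zeros
```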
6 changes: 3 additions & 3 deletions moabb/datasets/thielen2021.py
@@ -9,7 +9,7 @@
from moabb.datasets.base import BaseDataset


Thielen2021_URL = "https://public.data.donders.ru.nl/dcc/DSC_2018.00122_448_v3"
Thielen2021_URL = "https://public.data.ru.nl/dcc/DSC_2018.00122_448_v3"

# The default electrode locations in the raw file are wrong. We used the ExG channels on the Biosemi with a custom 8
# channel set, according to an optimization as published in the following article:
@@ -80,7 +80,7 @@ class Thielen2021(BaseDataset):
============= ======= ======= ================== =============== =============== ===========
Name #Subj #Chan #Trials / class Trials length Sampling rate #Sessions
============= ======= ======= ================== =============== =============== ===========
Thielen2021 30 8 18900 NT / 18900 T 0.3s 512Hz 1
Thielen2021 30 8 94500 NT / 94500 T 0.3s 512Hz 1
============= ======= ======= ================== =============== =============== ===========

**Dataset description**
@@ -168,7 +168,7 @@ def _add_stim_channel_trial(
for onset, label in zip(onsets, labels):
stim_chan[0, onset] = offset + label
info = create_info(
ch_names=["stim_trial"],
ch_names=[ch_name],
ch_types=["stim"],
sfreq=raw.info["sfreq"],
verbose=False,
4 changes: 2 additions & 2 deletions moabb/datasets/utils.py
@@ -30,7 +30,7 @@ def dataset_search( # noqa: C901
Parameters
----------
paradigm: str | None
'imagery', 'p300', 'ssvep', None
'imagery', 'p300', 'ssvep', 'cvep', None

multi_session: bool
if True only returns datasets with more than one session per subject.
@@ -62,7 +62,7 @@
n_classes = len(events)
else:
n_classes = None
assert paradigm in ["imagery", "p300", "ssvep", None]
assert paradigm in ["imagery", "p300", "ssvep", "cvep", None]

for type_d in dataset_list:
if type_d.__name__ in deprecated_names:
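With `"cvep"` added to the accepted values, the paradigm filter in `dataset_search` behaves roughly like the following stdlib sketch (a hypothetical stand-in; the real function also filters on sessions, events, channels, and more):

```python
# Minimal sketch of the paradigm-filtering logic in dataset_search,
# now accepting "cvep" alongside the existing paradigms.
VALID_PARADIGMS = ["imagery", "p300", "ssvep", "cvep", None]

def filter_by_paradigm(datasets, paradigm):
    """Return the datasets matching the requested paradigm."""
    assert paradigm in VALID_PARADIGMS, f"unknown paradigm: {paradigm}"
    if paradigm is None:
        return list(datasets)
    return [d for d in datasets if d["paradigm"] == paradigm]

datasets = [
    {"name": "Thielen2015", "paradigm": "cvep"},
    {"name": "Thielen2021", "paradigm": "cvep"},
    {"name": "BNCI2014_001", "paradigm": "imagery"},
]
print([d["name"] for d in filter_by_paradigm(datasets, "cvep")])
# -> ['Thielen2015', 'Thielen2021']
```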