<a href="https://colab.research.google.com/github/MWalidJ/time-series-anomaly-detection/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is an example of how to load a single recording.

In [None]:
import mne
import numpy as np

# Load one EDF recording fully into memory (preload=True).
# 'your_eeg_file.edf' is a placeholder — replace it with the path to your EEG file.
raw = mne.io.read_raw_edf('your_eeg_file.edf', preload=True)
sampling_rate = raw.info['sfreq']  # samples per second, taken from the recording's info
duration = 23.6  # seconds
n_samples = int(sampling_rate * duration)  # int() drops any fractional sample

# Get the data (channels x time points)
eeg_data = raw.get_data()

# Keep every channel but only the first n_samples time points,
# i.e. the first 'sample' (first 23.6 seconds) of the recording
one_sample_data = eeg_data[:, :n_samples]


Now to work on all data.

This is an example of the following steps:

1. Load the data
2. Preprocessing
3. Segmenting

In [None]:
import mne
import numpy as np
import os

def preprocess_eeg(file_path, selected_channels=None, sfreq=256, l_freq=1.0, h_freq=40.0):
    """
    Read one EDF recording and return it filtered, resampled and z-scored.

    Args:
        file_path: location of the .edf EEG file to read
        selected_channels: channel names to keep; None keeps every channel
        sfreq: sampling frequency the recording is resampled to
        l_freq: lower cut-off of the band-pass filter
        h_freq: upper cut-off of the band-pass filter

    Returns:
        np.array of shape (channels, samples) in which every channel has been
        standardized with its own mean and standard deviation
    """
    recording = mne.io.read_raw_edf(file_path, preload=True)

    # Channel selection is skipped entirely when no list is supplied.
    if selected_channels is not None:
        recording.pick_channels(selected_channels)

    # Band-pass first, then resample to the target rate.
    recording.filter(l_freq=l_freq, h_freq=h_freq)
    recording.resample(sfreq)

    signals = recording.get_data()  # [channels, samples]

    # Per-channel z-score; the small constant guards against a zero
    # standard deviation on a flat channel.
    channel_mean = signals.mean(axis=1, keepdims=True)
    channel_std = signals.std(axis=1, keepdims=True)
    return (signals - channel_mean) / (channel_std + 1e-8)

def segment_eeg(data, sfreq=256, window_sec=2, overlap=0.5):
    """
    Segment preprocessed EEG into overlapping windows.

    Args:
        data: np.array of shape (channels, samples)
        sfreq: sampling frequency
        window_sec: window size in seconds
        overlap: fraction of overlap between consecutive windows

    Returns:
        segments: np.array of shape (num_windows, channels, window_size).
        If the recording is shorter than a single window, an empty array of
        shape (0, channels, window_size) is returned instead of failing.

    Raises:
        ValueError: if the window is shorter than one sample, or if the
            overlap leaves a step of less than one sample (e.g. overlap >= 1).
    """
    n_channels, n_samples = data.shape

    window_size = int(window_sec * sfreq)
    if window_size < 1:
        raise ValueError(
            f"window_sec * sfreq must be at least one sample, got {window_size}"
        )

    # The tiny epsilon absorbs floating-point error before truncation:
    # 10 * (1 - 0.8) evaluates to 1.9999999999999996 and would otherwise
    # truncate to a step of 1 instead of the intended 2.
    step_size = int(window_size * (1 - overlap) + 1e-9)
    if step_size < 1:
        raise ValueError(
            f"overlap={overlap} leaves a step of {step_size} samples; "
            "the step between windows must be at least one sample"
        )

    starts = range(0, n_samples - window_size + 1, step_size)
    if len(starts) == 0:
        # Recording shorter than one window: np.stack would raise on an
        # empty list, so return a correctly shaped empty result.
        return np.empty((0, n_channels, window_size), dtype=data.dtype)

    return np.stack([data[:, start:start + window_size] for start in starts], axis=0)

# Example Usage:

if __name__ == "__main__":
    # Resampling / segmentation settings for this demo run
    target_sfreq = 256
    window_sec = 2
    overlap = 0.5

    # Recording to process and the channels to keep from it
    filepath = 'path/to/your/eeg_file.edf'  # <-- Update this
    selected_channels = [
        'Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8',
        'T3', 'C3', 'Cz', 'C4', 'T4',
        'T5', 'P3', 'Pz', 'P4', 'T6',
        'O1', 'O2',
    ]

    # Step 1: load the file and preprocess it
    data = preprocess_eeg(
        file_path=filepath,
        selected_channels=selected_channels,
        sfreq=target_sfreq,
    )

    # Step 2: cut the preprocessed signal into windows
    segments = segment_eeg(
        data,
        sfreq=target_sfreq,
        window_sec=window_sec,
        overlap=overlap,
    )

    print(f"Final segmented data shape: {segments.shape}")
    # Example output: (num_windows, 19, 512)

Now, the questions that need to be answered:

1.   Do we need to select channels and drop out others?
 (Not an answer, but we should note that channel naming may differ between recordings, that not every channel is available for every subject, and that some channels may even be corrupted.)
2.   Do we need to filter frequency?
 (For this we might need to do frequency analysis and time-frequency spectral analysis.)
3.   What is the sample rate? (I have read that the standard is 256, but we have to look more closely at the papers to see which rate they used and how they selected it.)
4.   What should the overlap between windows be? Maybe we can start with 0.5?
5.   Select the window duration (1, 2, 3, 4, or 5 seconds?). Timesteps (samples) per window = window length in seconds * sample rate in Hz.
6.   
Walid, please add more if you have any other questions in mind.


Now that we have preprocessed the data, we need a dataloader that provides batching, shuffling, and parallel loading for the dataset in order to feed the model.

I have read that for large datasets such as ours, typical data loading reads all files at once, which can overwhelm the RAM and crash the process.

Thus, a technique called "lazy loading" is used to load only the data that is actually requested (loading on demand).


In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class EEGWindowLazyDataset(Dataset):
    """
    Lazy-loading EEG dataset that serves one window at a time from .npy files.

    At construction only the first dimension of every file is read, to build a
    flat index of (file_path, segment_idx) pairs. The window data itself is
    read from disk only when an item is requested.
    """

    def __init__(self, data_folder, window_shape, transform=None):
        """
        Index every segment of every .npy file found in data_folder.

        Args:
            data_folder: Path to folder containing .npy segment files; each
                file is indexed along its first axis
            window_shape: (channels, samples); stored on the instance but not
                checked against the files
            transform: Optional callable applied to each window before it is
                converted to a tensor
        """
        self.data_folder = data_folder
        self.window_shape = window_shape
        self.transform = transform

        # Index all segments individually, not per file
        self.index = []

        # os.listdir returns entries in arbitrary order; sorting makes the
        # index (and therefore un-shuffled iteration) reproducible.
        for file_name in sorted(os.listdir(data_folder)):
            if not file_name.endswith('.npy'):
                continue
            file_path = os.path.join(data_folder, file_name)
            # mmap_mode --> only the header is needed here, do NOT fully load!
            num_segments = np.load(file_path, mmap_mode='r').shape[0]
            self.index.extend((file_path, i) for i in range(num_segments))

    def __len__(self):
        """Return the total number of windows across all indexed files."""
        return len(self.index)

    def __getitem__(self, idx):
        """
        Load a single window and return it as a float32 tensor.

        Args:
            idx: position in the flat segment index

        Returns:
            torch.Tensor with the window's two axes swapped, i.e.
            [samples, channels] for a window stored as (channels, samples)
        """
        file_path, segment_idx = self.index[idx]

        # Map the file and copy out just the needed window. The copy turns the
        # read-only memory-mapped view into an ordinary writable array, so
        # in-place transforms work and the tensor does not depend on the map.
        segments = np.load(file_path, mmap_mode='r')
        window = np.array(segments[segment_idx])  # shape: (channels, samples)

        if self.transform is not None:
            window = self.transform(window)

        # as_tensor accepts both arrays and tensors, so a transform that
        # already returns a tensor is not wrapped a second time.
        window = torch.as_tensor(window, dtype=torch.float32)
        return window.permute(1, 0)  # Now [samples, channels] for xLSTM

def create_lazy_eeg_dataloader(data_folder, window_shape, batch_size=32, shuffle=True, num_workers=4, transform=None):
    """
    Build a DataLoader over the lazily loaded EEG windows in data_folder.

    Args:
        data_folder: Path to folder containing .npy segment files
        window_shape: (channels, samples), forwarded to the dataset
        batch_size: number of windows per batch
        shuffle: whether to draw windows in random order
        num_workers: number of worker processes used for loading
        transform: Optional callable applied to every window by the dataset

    Returns:
        torch.utils.data.DataLoader wrapping an EEGWindowLazyDataset
    """
    dataset = EEGWindowLazyDataset(data_folder, window_shape, transform=transform)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies, so request it
        # only when a CUDA device is actually present.
        pin_memory=torch.cuda.is_available(),
    )
