In [1]:
#Important functions you will need
def getLabelAndTimeStartAndEnd(filename):
    """Collect the annotation rows of a whitespace-separated label file.

    Only lines that split into exactly four whitespace-separated tokens are
    kept; header lines and blank lines are therefore skipped. The fourth
    token of each kept line is discarded.

    :param filename: path of the annotation file to read (e.g. a ``.tse`` file)
    :return: list of three-element string lists, one per kept line, in file
        order (for ``.tse`` rows: start time, end time, label)
    """
    with open(filename) as handle:
        tokenized = [row.split() for row in handle]
    return [tokens[:-1] for tokens in tokenized if len(tokens) == 4]

def get_info_with_mne(file_path):
    """Read header info of an EDF recording and its ``.tse`` labels without loading the signal data.

    Loading the data is done in multiprocessing since it takes some time.
    Getting the info is done beforehand because some files had corrupted
    headers or weird sampling frequencies that caused the multiprocessing
    workers to crash; the sampling frequency is therefore checked here.

    Every return path yields a 5-tuple so callers can always unpack the
    result the same way.

    :param file_path: path of the recording WITHOUT extension; ``".edf"`` and
        ``".tse"`` are appended to locate the signal and annotation files
    :return: ``(raw, sampling_frequency, n_samples, channel_names, label_list)``.
        ``(None, None, None, None, None)`` if reading raised ``ValueError``;
        ``(None, sampling_frequency, None, None, None)`` if the sampling
        frequency is below 10 even after re-deriving it from the time axis.
        Remember that duration = n_samples / sampling_frequency.
    """
    try:
        f = mne.io.read_raw_edf(file_path + ".edf", verbose='error')
        labelList = getLabelAndTimeStartAndEnd(file_path + ".tse")
    except ValueError:
        return None, None, None, None, None

    samplingFrequency = int(f.info['sfreq'])
    if samplingFrequency < 10:
        # The header value looks implausible: fall back to the spacing of the
        # first two time stamps before giving up on the file.
        samplingFrequency = 1 / (f.times[1] - f.times[0])
        if samplingFrequency < 10:
            return None, samplingFrequency, None, None, None

    return f, samplingFrequency, f.n_times, f.ch_names, labelList





In [57]:
class Metadata(object):
    """Accumulates properties that must be identical for every item added.

    The first value handed to each ``add_*`` method is stored; each later
    call asserts that the new value equals the stored one, so an
    ``AssertionError`` signals inconsistent inputs. Sequences are simply
    collected in a list.
    """

    def __init__(self):
        self.shape = None
        self.data_length_sec = None
        self.sampling_frequency = None
        self.channels = None
        self.sequences = []

    def add_shape(self, shape):
        """Store ``shape`` on first call; afterwards assert it is unchanged."""
        if self.shape is None:
            self.shape = shape
        else:
            assert shape == self.shape

    def add_data_length_sec(self, data_length_sec):
        """Store ``data_length_sec`` on first call; afterwards assert it is unchanged."""
        if self.data_length_sec is None:
            self.data_length_sec = data_length_sec
        else:
            assert data_length_sec == self.data_length_sec

    def add_sampling_frequency(self, sampling_frequency):
        """Store ``sampling_frequency`` on first call; afterwards assert it is unchanged."""
        if self.sampling_frequency is None:
            self.sampling_frequency = sampling_frequency
        else:
            assert sampling_frequency == self.sampling_frequency

    def add_channels(self, channels):
        """Store ``channels`` on first call; afterwards assert it is unchanged.

        ``np.all`` is used so the comparison works both when ``==`` yields a
        single bool (lists) and when it yields an element-wise array.
        (``np.alltrue`` no longer exists in NumPy 2.0.)
        """
        if self.channels is None:
            self.channels = channels
        else:
            assert np.all(channels == self.channels)

    def add_sequence(self, sequence):
        """Append ``sequence`` to ``self.sequences`` unless it is ``None``."""
        if sequence is not None:
            self.sequences.append(sequence)


In [None]:
class Windower:
    """
    Breaks the time-series data into N second segments, for example 60s windows
    will create 10 windows given a 600s segment. The windows are stacked on a
    new leading axis, e.g. (600, 120000) -> (10, 600, 12000).

    With ``window_secs=None`` the data is kept whole and only gains a leading
    axis of length 1.
    """
    def __init__(self, window_secs=None):
        self.window_secs = window_secs
        self.name = 'w-%ds' % window_secs if window_secs is not None else 'w-whole'

    def get_name(self):
        """Return the short identifier of this transform (e.g. ``'w-60s'``)."""
        return self.name

    def apply(self, X, meta=None):
        """Split ``X`` along its last axis into equally sized windows.

        :param X: numpy array whose last axis is time (samples)
        :param meta: object providing ``data_length_sec`` and
            ``sampling_frequency``; only needed when ``window_secs`` is set
        :return: array of shape ``(num_windows,) + X.shape[:-1] + (samples_per_window,)``
        :raises ValueError: if the recording is shorter than one window
        """
        if self.window_secs is None:
            return X.reshape([1] + list(X.shape))

        # Floor division keeps these integers under Python 3; true division
        # produced floats that are not valid slice bounds.
        num_windows = int(meta.data_length_sec // self.window_secs)
        if num_windows < 1:
            raise ValueError(
                'recording of %ss is shorter than one %ss window'
                % (meta.data_length_sec, self.window_secs))
        samples_per_window = int(self.window_secs * int(meta.sampling_frequency))
        samples_used = num_windows * samples_per_window
        # Samples that do not fill a whole window are dropped from the START
        # of the recording.
        samples_dropped = X.shape[-1] - samples_used
        X = X[..., samples_dropped:]
        return np.stack(np.split(X, num_windows, axis=X.ndim - 1))


In [None]:
def to_np_array(X):
    """Convert a sequence into a single numpy array.

    When the first element is already an ndarray, the result is preallocated
    with that element's shape and dtype and filled item by item, giving an
    array of shape ``(len(X),) + X[0].shape``. Any other input is passed
    straight to ``np.array``.
    """
    first = X[0]
    if not isinstance(first, np.ndarray):
        return np.array(X)

    stacked = np.empty((len(X),) + tuple(first.shape), dtype=first.dtype)
    for position, item in enumerate(X):
        stacked[position] = item
    return stacked

In [None]:
class Slice:
    """
    Take a slice of the data on the last axis.
    e.g. Slice(1, 48) works like a normal python slice, that is 1-47 will be taken
    """
    def __init__(self, start, end=None):
        self.start = start
        self.end = end

    def get_name(self):
        """Return an identifier such as ``'slice1-48'`` (or ``'slice1'`` when open-ended)."""
        return "slice%d%s" % (self.start, '-%d' % self.end if self.end is not None else '')

    def apply(self, data, meta=None):
        """Return ``data`` restricted to ``[start:end]`` along its last axis.

        :param data: numpy array with at least one dimension
        :param meta: unused; accepted so the call matches the other transforms
        :return: the sliced array (all leading axes kept in full)
        """
        # Ellipsis indexing selects every leading axis in full. The previous
        # list-of-slices index is rejected by current NumPy versions.
        return data[..., self.start:self.end]

In [None]:
# NOTE(review): in the visible notebook `dataFromEdf` is only assigned as a local
# variable inside getDataframe(); on a fresh kernel this cell presumably raises
# NameError — confirm whether it should be removed.
dataFromEdf[0]

In [2]:
%matplotlib qt
%matplotlib inline

#remember to do "conda activate mne" before launching the jupyter notebook
from functools import partial
import multiprocessing as mp
import numpy as np
import pandas as pd
import logging
import os
import mne

# Absolute, machine-specific root of the dataset checkout. In the visible cells it
# is only referenced by a commented-out `filename` assignment; the active paths
# are relative to the working directory.
prePath = "/Users/tinaraissi/workspace/EEG/tuh-eeg-auto-diagnosis/"


In [3]:
def getDataframe(filename):
    """Read an EDF file and return it together with a samples-by-channels DataFrame.

    :param filename: path of the ``.edf`` file to read
    :return: ``(edfData, dataset)`` where ``edfData`` is the object returned
        by ``mne.io.read_raw_edf`` and ``dataset`` is a DataFrame with one
        row per time sample and one column per channel (n_samples x n_channels)
    """
    edfData = mne.io.read_raw_edf(filename)
    # get_data() is indexed channel-first, so transpose to put each
    # electrode's time samples into its own column in a single step.
    dataFromEdf = edfData.get_data()
    dataset = pd.DataFrame(dataFromEdf.T,
                           index=range(edfData.n_times),
                           columns=edfData.ch_names)

    return edfData, dataset

In [4]:
#File formats in the relative folder
#*.edf:    the EEG sampled data in European Data Format (edf)
#*.txt:    the EEG report corresponding to the patient and session
#*.tse:    term-based annotations using all available seizure type classes
#*.tse_bi: same as *.tse except bi-class annotations (seizure/background) 
#*.lbl:    event-based annotations using all available seizure type classes
#*.lbl_bi: same as *.lbl except bi-class annotations (seizure/background)

# Absolute-path variant of the same recording, kept for reference:
#filename = prePath+"v1.4.0/edf/train/02_tcp_le/001/00000143/s001_2003_03_10/00000143_s001_t001.edf"
# Relative path: resolved against the notebook's working directory.
filename = "v1.4.0/edf/train/02_tcp_le/001/00000143/s001_2003_03_10/00000143_s001_t001.edf"
# getDataframe returns the MNE raw object and the samples-by-channels DataFrame.
edfData, dataset =  getDataframe(filename)


Extracting EDF parameters from v1.4.0/edf/train/02_tcp_le/001/00000143/s001_2003_03_10/00000143_s001_t001.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


In [5]:
# Term-based annotation (.tse) file with the same base name as `filename` above.
labelFilename = "v1.4.0/edf/train/02_tcp_le/001/00000143/s001_2003_03_10/00000143_s001_t001.tse"

In [6]:
def init_processing_units():
    """Create the cleaner, splitter and feature generator and store them on ``self``.

    NOTE(review): this function takes no parameters yet assigns to ``self``, and
    ``data_cleaner``, ``data_splitter``, ``feature_generator`` and ``cmd_args`` are
    not defined anywhere in the visible notebook. It looks like a method pasted
    from another code base; calling it as-is presumably raises NameError — confirm
    the intended signature and imports before use.
    """
    self.cleaner = data_cleaner.DataCleaner(elecs=cmd_args.elecs)
    self.splitter = data_splitter.DataSplitter(window=cmd_args.window, window_size_sec=cmd_args.windowsize,
                                               overlap=cmd_args.overlap)
    # The feature generator receives the electrode list from the cleaner built above.
    self.feature_generator = feature_generator.FeatureGenerator(domain=cmd_args.domain, bands=cmd_args.bands,
                                                                window_size_sec=cmd_args.windowsize,
                                                                overlap=cmd_args.overlap, perrec=cmd_args.perrec,
                                                                electrodes=self.cleaner.get_electrodes())

In [7]:
# Exploratory only: str.title() capitalises each word of the path string; the
# result is displayed, not stored.
filename.title()

'V1.4.0/Edf/Train/02_Tcp_Le/001/00000143/S001_2003_03_10/00000143_S001_T001.Edf'

['version', '=', 'tse_v1.0.0']
[]
['0.0000', '1279.0000', 'bckg', '1.0000']


In [7]:
rootdir = "v1.4.0/edf/train/02_tcp_le"


def collectEdfBasePaths(root):
    """Map every ``.edf`` recording below ``root`` to its extension-less path.

    The key is the file's base name without ``.edf`` (e.g.
    ``00000143_s001_t001``) and the value is the full path without the
    extension, so that ``value + ".edf"`` / ``value + ".tse"`` locate the
    signal and annotation files. The key is derived with ``os.path`` helpers
    instead of a fixed character offset, so it no longer depends on the
    length of the directory part of the path.

    :param root: directory to walk recursively
    :return: dict of base name -> path without extension
    """
    basePaths = {}
    for subdir, _dirs, files in os.walk(root):
        for name in files:
            stem, extension = os.path.splitext(name)
            if extension == ".edf":
                basePaths[stem] = os.path.join(subdir, stem)
    return basePaths


segLabelFilenames = collectEdfBasePaths(rootdir)
        

In [9]:
# Pick one recording to inspect: the key at position 100 of segLabelFilenames.
# Raises IndexError if fewer than 101 recordings were collected.
p = list(segLabelFilenames.keys())[100]

In [10]:
# On success the tuple is (edf_file, sampling_frequency, n_samples, signal_names, label_list).
f = get_info_with_mne(segLabelFilenames[p])

In [11]:
# Display the tuple returned by get_info_with_mne for the selected recording.
f

(<RawEDF  |  00000002_s005_t001.edf, n_channels x n_times : 33 x 88000 (352.0 sec), ~67 kB, data not loaded>,
 250,
 88000,
 ['EEG FP1-LE',
  'EEG FP2-LE',
  'EEG F3-LE',
  'EEG F4-LE',
  'EEG C3-LE',
  'EEG C4-LE',
  'EEG A1-LE',
  'EEG A2-LE',
  'EEG P3-LE',
  'EEG P4-LE',
  'EEG O1-LE',
  'EEG O2-LE',
  'EEG F7-LE',
  'EEG F8-LE',
  'EEG T3-LE',
  'EEG T4-LE',
  'EEG T5-LE',
  'EEG T6-LE',
  'EEG FZ-LE',
  'EEG CZ-LE',
  'EEG PZ-LE',
  'EEG OZ-LE',
  'EEG PG1-LE',
  'EEG PG2-LE',
  'EEG EKG-LE',
  'EEG SP2-LE',
  'EEG SP1-LE',
  'EEG RLC-LE',
  'EEG LUC-LE',
  'EEG 30-LE',
  'EEG T1-LE',
  'EEG T2-LE',
  'STI 014'],
 [['0.0000', '352.0000', 'bckg']])

In [13]:
# This bare lookup is not the last expression of the cell, so its value is not displayed.
segLabelFilenames[p]
# Re-read the header info and labels for the recording keyed by `p`.
f = get_info_with_mne(segLabelFilenames[p])

In [16]:
# Display the tuple obtained from the repeated get_info_with_mne call.
f

(<RawEDF  |  00000002_s005_t001.edf, n_channels x n_times : 33 x 88000 (352.0 sec), ~67 kB, data not loaded>,
 250,
 88000,
 ['EEG FP1-LE',
  'EEG FP2-LE',
  'EEG F3-LE',
  'EEG F4-LE',
  'EEG C3-LE',
  'EEG C4-LE',
  'EEG A1-LE',
  'EEG A2-LE',
  'EEG P3-LE',
  'EEG P4-LE',
  'EEG O1-LE',
  'EEG O2-LE',
  'EEG F7-LE',
  'EEG F8-LE',
  'EEG T3-LE',
  'EEG T4-LE',
  'EEG T5-LE',
  'EEG T6-LE',
  'EEG FZ-LE',
  'EEG CZ-LE',
  'EEG PZ-LE',
  'EEG OZ-LE',
  'EEG PG1-LE',
  'EEG PG2-LE',
  'EEG EKG-LE',
  'EEG SP2-LE',
  'EEG SP1-LE',
  'EEG RLC-LE',
  'EEG LUC-LE',
  'EEG 30-LE',
  'EEG T1-LE',
  'EEG T2-LE',
  'STI 014'],
 [['0.0000', '352.0000', 'bckg']])

In [58]:
# Show the [start, end, label] rows parsed from the .tse file of the recording keyed by `p`.
getLabelAndTimeStartAndEnd(segLabelFilenames[p]+".tse")

[['0.0000', '1273.4520', 'bckg'],
 ['1273.4520', '1319.1880', 'cpsz'],
 ['1319.1880', '1992.0000', 'bckg']]

In [17]:
# Sanity check: duration in seconds = n_samples / sampling_frequency.
88000/250

352.0