In [None]:
#Important functions you will need

def get_info_with_mne(file_path):
    """ Read header information from an edf file without loading the data.

    Loading the data is done in multiprocessing since it takes some time. The info is read beforehand because
    some files had corrupted headers or weird sampling frequencies that caused the multiprocessing workers to
    crash, so e.g. sampling frequency and duration are checked here first.

    :param file_path: path of the recording file
    :return: tuple (recording object returned by mne.io.read_raw_edf, sampling frequency, number of samples,
        number of signals, signal names, duration of the recording).
        If reading the file raises a ValueError, all six entries are None.
        If the sampling frequency is below 10 (also after the double check, or when the double check is
        impossible because the recording has fewer than two samples), only the sampling frequency entry is
        set and the other five entries are None.
    """
    try:
        edf_file = mne.io.read_raw_edf(file_path, verbose='error')
    except ValueError:
        # unreadable header; a header-repair retry (fix_header) was sketched here once but is disabled
        return None, None, None, None, None, None

    # some recordings have a very weird sampling frequency. check twice before skipping the file
    sampling_frequency = int(edf_file.info['sfreq'])
    if sampling_frequency < 10:
        time_stamps = edf_file.times
        # the double check needs two time stamps; without them the header value is all we have
        if len(time_stamps) < 2:
            return None, sampling_frequency, None, None, None, None
        sampling_frequency = 1 / (time_stamps[1] - time_stamps[0])
        if sampling_frequency < 10:
            return None, sampling_frequency, None, None, None, None

    n_samples = edf_file.n_times
    signal_names = edf_file.ch_names
    n_signals = len(signal_names)
    # keep the divisor at 1 or above as a guard against division by zero
    duration = n_samples / max(sampling_frequency, 1)

    # TODO: return rec object?
    return edf_file, sampling_frequency, n_samples, n_signals, signal_names, duration

def preprocess(self, cmd_args):
    """ Runs feature extraction over every input directory and writes one .hdf5 feature file per directory.

    For each directory the recordings are listed, handed to the input queue, processed through
    spawn_start_join_processes and the collected results are stacked and written out. Class and recording
    counters in self.stats are updated along the way.
    :param cmd_args: argument object; input, output and subset are read here, the object itself is passed on
    to the helper calls
    :return: list with the written feature file names (one per input dir), self.window_counts, self.stats
    """
    self.init_processing_units(cmd_args)

    # prepare the output location and store the feature labels
    my_io.check_out(cmd_args.output, cmd_args.input)
    feature_labels = self.feature_generator.get_feature_labels()
    my_io.write_feature_labels(cmd_args.output, feature_labels)

    # both queues come from the same multiprocessing manager
    queue_manager = mp.Manager()
    in_q = queue_manager.Queue()
    out_q = queue_manager.Queue()

    # one hdf5 file name per input directory, in input order
    hdf5_names = []
    for dir_index, in_dir in enumerate(cmd_args.input):
        self.check_path(in_dir)
        self.stats.n_classes += 1
        self.recording_names[in_dir] = []

        recordings = self.read_files(in_dir, cmd_args.subset)
        self.stats.n_recordings += len(recordings)

        in_q, queued_count = self.add_recording_to_queue(in_dir, recordings, in_q)
        self.spawn_start_join_processes(cmd_args, in_q, out_q)

        stacked = np.vstack(self.catch_results(dir_index, in_dir, out_q, queued_count))
        hdf5_names.append(my_io.write_hdf5(stacked, in_dir, cmd_args))

    # TODO: add this
    my_io.write_recording_names(cmd_args.output, cmd_args.input, self.recording_names)

    return hdf5_names, self.window_counts, self.stats

In [None]:
# NOTE(review): no cell in the visible notebook assigns `dataFromEdf` at top level, so this
# presumably raises NameError on a fresh kernel — confirm whether this scratch cell can be removed.
dataFromEdf[0]

In [1]:
%matplotlib qt
%matplotlib inline

#remember to do "conda activate mne" before launching the jupyter notebook
from functools import partial
import multiprocessing as mp
import numpy as np
import pandas as pd
import logging
import os
import mne

# Root folder of the local data checkout; the paths below are built by appending to it.
# Set the TUH_EEG_ROOT environment variable to use a different location without editing the notebook.
# os.path.join(..., "") guarantees a trailing separator so plain string concatenation keeps working.
prePath = os.path.join(
    os.environ.get("TUH_EEG_ROOT", "/Users/tinaraissi/workspace/EEG/tuh-eeg-auto-diagnosis/"),
    "",
)


In [2]:
def getDataframe(filename):
    """Read an edf file and return the recording together with its samples as a DataFrame.

    :param filename: path of the .edf file to read
    :return: tuple (edfData, dataset): the object returned by mne.io.read_raw_edf and a DataFrame of shape
        (n_samples, n_channels) holding the time samples of each electrode in its own column, with columns
        named and ordered like edfData.ch_names
    """
    edfData = mne.io.read_raw_edf(filename)

    # get_data() has one row per channel; transpose so every channel becomes a column.
    # Building the frame positionally keeps each channel's own samples even if two channels share a name.
    channelMajor = edfData.get_data()
    dataset = pd.DataFrame(channelMajor.T, columns=edfData.ch_names)

    return edfData, dataset

In [3]:
#File formats in the relative folder
#*.edf:    the EEG sampled data in European Data Format (edf)
#*.txt:    the EEG report corresponding to the patient and session
#*.tse:    term-based annotations using all available seizure type classes
#*.tse_bi: same as *.tse except bi-class annotations (seizure/background) 
#*.lbl:    event-based annotations using all available seizure type classes
#*.lbl_bi: same as *.lbl except bi-class annotations (seizure/background)

# Load a single training recording (path is relative to prePath) as raw object plus DataFrame.
filename = f"{prePath}v1.4.0/edf/train/02_tcp_le/001/00000143/s001_2003_03_10/00000143_s001_t001.edf"
edfData, dataset = getDataframe(filename)


Extracting EDF parameters from /Users/tinaraissi/workspace/EEG/tuh-eeg-auto-diagnosis/v1.4.0/edf/train/02_tcp_le/001/00000143/s001_2003_03_10/00000143_s001_t001.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


In [8]:
# Term-based annotation file (*.tse) that sits next to the recording loaded above.
labelFilename = f"{prePath}v1.4.0/edf/train/02_tcp_le/001/00000143/s001_2003_03_10/00000143_s001_t001.tse"

In [7]:
def init_processing_units(self, cmd_args):
    """ Creates the cleaner, splitter and feature generator and stores them on self.

    Takes self and cmd_args explicitly because the body reads both and preprocess calls it as
    self.init_processing_units(cmd_args).
    :param cmd_args: argument object; elecs, window, windowsize, overlap, domain, bands and perrec are read
    :return: None, the units are stored as self.cleaner, self.splitter and self.feature_generator
    """
    self.cleaner = data_cleaner.DataCleaner(elecs=cmd_args.elecs)
    self.splitter = data_splitter.DataSplitter(window=cmd_args.window, window_size_sec=cmd_args.windowsize,
                                               overlap=cmd_args.overlap)
    # the feature generator gets its electrodes from the cleaner, so the cleaner has to exist first
    self.feature_generator = feature_generator.FeatureGenerator(domain=cmd_args.domain, bands=cmd_args.bands,
                                                                window_size_sec=cmd_args.windowsize,
                                                                overlap=cmd_args.overlap, perrec=cmd_args.perrec,
                                                                electrodes=self.cleaner.get_electrodes())

In [6]:
# Scratch check: displays the title-cased form of the path string; the result is not assigned.
filename.title()

'/Users/Tinaraissi/Workspace/Eeg/Tuh-Eeg-Auto-Diagnosis/V1.4.0/Edf/Train/02_Tcp_Le/001/00000143/S001_2003_03_10/00000143_S001_T001.Edf'

In [11]:
# Show the annotation file line by line, each line split on whitespace into its tokens.
with open(labelFilename) as label_file:
    for raw_line in label_file:
        tokens = raw_line.split()
        print(tokens)

['version', '=', 'tse_v1.0.0']
[]
['0.0000', '1279.0000', 'bckg', '1.0000']
