# Imports

In [1]:
import pyedflib
import numpy as np
from pylab import *
from scipy import signal
import pandas as pd
import seaborn
import glob, os
from collections import OrderedDict
from scipy.stats import *
import csv
from statsmodels.tsa.holtwinters import SimpleExpSmoothing, Holt
import skimage

In [2]:
# Directory the notebook was started from; later cells chdir into the database
# folders and use this value to come back.
basedir = os.getcwd()
# Folder where the generated feature files are written.  Raw string: '\D' is
# not a valid escape sequence and triggers a SyntaxWarning on recent Pythons.
datasetdir = basedir + r'\DatasetsSinExp'
# Put here the directory of the CHBMIT DATABASE
dbdir = r"F:\Master\TFM\chb-mit-scalp-eeg-database-1.0.0"

In [3]:
# Display the working directory captured above (sanity check).
basedir

'C:\\Users\\Mario\\Documents\\Master\\TFM\\Codigo\\Jupiter\\EEG_ML\\DataSetCreation'

# Functions

### Reading functions

In [4]:
class Register:
    """Metadata of one recording plus the seizures annotated in it.

    Attributes:
        name: identifier given by the caller (the record's file name when
            built by ``read_annotations``).
        fs: sampling rate passed by the caller.
        nseizures: number of seizures announced for the record.
        seizures: list of ``[start, end]`` pairs, in insertion order.
        channels: empty list; never filled by this class.
        ictaltime: running sum of ``end - start`` over the added seizures.
    """

    def __init__(self, name, fs, nseizures):
        self.name, self.fs, self.nseizures = name, fs, nseizures
        self.seizures = []
        self.channels = []
        self.ictaltime = 0

    def addSeizure(self, start, end):
        """Record one seizure interval and add its length to ``ictaltime``."""
        duration = end - start
        self.ictaltime = self.ictaltime + duration
        self.seizures.append([start, end])
        

def read_data(filename, channels=None):
    """Read the requested channels of an EDF file into a 2-D array.

    Parameters:
        filename: path of the EDF file, opened with ``pyedflib.EdfReader``.
        channels: iterable of signal labels to read, in output-row order.
            ``None`` or an empty collection selects every label in the file.

    Returns:
        (data, fs, time) where ``data`` has one row per requested channel and
        as many columns as the first signal has samples, ``fs`` is the
        sampling frequency of the first signal, and ``time`` holds
        ``i / fs`` for every sample index ``i``.

    Raises:
        ValueError: if a requested label is not present in the file.
    """
    f = pyedflib.EdfReader(filename)
    try:
        channel_names = f.getSignalLabels()

        # if no channels are passed to the function, read all of them
        if channels is None or len(channels) == 0:
            channels = channel_names
        else:
            # materialise views such as dict.values() so they can be re-iterated
            channels = list(channels)

        fs = f.getSampleFrequencies()

        # every row is sized after the first signal
        # NOTE(review): assumes all requested signals have that many samples
        data = np.zeros((len(channels), f.getNSamples()[0]))
        for i, channel in enumerate(channels):
            data[i, :] = f.readSignal(channel_names.index(channel))
    finally:
        # release the file handle even when a label lookup or a read fails
        f._close()

    # endpoint=False keeps the sample spacing at exactly 1/fs
    time = np.linspace(0, data.shape[1]/fs[0], data.shape[1], endpoint=False)
    return data, fs[0], time

def trunc(data, timeW, fs):
    """Cut ``data`` to a whole number of windows of ``timeW`` seconds.

    Parameters:
        data: 2-D array whose second axis is the sample axis.
        timeW: window length in seconds.
        fs: sampling frequency in Hz (int or float).

    Returns:
        (data, time, nw, N): the truncated array (its first ``nw*N`` columns),
        a time axis with ``i / fs`` per remaining sample, the number of whole
        windows ``nw`` and the samples per window ``N`` (both plain ints).

    Raises:
        ValueError: if ``timeW * fs`` rounds to less than one sample.
    """
    # Honour the caller's window length and force an integer sample count so
    # it can be used for slicing and reshaping even when fs is a float.
    N = int(round(timeW*fs))
    if N <= 0:
        raise ValueError("timeW*fs must be at least one sample")

    samples = data.shape[1]
    nw = int(samples//N)

    data = data[:, 0:nw*N]
    # endpoint=False keeps the sample spacing at exactly 1/fs
    time = np.linspace(0, data.shape[1]/fs, data.shape[1], endpoint=False)
    return data, time, nw, N

### Processing functions

In [5]:
def frec2sample_range(fi, fo, fs, N):
    """Map the frequency band [fi, fo] to a pair of bin indices.

    The lower edge is rounded down and clamped to at least 1; the upper edge
    is rounded up.  Both are returned as plain ints, for an ``N``-point
    spectrum sampled at ``fs``.
    """
    lower_bin = np.floor(fi*N/fs)
    upper_bin = np.ceil(fo*N/fs)
    # never start at bin 0
    lower_bin = max(1, lower_bin)
    return int(lower_bin), int(upper_bin)

def band_energy(fft, fs):
    """Sum a power spectrum over five fixed frequency bands.

    Parameters:
        fft: 1-D array of spectral power values (one per bin).
        fs: sampling frequency used to convert Hz to bin indices.

    Returns:
        (et, d, t, a, b, g): the sum of every bin of ``fft`` followed by the
        sums over 0.5-4, 4-7, 7-13, 13-30 and 30-50 Hz.
    """
    N = len(fft)
    # Total energy
    et = sum(fft)

    band_limits = ((0.5, 4), (4, 7), (7, 13), (13, 30), (30, 50))
    band_sums = []
    for f_low, f_high in band_limits:
        first_bin, last_bin = frec2sample_range(f_low, f_high, fs, N)
        band_sums.append(sum(fft[first_bin:last_bin]))

    d, t, a, b, g = band_sums
    return et, d, t, a, b, g

def psd(signal):
    """Return the squared magnitude of the DFT of ``signal`` (no scaling applied)."""
    spectrum = np.fft.fft(signal)
    # X * conj(X) is real-valued; drop the zero imaginary part
    return np.real(spectrum*np.conj(spectrum))

def exponential_smooth(timeseries, alpha=0.3):
    """Return the fitted values of a simple exponential smoothing of ``timeseries``.

    ``alpha`` is passed to the fit as the smoothing level.
    """
    return SimpleExpSmoothing(timeseries).fit(smoothing_level=alpha).fittedvalues

def spectral_centroid(nf, ps):
    """Return the sum of the frequency axis ``nf`` weighted by ``ps``.

    ``ps`` is not re-normalised here; the caller is expected to pass weights
    that already sum to one if a true centroid is wanted.
    """
    weighted_axis = nf * ps
    return sum(weighted_axis)

def variational_coeff(nf, ps, sc):
    """Return the ``ps``-weighted mean squared deviation of ``nf`` from ``sc``."""
    deviation = nf - sc
    weighted_sq = (deviation**2) * ps
    return sum(weighted_sq) / sum(ps)

def spectral_skew(nf, ps, sc, vc):
    """Return the ``ps``-weighted mean of ``((nf - sc) / vc) ** 3``.

    NOTE(review): the deviation is divided by ``vc`` exactly as received;
    ``channel_processing`` passes the output of ``variational_coeff`` (a
    squared deviation), not its square root. Confirm this normalisation is
    the intended one.
    """
    scaled = (nf - sc)/vc
    return sum(scaled**3 * ps) / sum(ps)
    

def channel_processing(channel_matrix, fs):
    """Compute one row of features per window of a single channel.

    Parameters:
        channel_matrix: 2-D array, one window per row.
        fs: sampling frequency forwarded to ``band_energy``.

    Returns:
        DataFrame with one row per window and 16 float columns: seven
        time-domain statistics, the total and per-band spectral sums, and
        three spectral-shape descriptors.
    """
    # Output columns, in order.  The spelling 'variatonial_coeff' is the
    # column name stored in the dataset and is kept as is.
    features = ['mean', 'variance', 'skewness', 'kurtosis', 'std', 'zero_crossings', 'peak2peak',
                'total_energy', 'delta', 'theta', 'alpha', 'beta', 'gamma',
                'spectral_centroid', 'variatonial_coeff', 'spectral_skew']

    ninstances = channel_matrix.shape[0]

    # One zero-initialised vector per feature, filled window by window.
    columns = [zeros(ninstances) for _ in features]
    (meanv, variancev, skewnessv, kurtosisv, stdv, zcrossingsv, p2pv,
     et, d, t, a, b, g,
     spectralCentroid, variationalCoeff, spectralSkew) = columns
    time_domain = columns[:7]

    for index, row in enumerate(channel_matrix):
        # Time-domain statistics; a ZeroDivisionError replaces all seven
        # values of the window with the placeholder 0.001.
        try:
            stats = (
                mean(row),
                var(row),
                skew(row),
                kurtosis(row),
                std(row),
                # number of sign changes between consecutive samples
                len(np.where(np.diff(np.sign(row)))[0]),
                max(row)-min(row),
            )
        except ZeroDivisionError:
            stats = (0.001,)*7
        for vector, value in zip(time_domain, stats):
            vector[index] = value

        # Spectral sums over the whole power spectrum of the window
        Px = psd(row)
        et[index], d[index], t[index], a[index], b[index], g[index] = band_energy(Px, fs)

        # Shape descriptors use the first half of the spectrum, scaled to sum
        # to one, on a frequency axis running from 0 to 1.
        half_spectrum = Px[:len(Px)//2]
        p_spectrum = half_spectrum/sum(half_spectrum)
        normalized_f = linspace(0, 1, len(Px)//2)

        spectralCentroid[index] = spectral_centroid(normalized_f, p_spectrum)
        variationalCoeff[index] = variational_coeff(normalized_f, p_spectrum, spectralCentroid[index])
        spectralSkew[index] = spectral_skew(normalized_f, p_spectrum,
                                            spectralCentroid[index], variationalCoeff[index])

    # Stack the feature vectors as rows, then transpose to (windows, features)
    data = np.array(columns).transpose()
    return pd.DataFrame(data, columns = features)

In [6]:
def read_annotations(annotation):
    """Parse a summary/annotation text file of the database.

    The parser reacts to four kinds of lines:
      * "Data Sampling Rate ..."  -> 4th token is the sampling rate (int).
      * "Channel <n>: <label>"    -> 3rd token is a channel label; occurrences
                                     of each label are counted.
      * "Channels changed ..."    -> one more montage in the file.
      * "File Name: <name>"       -> start of a record; the following lines
                                     are consumed up to "Number of Seizures",
                                     then two lines (start, end) per seizure.

    Parameters:
        annotation: path of the text file.

    Returns:
        (registers, channel_index): ``registers`` maps each record name to
        its ``Register``; ``channel_index`` maps 0..k-1 to the labels whose
        occurrence count equals the number of montages.  A label listed more
        than once inside a montage therefore does not qualify.

    Raises:
        ValueError: if a record is found before any sampling-rate line.
    """
    registers = {}
    channels_dict = {}
    nmontages = 1
    fs = None

    with open(annotation) as f:
        for line in f:
            if "Data Sampling Rate" in line:
                fs = int(line.split()[3])

            elif "Channel " in line:
                channel = line.split()[2]
                channels_dict[channel] = channels_dict.get(channel, 0) + 1

            elif "Channels changed" in line:
                nmontages += 1

            elif "File Name" in line:
                name = line.split()[2]
                while True:
                    newLine = f.readline()
                    if not newLine:
                        # End of file before "Number of Seizures": the record
                        # is incomplete, so stop instead of looping forever
                        # on the empty strings readline() keeps returning.
                        break
                    if "Number of Seizures" in newLine:
                        if fs is None:
                            raise ValueError(
                                "no 'Data Sampling Rate' line found before record " + name)
                        nseizures = int(newLine.split()[5])
                        register = Register(name, fs, nseizures)
                        for _ in range(nseizures):
                            line1 = f.readline().split()
                            line2 = f.readline().split()
                            # Two layouts are accepted: with a seizure number
                            # ("Seizure 1 Start Time: ...", value is the 5th
                            # token) and without it (value is the 4th token).
                            if line1[3] == "Time:":
                                start = int(line1[4])
                                end = int(line2[4])
                            else:
                                start = int(line1[3])
                                end = int(line2[3])

                            register.addSeizure(start, end)

                        registers[name] = register
                        break

    # Labels seen exactly once per montage, in first-appearance order
    common_channels = [label for label, count in channels_dict.items() if count == nmontages]
    channel_index = dict(enumerate(common_channels))
    return registers, channel_index

# Dataset creation for a single patient

In [7]:
# Patient to process in this section; used as the sub-folder name under dbdir
# and as the prefix of the output file name.
patient = 'chb01'

In [8]:
# List of records that contain at least one seizure, one entry per line.
# Raw string: '\R' is not a valid escape sequence; 'with' closes the file even
# if reading fails.
with open(dbdir + r'\RECORDS-WITH-SEIZURES', 'r', encoding = 'utf-8') as f:
    seizure_files = f.read().split('\n')
# Drop the first 6 characters of every entry -- presumably the 'chbNN/' folder
# prefix, leaving the bare file name (verify against the file).
seizure_files = [entry[6:] for entry in seizure_files]

fdir = dbdir + '\\' + patient

In [18]:
# Work inside the patient's folder: the record names parsed below are later
# passed to read_data() as relative paths.
os.chdir(fdir)
# Every file in the folder whose name ends in 'txt'
annotation = glob.glob('*txt')

# Parse the first match into per-record Register objects plus the index of
# channels common to all montages.
registers, channel_index = read_annotations(annotation[0])
# Add up the seizure time of all the patient's records
total_ictaltime = 0
for key, value in registers.items():
    total_ictaltime += value.ictaltime
channel_index

{0: 'FP1-F7',
 1: 'F7-T7',
 2: 'T7-P7',
 3: 'P7-O1',
 4: 'FP1-F3',
 5: 'F3-C3',
 6: 'C3-P3',
 7: 'P3-O1',
 8: 'FP2-F4',
 9: 'F4-C4',
 10: 'C4-P4',
 11: 'P4-O2',
 12: 'FP2-F8',
 13: 'F8-T8',
 14: 'P8-O2',
 15: 'FZ-CZ',
 16: 'CZ-PZ',
 17: 'P7-T7',
 18: 'T7-FT9',
 19: 'FT9-FT10',
 20: 'FT10-T8'}

In [15]:
# Split the patient's records into those listed as containing a seizure and
# the rest, both in sorted order.
patient_files = sorted(registers.keys())
patient_seizure_files = [file for file in patient_files if file in seizure_files]
patient_non_seizure_files = [file for file in patient_files if file not in seizure_files]

# Keep as many seizure-free records as there are seizure records, at most 4,
# taken from the end of the list.  A count of 0 is handled explicitly because
# the slice [-0:] would keep the whole list instead of nothing.
n_controls = min(4, len(patient_seizure_files))
patient_non_seizure_files = patient_non_seizure_files[-n_controls:] if n_controls else []
print(patient_seizure_files)
print(patient_non_seizure_files)

['chb01_03.edf', 'chb01_04.edf', 'chb01_15.edf', 'chb01_16.edf', 'chb01_18.edf', 'chb01_21.edf', 'chb01_26.edf']
['chb01_41.edf', 'chb01_42.edf', 'chb01_43.edf', 'chb01_46.edf']


In [16]:
# Select window duration in seconds
timeW = 2
nchannels = len(channel_index)
decimationCoeff = 2
fs = registers['chb01_01.edf'].fs

selected_channels_lof = []

dataframe = pd.DataFrame()
for key, value in registers.items():
    
    patient_seizure_files = []
    patient_non_seizure_files = []
    patient_files = sorted(registers.keys())
    for file in patient_files:
        if file in seizure_files:
            patient_seizure_files.append(file)
        else:
            patient_non_seizure_files.append(file)
    patient_non_seizure_files = patient_non_seizure_files[-min(4, len(patient_seizure_files)):]

    # Signal reading: only if is a seizure file
    if key in patient_seizure_files:
        signals, originalfs, time = read_data(key, channel_index.values())
        # Decimation
        signals = signal.decimate(signals, decimationCoeff)
        fs = originalfs//decimationCoeff

        # Truncate to generate time windows
        signals_trunc, time, nw, N = trunc(signals, timeW, fs)
        samples = signals_trunc.shape[1]

        print("Readed " + key)

        # Seizure vector creation
        seizure = zeros(samples)
        for n in range (len(value.seizures)):
            start = value.seizures[n][0]*fs
            end = value.seizures[n][1]*fs
            seizure[start:end] = np.ones(end-start)

        seizureW = np.reshape(seizure, [nw, N])
        seizureW = (sum(seizureW, 1) > N//2)

        # Create register dataframe
        auxdf = pd.DataFrame()
        for channel, s in enumerate(signals_trunc):
            newSignal = np.reshape(s, [nw, N])
            newdf = channel_processing(newSignal, fs)
            newdf['channel'] = pd.Series( [channel_index[channel]]*nw, index = newdf.index)
            newdf['seizure'] = pd.Series( seizureW, index = newdf.index)
            # Get only windows that has seizures
            newdf = newdf[newdf['seizure'] == True]
            auxdf = auxdf.append(newdf, ignore_index=True)

        # Add to the patient dataframe
        dataframe = dataframe.append(auxdf, ignore_index=True)
        print("Rows created for " + key)
        print(sum(seizureW)*2)
        print(value.ictaltime)
        
    elif key in patient_non_seizure_files:
        signals, originalfs, time = read_data(key, channel_index.values())
        # Decimation
        signals = signal.decimate(signals, decimationCoeff)
        fs = originalfs//decimationCoeff

        # Truncate to generate time windows
        signals_trunc, time, nw, N = trunc(signals, timeW, fs)
        samples = signals_trunc.shape[1]
        
        print("Readed " + key)

        # Create register dataframe
        auxdf = pd.DataFrame()
        for channel, s in enumerate(signals_trunc):
            newSignal = np.reshape(s, [nw, N])
            newdf = channel_processing(newSignal, fs)
            newdf['channel'] = pd.Series( [channel_index[channel]]*nw, index = newdf.index)
            newdf['seizure'] = pd.Series( [False]*nw, index = newdf.index)
            auxdf = auxdf.append(newdf, ignore_index=True)

        # Add to the patient dataframe
        dataframe = dataframe.append(auxdf, ignore_index=True)
        print("Rows created for " + key)
        
# Save the datase and the csv with the list of significant channels
dataframe.to_hdf(datasetdir + '\\' + patient + 'features' + '.h5', key = 'fullpatient', mode = 'w', format = 'table')
        

os.chdir(basedir)

Readed chb01_03.edf
Rows created for chb01_03.edf
40
40
Readed chb01_04.edf
Rows created for chb01_04.edf
26
27
Readed chb01_15.edf
Rows created for chb01_15.edf
40
40
Readed chb01_16.edf
Rows created for chb01_16.edf
50
51
Readed chb01_18.edf
Rows created for chb01_18.edf
90
90
Readed chb01_21.edf
Rows created for chb01_21.edf
92
93
Readed chb01_26.edf
Rows created for chb01_26.edf
100
101
Readed chb01_41.edf
Rows created for chb01_41.edf
Readed chb01_42.edf
Rows created for chb01_42.edf
Readed chb01_43.edf
Rows created for chb01_43.edf
Readed chb01_46.edf
Rows created for chb01_46.edf


In [19]:
# Keep only the rows of channel 'FT9-FT10' and drop the now-constant column.
# Note that this overwrites `dataframe`: re-running the cell without
# rebuilding the dataset fails because the 'channel' column is gone.
dataframe = dataframe.loc[dataframe['channel'] == 'FT9-FT10'].drop(columns=['channel'])
# Show the windows labelled as seizure
dataframe[dataframe['seizure'] == True]

Unnamed: 0,mean,variance,skewness,kurtosis,std,zero_crossings,peak2peak,total_energy,delta,theta,alpha,beta,gamma,spectral_centroid,variatonial_coeff,spectral_skew,seizure
380,4.087816,3553.907973,-0.029482,-0.784295,59.614662,30.0,257.535498,2.340040e+08,4.649100e+07,3.194974e+07,3.335167e+07,2.775248e+06,1.867278e+06,0.084716,0.006953,5633.124712,True
381,0.848954,2935.146781,0.616809,0.473512,54.176995,33.0,286.558729,1.924050e+08,3.035127e+07,3.069691e+07,2.947087e+07,3.514183e+06,1.982214e+06,0.100330,0.009604,4110.530506,True
382,-25.566080,23250.518716,0.025605,-0.153879,152.481208,13.0,768.926597,1.566582e+09,6.664790e+08,6.244330e+07,2.074918e+07,7.247536e+06,4.681884e+06,0.049473,0.003095,43085.191820,True
383,3.706693,44952.322596,-0.366518,-0.638484,212.019628,14.0,955.262353,2.946896e+09,1.200923e+09,2.287000e+08,2.240824e+07,9.519925e+06,1.111790e+07,0.052628,0.003189,45515.817222,True
384,16.203078,27100.015141,-0.596715,-0.605445,164.620822,13.0,661.123758,1.793232e+09,8.153510e+08,4.545274e+07,1.572546e+07,5.153102e+06,5.952443e+06,0.044030,0.003157,48832.024307,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4544,-14.269116,19167.747678,0.667307,0.872312,138.447635,12.0,801.279892,1.269521e+09,5.242404e+08,6.084140e+07,2.570570e+07,1.523179e+07,2.019875e+06,0.048501,0.002943,34675.742519,True
4545,3.256687,31583.261427,0.224437,-0.782700,177.716801,13.0,769.493773,2.070536e+09,8.036348e+08,1.472106e+08,4.898865e+07,3.063146e+07,3.572112e+06,0.054147,0.004269,18878.187783,True
4546,14.799941,41488.995972,-0.442396,-0.409831,203.688478,16.0,917.490605,2.733378e+09,7.794925e+08,3.855894e+08,1.606006e+08,2.903363e+07,4.173103e+06,0.064038,0.003445,24183.885918,True
4547,8.459199,18604.867394,0.484228,0.342665,136.399661,25.0,716.429684,1.223978e+09,3.076724e+08,2.107554e+08,5.121021e+07,3.744613e+07,2.521936e+06,0.076327,0.005104,9206.006004,True


# Dataset creation for all the patients

In [86]:
os.chdir(dbdir)
# Patient folders of the database.  os.listdir() returns entries in arbitrary
# order, so sort them to make the positional slice below reproducible.
patients = sorted(name for name in os.listdir(".") if os.path.isdir(name))
# Subset of patients processed in this run
patients = patients[11:15]
patients

['chb13', 'chb14', 'chb15', 'chb16']

In [87]:
# List of records that contain at least one seizure, one entry per line.
# Raw string: '\R' is not a valid escape sequence; 'with' closes the file even
# if reading fails.
with open(dbdir + r'\RECORDS-WITH-SEIZURES', 'r', encoding = 'utf-8') as f:
    seizure_files = f.read().split('\n')
# Drop the first 6 characters of every entry -- presumably the 'chbNN/' folder
# prefix, leaving the bare file name (verify against the file).
seizure_files = [entry[6:] for entry in seizure_files]

In [88]:
# Window duration in seconds and decimation factor shared by all patients
timeW = 2
decimationCoeff = 2

for patient in patients:
    print('---------------------------------------------- Patient: ' + patient + ' ----------------------------------------------------')
    # Work inside the patient's folder: record names are used as relative paths
    fdir = dbdir + '\\' + patient
    os.chdir(fdir)
    annotation = glob.glob('*txt')

    registers, channel_index = read_annotations(annotation[0])

    nchannels = len(channel_index)
    selected_channels_lof = []

    # The record partition only depends on the patient, so compute it once:
    # every seizure record plus up to 4 seizure-free records.
    patient_files = sorted(registers.keys())
    patient_seizure_files = [file for file in patient_files if file in seizure_files]
    patient_non_seizure_files = [file for file in patient_files if file not in seizure_files]
    n_controls = min(4, len(patient_seizure_files))
    # [-0:] would select the whole list, hence the explicit empty case
    patient_non_seizure_files = patient_non_seizure_files[-n_controls:] if n_controls else []

    # Per-channel frames are collected here and concatenated once per patient
    # (DataFrame.append no longer exists in pandas >= 2.0).
    frames = []
    total_rows = 0
    for key, value in registers.items():
        is_seizure_file = key in patient_seizure_files
        if not is_seizure_file and key not in patient_non_seizure_files:
            continue

        signals, originalfs, time = read_data(key, channel_index.values())
        # Decimation
        signals = signal.decimate(signals, decimationCoeff)
        fs = originalfs//decimationCoeff

        # Truncate to generate time windows
        signals_trunc, time, nw, N = trunc(signals, timeW, fs)
        samples = signals_trunc.shape[1]

        print("Readed " + key)

        if is_seizure_file:
            # Sample-level seizure mask.  Scalar assignment lets numpy clip an
            # interval that runs past the truncated signal instead of raising.
            seizure = np.zeros(samples)
            for start_s, end_s in value.seizures:
                start = int(start_s*fs)
                end = int(end_s*fs)
                seizure[start:end] = 1

            # A window is labelled as seizure when more than half of it is ictal
            seizureW = np.reshape(seizure, [nw, N]).sum(axis=1) > N//2
        else:
            seizureW = np.zeros(nw, dtype=bool)

        # Create register dataframe: one block of rows per channel
        for channel, s in enumerate(signals_trunc):
            newSignal = np.reshape(s, [nw, N])
            newdf = channel_processing(newSignal, fs)
            newdf['channel'] = pd.Series( [channel_index[channel]]*nw, index = newdf.index)
            newdf['seizure'] = pd.Series( seizureW, index = newdf.index)
            if is_seizure_file:
                # From seizure records keep only the windows labelled as seizure
                newdf = newdf[newdf['seizure'] == True]
            frames.append(newdf)
            total_rows += len(newdf)

        if is_seizure_file:
            # Shape the patient dataframe has reached so far
            print((total_rows, frames[-1].shape[1] if frames else 0))
        print("Rows created for " + key)

    # Patient dataframe, rows in processing order with a fresh 0..n-1 index
    dataframe = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the patient's dataset
    dataframe.to_hdf(datasetdir + '\\' + patient + 'features' + '.h5', key = 'fullpatient', mode = 'w', format = 'table')
    

---------------------------------------------- Patient: chb13 ----------------------------------------------------
Readed chb13_19.edf
(357, 18)
Rows created for chb13_19.edf
Readed chb13_40.edf
(1156, 18)
Rows created for chb13_40.edf
Readed chb13_47.edf
Rows created for chb13_47.edf
Readed chb13_55.edf
(32079, 18)
Rows created for chb13_55.edf
Readed chb13_56.edf
Rows created for chb13_56.edf
Readed chb13_58.edf
(62815, 18)
Rows created for chb13_58.edf
Readed chb13_59.edf
Rows created for chb13_59.edf
Readed chb13_60.edf
(93602, 18)
Rows created for chb13_60.edf
Readed chb13_62.edf
Rows created for chb13_62.edf
---------------------------------------------- Patient: chb14 ----------------------------------------------------
Readed chb14_03.edf
(147, 18)
Rows created for chb14_03.edf
Readed chb14_06.edf
(273, 18)
Rows created for chb14_06.edf
Readed chb14_11.edf
(693, 18)
Rows created for chb14_11.edf
Readed chb14_17.edf
(882, 18)
Rows created for chb14_17.edf
Readed chb14_27.edf
(10