# Imports

In [1]:
import pyedflib
import numpy as np
from pylab import *
from scipy import signal
import pandas as pd
import seaborn
import glob, os
from collections import OrderedDict
from scipy.stats import *
import csv
from statsmodels.tsa.holtwinters import SimpleExpSmoothing, Holt
import skimage

In [2]:
# Working directory at notebook start-up; later cells chdir back to it when they finish.
basedir = os.getcwd()
# Put here the directory of the CHBMIT DATABASE
# (raw string, so the Windows backslashes are taken literally)
dbdir = r"F:\Master\TFM\chb-mit-scalp-eeg-database-1.0.0"

# Functions

### Reading functions

In [3]:
class Register:
    """Annotation record for a single recording file.

    Stores the recording name, its sampling frequency, the declared number
    of seizures, the list of ``[start, end]`` seizure intervals and the list
    of channel names.
    """

    def __init__(self, name, fs, nseizures):
        # Intervals and channels start empty; they are filled afterwards
        # through addSeizure() and addCh().
        self.seizures = []
        self.channels = []
        self.nseizures = nseizures
        self.fs = fs
        self.name = name

    def addSeizure(self, start, end):
        """Append one seizure interval as a two-element ``[start, end]`` list."""
        self.seizures.append([start, end])

    def addCh(self, channels):
        """Replace the stored channel list with ``channels`` (kept by reference)."""
        self.channels = channels

def read_data(filename, channels=None):
    """Read the requested channels of an EDF file into a 2-D array.

    Parameters
    ----------
    filename : path of the EDF file, opened with ``pyedflib.EdfReader``.
    channels : sequence of signal labels to read, in the desired row order.
        ``None`` or an empty sequence means "every label found in the file".

    Returns
    -------
    data : ndarray of shape ``(len(channels), n_samples)`` where ``n_samples``
        is the sample count reported for the first signal of the file.
    fs : sampling frequency reported for the first signal of the file.
    time : ``np.linspace(0, n_samples / fs, n_samples)``.

    Raises
    ------
    ValueError : if a requested label is not present in the file
        (raised by ``list.index``).
    """
    f = pyedflib.EdfReader(filename)
    try:
        channel_names = f.getSignalLabels()
        # A None default (instead of a shared mutable list) is resolved here.
        if channels is None or len(channels) == 0:
            channels = channel_names

        fs = f.getSampleFrequencies()

        # Every row is sized after the first signal; a channel with a different
        # sample count would make the assignment below fail.
        data = np.zeros((len(channels), f.getNSamples()[0]))
        for i, channel in enumerate(channels):
            data[i, :] = f.readSignal(channel_names.index(channel))
    finally:
        # Release the file handle even when a label lookup or a read fails.
        f._close()

    time = np.linspace(0, data.shape[1]/fs[0], data.shape[1])
    return data, fs[0], time

def trunc(data, timeW, fs):
    """Trim a multi-channel signal to a whole number of analysis windows.

    Parameters
    ----------
    data : 2-D array, one row per channel and one column per sample.
    timeW : window duration in seconds.
    fs : sampling frequency in Hz.

    Returns
    -------
    data : the input restricted to its first ``nw * N`` columns.
    time : ``np.linspace(0, data.shape[1] / fs, data.shape[1])`` for the trimmed data.
    nw : number of complete, non-overlapping windows that fit in the signal.
    N : window length in samples, as a Python ``int``.
    """
    # The caller-supplied window duration is honoured (it used to be
    # overwritten with 2), and N is forced to int so that it stays usable as
    # a slice bound when fs arrives as a float.
    N = int(timeW * fs)
    nw = int(data.shape[1] // N)

    data = data[:, :nw * N]
    time = np.linspace(0, data.shape[1] / fs, data.shape[1])
    return data, time, nw, N

def calc_variances_entropys(signals, nw, N):
    """Compute per-window variance and entropy for every channel.

    Each row of ``signals`` is reshaped into ``nw`` consecutive windows of
    ``N`` samples. For every window the variance is taken with ``np.var`` and
    the entropy (base 2) is computed from the occurrence counts of the
    distinct sample values in that window.

    Returns two arrays of shape ``(n_channels, nw)``: variances and entropies.
    """
    n_channels = signals.shape[0]
    variances = np.zeros([n_channels, nw])
    entropys = np.zeros([n_channels, nw])

    for row, samples in enumerate(signals):
        windows = np.reshape(samples, [nw, N])
        variances[row, :] = np.var(windows, 1)
        # value_counts() gives how often each distinct value occurs in the window
        entropys[row, :] = [
            entropy(pd.Series(window).value_counts(), base=2)
            for window in windows
        ]

    return variances, entropys



def select_best_signals(signals_trunc, nw, N, seizureW, nchannels, channel_index):
    """Rank channels by mean variance*entropy over the ictal span and keep the best.

    Parameters
    ----------
    signals_trunc : 2-D array, one row per channel, ``nw * N`` samples per row.
    nw, N : number of windows and samples per window, forwarded to
        ``calc_variances_entropys``.
    seizureW : per-window labels; windows equal to 1 are treated as ictal.
    nchannels : how many of the top-ranked channels to keep.
    channel_index : mapping channel name -> row index in ``signals_trunc``.

    Returns
    -------
    significant_signals : array with the kept channels, best first.
    significant_channels : list with the names of the kept channels, best first.

    Raises
    ------
    IndexError : if ``seizureW`` contains no ictal window.
    """
    variances, entropys = calc_variances_entropys(signals_trunc, nw, N)

    # The score is taken over the contiguous span running from the first to
    # the last ictal window (both included).
    ictal = np.where(seizureW == 1)[0]
    span = slice(ictal[0], ictal[-1] + 1)
    product = np.float32(variances[:, span] * entropys[:, span])
    score = np.mean(product, 1)

    # Pair every channel name with its score and sort, highest score first.
    by_name = dict(zip(channel_index.keys(), score))
    ranked = sorted(by_name.items(), key=lambda item: item[1], reverse=True)

    # Keep the first `nchannels` names and gather their signals in that order.
    significant_channels = [name for name, _ in ranked][0:nchannels]
    significant_signals = np.zeros([len(significant_channels), signals_trunc.shape[1]])
    for position, name in enumerate(significant_channels):
        significant_signals[position, :] = signals_trunc[channel_index[name], :]

    return significant_signals, significant_channels

### Processing functions

In [4]:
def frec2sample_range(fi, fo, fs, N):
    """Convert a frequency band ``[fi, fo]`` into a range of spectrum bins.

    ``N`` is the spectrum length and ``fs`` the sampling frequency. The lower
    bound is rounded down and never goes below bin 1; the upper bound is
    rounded up. Both are returned as ``int``.
    """
    lower = np.floor(fi * N / fs)
    upper = np.ceil(fo * N / fs)
    # Bin 0 is never included in a band.
    first_bin = lower if lower > 1 else 1
    return int(first_bin), int(upper)

def band_energy(fft, fs):
    """Sum a power spectrum over five fixed frequency bands.

    ``fft`` is the spectrum (one value per bin) and ``fs`` the sampling
    frequency. Returns a 6-tuple: the sum of every bin of ``fft`` followed by
    the sums over the 0.5-4, 4-7, 7-13, 13-30 and 30-50 Hz bands, in that
    order. Bin limits come from ``frec2sample_range``; the upper bin of each
    band is excluded from the slice.
    """
    N = len(fft)
    band_limits = ((0.5, 4), (4, 7), (7, 13), (13, 30), (30, 50))

    # Total energy: every bin of the spectrum.
    total = sum(fft)

    per_band = []
    for low, high in band_limits:
        start, stop = frec2sample_range(low, high, fs, N)
        per_band.append(sum(fft[start:stop]))

    return (total, *per_band)

def psd(signal):
    """Return the squared magnitude of the FFT of ``signal``.

    The result is ``real(X * conj(X))`` with ``X = fft(signal)``, one value
    per FFT bin and without any normalisation.
    """
    spectrum = np.fft.fft(signal)
    return np.real(spectrum * np.conjugate(spectrum))

def exponential_smooth(timeseries, alpha=0.3):
    """Smooth ``timeseries`` with simple exponential smoothing.

    A ``SimpleExpSmoothing`` model is fitted with the smoothing level fixed
    to ``alpha`` and its in-sample fitted values are returned.
    """
    fitted = SimpleExpSmoothing(timeseries).fit(smoothing_level=alpha)
    return fitted.fittedvalues

def spectral_centroid(nf, ps):
    """Return the sum of the frequencies ``nf`` weighted by the spectrum ``ps``."""
    weighted = nf * ps
    return sum(weighted)

def variational_coeff(nf, ps, sc):
    """Return the ``ps``-weighted mean squared deviation of ``nf`` from ``sc``."""
    deviation = nf - sc
    return sum(deviation ** 2 * ps) / sum(ps)

def spectral_skew(nf, ps, sc, vc):
    """Return the ``ps``-weighted mean of ``((nf - sc) / vc) ** 3``.

    Note that the deviation is divided by ``vc`` exactly as passed in, not by
    its square root.
    """
    scaled = (nf - sc) / vc
    return sum(scaled ** 3 * ps) / sum(ps)
    

def channel_processing(channel_matrix, fs):
    """Build the smoothed feature table for one channel.

    Parameters
    ----------
    channel_matrix : 2-D array with one analysis window per row.
    fs : sampling frequency, forwarded to ``band_energy``.

    Returns
    -------
    pandas.DataFrame with one row per window and one column per entry of
    ``features`` below. Every column is the exponentially smoothed version
    (``exponential_smooth``) of the raw per-window feature.
    """
    features = ['mean', 'variance', 'skewness', 'kurtosis', 'std', 'zero_crossings', 'peak2peak',
                'total_energy', 'delta', 'theta', 'alpha', 'beta', 'gamma',
                'spectral_centroid', 'variatonial_coeff', 'spectral_skew']
    time_domain = features[:7]
    band_names = features[7:13]

    ninstances = channel_matrix.shape[0]
    # One raw (unsmoothed) vector per feature, filled window by window.
    raw = {name: np.zeros(ninstances) for name in features}

    for index, row in enumerate(channel_matrix):
        # Time-domain statistics, in the same order as `time_domain`.
        try:
            stats = (
                np.mean(row),
                np.var(row),
                skew(row),
                kurtosis(row),
                np.std(row),
                # number of sign changes between consecutive samples
                len(np.where(np.diff(np.sign(row)))[0]),
                max(row) - min(row),
            )
        except ZeroDivisionError:
            # Fallback: all seven statistics take the placeholder value 0.001.
            stats = (0.001,) * len(time_domain)
        for name, value in zip(time_domain, stats):
            raw[name][index] = value

        # Total and per-band energies from the power spectrum of the window.
        Px = psd(row)
        for name, value in zip(band_names, band_energy(Px, fs)):
            raw[name][index] = value

        # Spectral-shape features use the first half of the spectrum,
        # normalised to sum to one, over a frequency axis scaled to [0, 1].
        half = Px[:len(Px)//2]
        p_spectrum = half / sum(half)
        normalized_f = np.linspace(0, 1, len(Px)//2)
        centroid = spectral_centroid(normalized_f, p_spectrum)
        spread = variational_coeff(normalized_f, p_spectrum, centroid)
        raw['spectral_centroid'][index] = centroid
        raw['variatonial_coeff'][index] = spread
        raw['spectral_skew'][index] = spectral_skew(normalized_f, p_spectrum, centroid, spread)

    # Smooth every feature along the window axis; columns follow `features`.
    data = [exponential_smooth(raw[name]) for name in features]
    data = np.array(data).transpose()
    return pd.DataFrame(data, columns=features)

In [5]:
# Demo: cut the vector 0..31 into windows of N samples that advance N//2 samples
# at a time, so consecutive rows share half of their samples (see the output below).
N = 4
A = np.array(range(32))
newA = skimage.util.view_as_windows(A, N, N//2)
newA

array([[ 0,  1,  2,  3],
       [ 2,  3,  4,  5],
       [ 4,  5,  6,  7],
       [ 6,  7,  8,  9],
       [ 8,  9, 10, 11],
       [10, 11, 12, 13],
       [12, 13, 14, 15],
       [14, 15, 16, 17],
       [16, 17, 18, 19],
       [18, 19, 20, 21],
       [20, 21, 22, 23],
       [22, 23, 24, 25],
       [24, 25, 26, 27],
       [26, 27, 28, 29],
       [28, 29, 30, 31]])

In [6]:
# For comparison: the same vector cut into 8 consecutive rows of 4 samples,
# with no sample shared between rows.
newnewA = np.reshape(A, [8, 4])
newnewA

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [7]:
def read_annotations(annotation):
    """Parse an annotation text file into per-recording ``Register`` objects.

    The file is scanned line by line and the following markers are recognised:

    * ``Data Sampling Rate``: the 4th token is the sampling frequency.
    * ``Channel `` (with trailing space): the 3rd token is a channel name,
      appended to the current channel list.
    * ``Channels changed``: the current channel list is restarted.
    * ``File Name``: the 3rd token is the recording name. The following lines
      are read until ``Number of Seizures`` is found (6th token = count), and
      then two lines (start, end) are read per seizure.

    Parameters
    ----------
    annotation : path of the annotation text file.

    Returns
    -------
    registers : dict mapping recording name -> ``Register``.
    channel_index : dict mapping channel name -> position, built from the
        channel list of the LAST recording parsed (empty if none was parsed).
        NOTE(review): recordings parsed before a "Channels changed" marker may
        use a different channel list than this index describes.

    Raises
    ------
    ValueError : if a recording entry appears before any sampling-rate line.
    """
    registers = {}
    channels = []
    fs = None
    register = None

    with open(annotation) as f:
        for line in f:
            if "Data Sampling Rate" in line:
                fs = int(line.split()[3])
            elif "Channel " in line:
                channels.append(line.split()[2])
            elif "Channels changed" in line:
                channels = []
            elif "File Name" in line:
                name = line.split()[2]
                if fs is None:
                    raise ValueError(
                        "recording " + name + " is listed before any 'Data Sampling Rate' line")
                while True:
                    newLine = f.readline()
                    if not newLine:
                        # End of file before the seizure count: stop instead
                        # of spinning forever on empty reads.
                        break
                    if "Number of Seizures" in newLine:
                        nseizures = int(newLine.split()[5])
                        register = Register(name, fs, nseizures)
                        for _ in range(nseizures):
                            line1 = f.readline().split()
                            line2 = f.readline().split()
                            # Two layouts are accepted: the time value is the
                            # 5th token when the 4th one is "Time:", and the
                            # 4th token otherwise.
                            if line1[3] == "Time:":
                                start = int(line1[4])
                                end = int(line2[4])
                            else:
                                start = int(line1[3])
                                end = int(line2[3])
                            register.addSeizure(start, end)

                        # Copy, so a later channel change does not alter
                        # this recording's list.
                        register.addCh(list(channels))
                        registers[name] = register
                        break

    if register is None:
        # No recording entry was parsed: nothing to index.
        return registers, {}

    channel_index = dict(zip(register.channels, list(np.arange(len(register.channels)))))
    return registers, channel_index

# Create the dataset for a single patient

In [8]:
# Name of the patient sub-folder inside dbdir; also used as prefix of the output files.
patient = 'chb08'

In [9]:
# RECORDS-WITH-SEIZURES lists one "<patient folder>/<record file>" entry per line.
# Only the file name is kept, because records are later looked up by name
# from inside the patient folder; blank lines are dropped.
with open(os.path.join(dbdir, 'RECORDS-WITH-SEIZURES'), 'r', encoding='utf-8') as f:
    seizure_files = [entry.strip().rsplit('/', 1)[-1] for entry in f if entry.strip()]

# Folder holding the selected patient's records
fdir = os.path.join(dbdir, patient)

In [10]:
# Work from inside the patient folder: the glob below and the later
# read_data(key, ...) calls use paths relative to it.
os.chdir(fdir)
# Every file whose name ends in "txt"; the first match is parsed as the annotation file.
annotation = glob.glob('*txt')

# NOTE(review): annotation[0] raises IndexError when the folder has no such file.
registers, channel_index = read_annotations(annotation[0])

In [11]:
# Display the channel-name -> position mapping returned by read_annotations.
# NOTE(review): in the recorded output 'T8-P8' maps to 22 and position 14 is
# missing, which suggests the name is listed twice and the later position
# replaced the earlier one — confirm against the annotation file.
channel_index

{'FP1-F7': 0,
 'F7-T7': 1,
 'T7-P7': 2,
 'P7-O1': 3,
 'FP1-F3': 4,
 'F3-C3': 5,
 'C3-P3': 6,
 'P3-O1': 7,
 'FP2-F4': 8,
 'F4-C4': 9,
 'C4-P4': 10,
 'P4-O2': 11,
 'FP2-F8': 12,
 'F8-T8': 13,
 'T8-P8': 22,
 'P8-O2': 15,
 'FZ-CZ': 16,
 'CZ-PZ': 17,
 'P7-T7': 18,
 'T7-FT9': 19,
 'FT9-FT10': 20,
 'FT10-T8': 21}

In [14]:
# Select window duration in seconds
timeW = 2
overlapping = 0.5
nchannels = len(channel_index)
decimationCoeff = 2
fs = registers['chb08_02.edf'].fs

selected_channels_lof = []

dataframe = pd.DataFrame()
for key, value in registers.items():
    
    # Signal reading: only if is a seizure file
    if key in seizure_files:
        
        # Signal reading
        signals, fs, time = read_data(key, value.channels)
        # Decimation
        signals = signal.decimate(signals, decimationCoeff)
        fs = fs//decimationCoeff

        # Truncate to generate time windows
        signals_trunc, time, nw, N = trunc(signals, timeW, fs)
        nw = int(nw/overlapping)
        samples = signals_trunc.shape[1]

        print("Readed " + key)

        # Seizure vector creation
        seizure = zeros(samples)
        if (len(value.seizures) > 0):
            for n in range (len(value.seizures)):
                start = value.seizures[n][0]*fs
                end = value.seizures[n][1]*fs
                seizure[start:end] = np.ones(end-start)

            newSignal = skimage.util.view_as_windows(s, N, N//2)
            seizureW = (sum(seizureW, 1) > N//2)

            selected_signals, selected_channels = select_best_signals(signals_trunc, nw, N, seizureW, nchannels, channel_index)
            selected_channels_lof.append(selected_channels)

            # Create list for feature names
            features = []
            for i in range(N):
                features.append("sample" + str(i+1))

            # Create register dataframe
            auxdf = pd.DataFrame()
            for channel, s in enumerate(selected_signals):
                print(s.shape)
                newSignal = skimage.util.view_as_windows(s, N, N//2)
                newdf = channel_processing(newSignal, fs)
                newdf['channel'] = pd.Series( [selected_channels[channel]]*nw, index = newdf.index)
                newdf['seizure'] = pd.Series( seizureW, index = newdf.index)
                auxdf = auxdf.append(newdf, ignore_index=True)
                print("Rows created for " + selected_channels[channel] + " channel")

            # Add to the patient dataframe
            dataframe = dataframe.append(auxdf, ignore_index=True)
            print("Rows created for " + key)

os.chdir(basedir)

Readed chb08_02.edf


NameError: name 's' is not defined

In [17]:
# Save the datase and the csv with the list of significant channels
# Save the dataset and the CSV with the list of significant channels
datasetdir = os.path.join(basedir, 'Datasets')
os.makedirs(datasetdir, exist_ok=True)
dataframe.to_hdf(os.path.join(datasetdir, patient + 'features' + '.h5'),
                 key='fullpatient', mode='w', format='table')

# CSV with the significance order of the selected channels: a header row
# 0..nchannels-1 followed by one row of channel names per processed record.
# newline='' is what the csv module expects; without it Windows gets blank rows.
with open(os.path.join(datasetdir, patient + '_channel_order.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(list(range(nchannels)))
    writer.writerows(selected_channels_lof)
os.chdir(basedir)

# Dataset creation for all the patients

In [6]:
# Working directory at the time this cell runs; datasets are written below it.
basedir = os.getcwd()
# os.path.join instead of a hand-written '\Datasets' (invalid escape, Windows-only)
datasetdir = os.path.join(basedir, 'Datasets')

# Put here the directory of the CHBMIT DATABASE
dbdir = r"F:\Master\TFM\chb-mit-scalp-eeg-database-1.0.0"


In [7]:
# Every sub-directory of the database folder is treated as one patient.
# The listing is relative to the working directory, hence the chdir first.
os.chdir(dbdir)
patients = [name for name in os.listdir(".") if os.path.isdir(name)]
# Preview the first ten entries
patients[:10]

['chb01',
 'chb02',
 'chb03',
 'chb05',
 'chb06',
 'chb07',
 'chb08',
 'chb09',
 'chb10',
 'chb11']

In [8]:
# RECORDS-WITH-SEIZURES lists one "<patient folder>/<record file>" entry per line.
# Only the file name is kept, because records are later looked up by name
# from inside each patient folder; blank lines are dropped.
with open(os.path.join(dbdir, 'RECORDS-WITH-SEIZURES'), 'r', encoding='utf-8') as f:
    seizure_files = [entry.strip().rsplit('/', 1)[-1] for entry in f if entry.strip()]

In [9]:
# Window duration in seconds
timeW = 2
# Fraction of every window shared with the next one (0.5 -> hop of half a window)
overlapping = 0.5
decimationCoeff = 2

os.makedirs(datasetdir, exist_ok=True)

for patient in patients:
    print('---------------------------------------------- Patient: ' + patient + ' ----------------------------------------------------')
    fdir = os.path.join(dbdir, patient)
    # Records are read by bare file name, so work from inside the patient folder
    os.chdir(fdir)
    annotation = glob.glob('*txt')
    if not annotation:
        print("No annotation file found for " + patient + ", skipped")
        continue

    registers, channel_index = read_annotations(annotation[0])

    nchannels = len(channel_index)
    selected_channels_lof = []

    # Feature frames of this patient, concatenated once after the record loop
    # (DataFrame.append is gone from recent pandas and copies on every call).
    record_frames = []
    for key, value in registers.items():

        # Signal reading: only if is a seizure file
        if key not in seizure_files:
            continue

        signals, originalfs, time = read_data(key, value.channels)
        # Decimation
        signals = signal.decimate(signals, decimationCoeff)
        # int() keeps fs usable as a slice multiplier when the reader returns a float
        fs = int(originalfs) // decimationCoeff

        # Truncate to a whole number of non-overlapping windows
        signals_trunc, time, nw, N = trunc(signals, timeW, fs)
        samples = signals_trunc.shape[1]
        # Hop between consecutive overlapping windows, in samples
        step = max(1, int(N * (1 - overlapping)))

        print("Readed " + key)

        # Seizure vector creation: 1 for every sample inside an annotated seizure.
        # Scalar assignment clips safely when a seizure runs past the truncated end.
        seizure = np.zeros(samples)
        for start_s, end_s in value.seizures:
            seizure[start_s * fs:end_s * fs] = 1

        # A window is ictal when more than half of its samples are.
        # Non-overlapping labels drive the channel ranking (select_best_signals
        # works on nw blocks of N samples); overlapping labels tag the feature rows.
        seizure_blocks = np.reshape(seizure, [nw, N]).sum(axis=1) > N // 2
        seizureW = skimage.util.view_as_windows(seizure, N, step).sum(axis=1) > N // 2

        if not seizure_blocks.any():
            # select_best_signals needs at least one ictal block to rank channels
            print("No ictal window in " + key + ", skipped")
            continue

        selected_signals, selected_channels = select_best_signals(
            signals_trunc, nw, N, seizure_blocks, nchannels, channel_index)
        selected_channels_lof.append(selected_channels)

        # Create register dataframe: one block of rows per selected channel
        for channel_name, s in zip(selected_channels, selected_signals):
            windows = skimage.util.view_as_windows(s, N, step)
            newdf = channel_processing(windows, fs)
            newdf['channel'] = channel_name
            newdf['seizure'] = seizureW
            record_frames.append(newdf)

        print("Rows created for " + key)

    if not record_frames:
        # Nothing was processed for this patient: no output files are written
        dataframe = pd.DataFrame()
        continue

    # Save the dataset and the CSV with the list of significant channels.
    # Written once per patient, so an interrupted run never leaves a partial
    # file that looks complete.
    dataframe = pd.concat(record_frames, ignore_index=True)
    dataframe.to_hdf(os.path.join(datasetdir, patient + 'features' + '.h5'),
                     key='fullpatient', mode='w', format='table')

    # CSV with the significance order of the selected channels: a header row
    # 0..nchannels-1 followed by one row of channel names per processed record.
    # newline='' is what the csv module expects; without it Windows gets blank rows.
    with open(os.path.join(datasetdir, patient + '_channel_order.csv'), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(list(range(nchannels)))
        writer.writerows(selected_channels_lof)
    

---------------------------------------------- Patient: chb01 ----------------------------------------------------
Readed chb01_03.edf


  loc = initial_p >= ub


Rows created for chb01_03.edf
Readed chb01_04.edf


  loc = initial_p >= ub


Rows created for chb01_04.edf
Readed chb01_15.edf




Rows created for chb01_15.edf
Readed chb01_16.edf
Rows created for chb01_16.edf
Readed chb01_18.edf
Rows created for chb01_18.edf
Readed chb01_21.edf
Rows created for chb01_21.edf
Readed chb01_26.edf
Rows created for chb01_26.edf
---------------------------------------------- Patient: chb02 ----------------------------------------------------
Readed chb02_16.edf
Rows created for chb02_16.edf
Readed chb02_19.edf
Rows created for chb02_19.edf
---------------------------------------------- Patient: chb03 ----------------------------------------------------
Readed chb03_01.edf
Rows created for chb03_01.edf
Readed chb03_02.edf
Rows created for chb03_02.edf
Readed chb03_03.edf
Rows created for chb03_03.edf
Readed chb03_04.edf
Rows created for chb03_04.edf
Readed chb03_34.edf
Rows created for chb03_34.edf
Readed chb03_35.edf
Rows created for chb03_35.edf
Readed chb03_36.edf
Rows created for chb03_36.edf
---------------------------------------------- Patient: chb05 ----------------------------



Rows created for chb07_19.edf
---------------------------------------------- Patient: chb08 ----------------------------------------------------
Readed chb08_02.edf
Rows created for chb08_02.edf
Readed chb08_05.edf
Rows created for chb08_05.edf
Readed chb08_11.edf
Rows created for chb08_11.edf
Readed chb08_13.edf
Rows created for chb08_13.edf
Readed chb08_21.edf
Rows created for chb08_21.edf
---------------------------------------------- Patient: chb09 ----------------------------------------------------
Readed chb09_06.edf
Rows created for chb09_06.edf
Readed chb09_08.edf
Rows created for chb09_08.edf
Readed chb09_19.edf
Rows created for chb09_19.edf
---------------------------------------------- Patient: chb10 ----------------------------------------------------
Readed chb10_12.edf
Rows created for chb10_12.edf
Readed chb10_20.edf
Rows created for chb10_20.edf
Readed chb10_27.edf
Rows created for chb10_27.edf
Readed chb10_30.edf
Rows created for chb10_30.edf
Readed chb10_31.edf
Rows 

  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)


Rows created for chb11_82.edf
Readed chb11_92.edf
Rows created for chb11_92.edf
Readed chb11_99.edf
Rows created for chb11_99.edf
---------------------------------------------- Patient: chb12 ----------------------------------------------------
Readed chb12_06.edf


KeyboardInterrupt: 