# EEG Classification
updated: Aug. 24, 2018

Data: https://www.physionet.org/pn4/eegmmidb/

## 1. Data Downloads

### Warning: Executing these blocks will automatically create directories and download datasets.

In [32]:
# System
import requests
import re
import os
import pathlib
import urllib

# Modeling & Preprocessing
import keras.layers as layers
from keras.models import Sequential, model_from_json
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from keras import initializers, optimizers

# Essential Data Handling
import numpy as np
import pandas as pd

# Get Paths
from glob import glob

# EEG package
from mne import find_events, Epochs, concatenate_raws, pick_types
from mne.channels import read_montage
from mne.io import read_raw_edf

In [2]:
CONTEXT = 'pn4/'
MATERIAL = 'eegmmidb/'
URL = 'https://www.physionet.org/' + CONTEXT + MATERIAL

# Change this directory according to your setting
USERDIR = '/Users/Jimmy/data/PhysioNet/'

page = requests.get(URL).text
FOLDERS = sorted(list(set(re.findall(r'S[0-9]+', page))))

URLS = [URL+x+'/' for x in FOLDERS]

In [3]:
# Warning: Executing this block will create folders
for folder in FOLDERS:
    pathlib.Path(USERDIR +'/'+ folder).mkdir(parents=True, exist_ok=True)

In [3]:
# Warning: Executing this block will start downloading data
for i, folder in enumerate(FOLDERS):
    page = requests.get(URLS[i]).text
    subs = list(set(re.findall(r'S[0-9]+R[0-9]+', page)))
    
    print('Working on {}, {:.1%} completed'.format(folder, (i+1)/len(FOLDERS)))
    for sub in subs:
        urllib.request.urlretrieve(URLS[i]+sub+'.edf', os.path.join(USERDIR, folder, sub+'.edf'))

NameError: name 'FOLDERS' is not defined

## 2. Raw Data Import

In [33]:
# Get file paths
PATH = '/Users/jimmy/data/PhysioNet/'
SUBS = glob(PATH + 'S[0-9]*')
FNAMES = sorted([x[-4:] for x in SUBS])

# Remove subject #89 with damaged data
FNAMES.remove('S089')

In [34]:
def get_data(subj_num=FNAMES, epoch_sec=0.0625):
    """ Import each subject`s trials and make a 3D array
        Output shape: (Trial*Channel*TimeFrames)
        
        Some edf+ files recorded at low sampling rate, 128Hz, are excluded. 
        Majority was sampled at 160Hz.
        
        epoch_sec: time interval for one segment of mashes
        """
    
    # Event codes mean different actions for two groups of runs
    run_type_0 = '02'.split(',')
    run_type_1 = '04,08,12'.split(',')
    run_type_2 = '06,10,14'.split(',')

    # To calculated completion rate
    count = 0
    
    # Initiate X, y
    X = []
    y = []
    
    # fixed numbers
    nChan = 64 
    sfreq = 160
    sliding = epoch_sec/2 
    
    # Sub-function to assign X and X, y
    def append_X(n_segments, old_x):
        new_x = old_x + [data[:, int(sfreq*sliding*n):int(sfreq*sliding*(n+2))] for n in range(n_segments)\
                     if data[:, int(sfreq*sliding*n):int(sfreq*sliding*(n+2))].shape==(nChan, int(sfreq*epoch_sec))]
        return new_x
    
    def append_X_Y(run_type, event, old_x, old_y):
        # Number of sliding windows
        n_segments = int(event[1]/epoch_sec)*2-1
        
        # Instantiate new_x, new_y
        new_y = old_y
        new_x = old_x
        
        # y assignment
        if run_type == 1:
            if event[2] == 'T1':
                new_y = old_y + [1]*n_segments
                new_x = append_X(n_segments, old_x)

            elif event[2] == 'T2':
                new_y = old_y + [2]*n_segments
                new_x = append_X(n_segments, old_x)
        
        if run_type == 2:
            if event[2] == 'T1':
                new_y = old_y + [3]*n_segments
                new_x = append_X(n_segments, old_x)
            
            elif event[2] == 'T2':
                new_y = old_y + [4]*n_segments
                new_x = append_X(n_segments, old_x)
        
        return new_x, new_y
    
    # Iterate over subj_num: S001, S002, S003...
    for subj in subj_num:
        # Return completion rate
        count+=1
        print('working on {}, {:.1%} completed'.format(subj, count/len(subj_num)))

        # Get file names
        fnames = glob(os.path.join(PATH, subj, subj+'R*.edf'))
        fnames = [name for name in fnames if name[-6:-4] in run_type_0+run_type_1+run_type_2]
        
        for i, fname in enumerate(fnames):
            
            # Import data into MNE raw object
            raw = read_raw_edf(fname, preload=True, verbose=False)
            picks = pick_types(raw.info, eeg=True)
            
            if raw.info['sfreq'] != 160:
                print(f'{subj} is sampled at 128Hz so will be excluded.')
                break
            
            # Get annotation
            events = raw.find_edf_events()
            
            # Get data
            data = raw.get_data(picks=picks)
            
            # Number of this run
            which_run = fname[-6:-4]
            
            """ Assignment Starts """ 
            # run 1 - baseline (eye closed)
            if which_run in run_type_0:

                # Number of sliding windows
                n_segments = int((raw.n_times/(epoch_sec*sfreq))*2-1)
                
                # Append 0`s based on number of windows
                y.extend([0]*n_segments)
                X = append_X(n_segments, X)
                    
            # run 4,8,12 - imagine opening and closing left or right fist    
            elif which_run in run_type_1:
                
                for i, event in enumerate(events):
                    X, y = append_X_Y(run_type=1, event=event, old_x=X, old_y=y)
                        
            # run 6,10,14 - imagine opening and closing both fists or both feet
            elif which_run in run_type_2:
                   
                for i, event in enumerate(events):         
                    X, y = append_X_Y(run_type=2, event=event, old_x=X, old_y=y)
                        
    X = np.stack(X)
    y = np.array(y).reshape((-1,1))
    return X, y

In [35]:
## In order to test MNE raw object
#subj = FNAMES[0]
#fnames = glob(os.path.join(PATH, subj, subj+'R*'+'.edf'))
#raw = read_raw_edf(fnames[5], preload=True, verbose=False)

In [36]:
X,y = get_data(FNAMES, epoch_sec=0.0625)

working on S001, 2.0% completed
working on S002, 4.0% completed
working on S003, 6.0% completed
working on S004, 8.0% completed
working on S005, 10.0% completed
working on S006, 12.0% completed
working on S007, 14.0% completed
working on S008, 16.0% completed
working on S009, 18.0% completed
working on S010, 20.0% completed
working on S011, 22.0% completed
working on S012, 24.0% completed
working on S013, 26.0% completed
working on S014, 28.0% completed
working on S015, 30.0% completed
working on S016, 32.0% completed
working on S017, 34.0% completed
working on S018, 36.0% completed
working on S019, 38.0% completed
working on S020, 40.0% completed
working on S021, 42.0% completed
working on S022, 44.0% completed
working on S023, 46.0% completed
working on S024, 48.0% completed
working on S025, 50.0% completed
working on S026, 52.0% completed
working on S027, 54.0% completed
working on S028, 56.0% completed
working on S029, 58.0% completed
working on S030, 60.0% completed
working on S03

In [1]:
print(X.shape)
print(y.shape)

NameError: name 'X' is not defined

## 3. Data Preprocessing

In [38]:
# y backup
ori_y = y

In [39]:
# y encoding
oh = OneHotEncoder()
y = oh.fit_transform(ori_y).toarray()

In [40]:
# Shuffle trials
np.random.seed(43)
trials = X.shape[0]
shuffle_indices = np.random.permutation(trials)
X = X[shuffle_indices]
y = y[shuffle_indices]

In [41]:
# Test set seperation
test_ratio = 0.2
train_size = int(trials*(1-test_ratio))
X_train, X_test, y_train, y_test = X[:train_size,:,:], X[train_size:,:,:],\
                                    y[:train_size,:], y[train_size:,:]

In [42]:
# Z-score Normalization
def scale_data(X):
    shape = X.shape
    scaler = StandardScaler()
    scaled_X = np.zeros((shape[0], shape[1], shape[2]))
    for i in range(shape[0]):
        for z in range(shape[2]):
            scaled_X[i, :, z] = np.squeeze(scaler.fit_transform(X[i, :, z].reshape(-1, 1)))
        if i%int(shape[0]/10) == 0:
            print('{:.2%} done'.format((i+1)/shape[0]))   
    return scaled_X
            
X_train, X_test  = scale_data(X_train), scale_data(X_test)

0.00% done
10.00% done
20.00% done
30.00% done
40.00% done
50.00% done
60.00% done
70.00% done
80.00% done
90.00% done
100.00% done
0.00% done
10.00% done
20.00% done
30.00% done
40.00% done
50.00% done
60.00% done
70.00% done
80.00% done
90.00% done
100.00% done


In [43]:
## Make 2D meshes
# Import one raw EEG data to get electrode locations
subj = FNAMES[0]
fnames = glob(os.path.join(PATH, subj, subj+'R*'+'.edf'))
raw = read_raw_edf(fnames[3], preload=True, verbose=False)
ch_names = raw.info['ch_names'][:-1]

# 'ch_index' is a dictionary - keys: electrodes, vals: column index of electrodes
ch_index = {re.findall("\w+[0-9]?", i)[0]:ch_names.index(i) for i in ch_names}; ch_index

{'Fc5': 0,
 'Fc3': 1,
 'Fc1': 2,
 'Fcz': 3,
 'Fc2': 4,
 'Fc4': 5,
 'Fc6': 6,
 'C5': 7,
 'C3': 8,
 'C1': 9,
 'Cz': 10,
 'C2': 11,
 'C4': 12,
 'C6': 13,
 'Cp5': 14,
 'Cp3': 15,
 'Cp1': 16,
 'Cpz': 17,
 'Cp2': 18,
 'Cp4': 19,
 'Cp6': 20,
 'Fp1': 21,
 'Fpz': 22,
 'Fp2': 23,
 'Af7': 24,
 'Af3': 25,
 'Afz': 26,
 'Af4': 27,
 'Af8': 28,
 'F7': 29,
 'F5': 30,
 'F3': 31,
 'F1': 32,
 'Fz': 33,
 'F2': 34,
 'F4': 35,
 'F6': 36,
 'F8': 37,
 'Ft7': 38,
 'Ft8': 39,
 'T7': 40,
 'T8': 41,
 'T9': 42,
 'T10': 43,
 'Tp7': 44,
 'Tp8': 45,
 'P7': 46,
 'P5': 47,
 'P3': 48,
 'P1': 49,
 'Pz': 50,
 'P2': 51,
 'P4': 52,
 'P6': 53,
 'P8': 54,
 'Po7': 55,
 'Po3': 56,
 'Poz': 57,
 'Po4': 58,
 'Po8': 59,
 'O1': 60,
 'Oz': 61,
 'O2': 62,
 'Iz': 63}

In [44]:
def convert_mesh(X, ch_index=ch_index):
    
    mesh = np.zeros((X.shape[0], X.shape[2], 10, 11))
    X = np.swapaxes(X, 1, 2)
    
    # 1st line
    mesh[:, :, 0, 4:7] = X[:,:,21:24]; print('1st finished')
    
    # 2nd line
    mesh[:, :, 1, 3:8] = X[:,:,24:29]; print('2nd finished')
    
    # 3rd line
    mesh[:, :, 2, 1:10] = X[:,:,29:38]; print('3rd finished')
    
    # 4th line
    mesh[:, :, 3, 1:10] = np.concatenate((X[:,:,ch_index['Ft7']].reshape(-1, X.shape[1], 1),\
                                          X[:,:,0:7], X[:,:,ch_index['Ft8']].reshape(-1, X.shape[1], 1)), axis=2)
    print('4th finished')
    
    # 5th line
    mesh[:, :, 4, 0:11] = np.concatenate((X[:,:,(ch_index['T9'],ch_index['T7'])],\
                                        X[:,:,7:14], X[:,:,(ch_index['T8'],ch_index['T10'])]), axis=2)
    print('5th finished')
    
    # 6th line
    mesh[:, :, 5, 1:10] = np.concatenate((X[:,:,ch_index['Tp7']].reshape(-1, X.shape[1], 1),\
                                        X[:,:,14:21], X[:,:,ch_index['Tp8']].reshape(-1, X.shape[1], 1)), axis=2)
    print('6th finished')
               
    # 7th line
    mesh[:, :, 6, 1:10] = X[:,:,46:55]; print('7th finished')
    
    # 8th line
    mesh[:, :, 7, 3:8] = X[:,:,55:60]; print('8th finished')
    
    # 9th line
    mesh[:, :, 8, 4:7] = X[:,:,60:63]; print('9th finished')
    
    # 10th line
    mesh[:, :, 9, 5] = X[:,:,63]; print('10th finished')
    
    return mesh

In [45]:
# Make meshes - Dimension: (Sample * Channel * Width * Height)
X_train, X_test = convert_mesh(X_train), convert_mesh(X_test)

1st finished
2nd finished
3rd finished
4th finished
5th finished
6th finished
7th finished
8th finished
9th finished
10th finished
1st finished
2nd finished
3rd finished
4th finished
5th finished
6th finished
7th finished
8th finished
9th finished
10th finished


In [46]:
# Check out the shape of the mesh
np.set_printoptions(precision=2, linewidth=100)
X_train[1][0]

array([[ 0.  ,  0.  ,  0.  ,  0.  , -1.29, -0.13, -1.19,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , -1.64, -0.73, -0.68, -1.39,  0.18,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  1.59,  0.28,  0.23,  0.63,  0.18,  0.33,  0.43,  0.68,  0.53,  0.  ],
       [ 0.  ,  1.79,  1.39,  1.49,  1.54,  0.43,  1.24,  0.73,  1.24, -0.13,  0.  ],
       [ 0.93,  1.24,  2.4 ,  1.19,  0.33, -0.83,  0.02,  0.13,  0.93,  0.33,  0.63],
       [ 0.  ,  1.49,  0.73,  0.53, -0.33, -1.19, -0.94, -0.94, -0.38, -0.33,  0.  ],
       [ 0.  ,  0.43, -0.18,  0.23, -0.43, -1.09, -1.9 , -1.8 , -1.19, -1.74,  0.  ],
       [ 0.  ,  0.  ,  0.  , -0.53,  0.02, -1.04, -1.34, -0.33,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  , -1.14, -0.08, -1.29,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  , -0.28,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ]])

## 4. Modeling - Time-Distributed CNN + RNN

In [47]:
# Make another dimension, 1, to apply CNN for each time frame.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], X_train.shape[3], 1)
X_test = X_test.reshape(X_test.shape[0], X_train.shape[1], X_train.shape[2], X_train.shape[3], 1)

In [55]:
## Simplified Model
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3], 1)

#CNN
CNN = Sequential()
CNN.add(layers.Conv2D(32, (3,3), padding='same', activation='elu', data_format='channels_last',kernel_initializer=lecun))
CNN.add(layers.Conv2D(64, (3,3), padding='same', activation='elu', data_format='channels_last',kernel_initializer=lecun))
CNN.add(layers.Flatten())
CNN.add(layers.Dense(128, activation='elu', kernel_initializer=lecun))
CNN.add(layers.Dropout(0.3))

#RNN
model = Sequential()
model.add(layers.TimeDistributed(CNN, input_shape=input_shape))
model.add(layers.LSTM(16, return_sequences=True, kernel_initializer=lecun))
model.add(layers.LSTM(16,kernel_initializer=lecun))
model.add(layers.Dense(32, activation='elu', kernel_initializer=lecun))

model.add(layers.Dense(5, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_2 (TimeDist (None, 10, 128)           920064    
_________________________________________________________________
lstm_3 (LSTM)                (None, 10, 16)            9280      
_________________________________________________________________
lstm_4 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 165       
Total params: 932,165
Trainable params: 932,165
Non-trainable params: 0
_________________________________________________________________


In [None]:
lecun = initializers.lecun_normal(seed=42)
adam = optimizers.adam(lr=0.001)

def sd_pred(y_true, y_pred):
    return K.std(y_pred)

model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy', sd_pred])
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=64, epochs=50)

Train on 162732 samples, validate on 40683 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

In [40]:
## Complicated Model - the same as Zheng`s
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3], 1)

#CNN
CNN = Sequential()
CNN.add(layers.Conv2D(32, (3,3), padding='same', activation='elu', data_format='channels_last',kernel_initializer=lecun))
CNN.add(layers.Conv2D(64, (3,3), padding='same', activation='elu', data_format='channels_last',kernel_initializer=lecun))
CNN.add(layers.Conv2D(128, (3,3), padding='same', activation='elu', data_format='channels_last',kernel_initializer=lecun))
CNN.add(layers.Flatten())
CNN.add(layers.Dense(1024, activation='elu', kernel_initializer=lecun))
CNN.add(layers.Dropout(0.5))

#RNN
model = Sequential()
model.add(layers.TimeDistributed(CNN, input_shape=input_shape))
model.add(layers.LSTM(64, return_sequences=True, kernel_initializer=lecun))
model.add(layers.LSTM(64,kernel_initializer=lecun))
model.add(layers.Dense(1024, activation='elu', kernel_initializer=lecun))
CNN.add(layers.Dropout(0.5))

model.add(layers.Dense(5, activation='softmax'))

model.summary()

NameError: name 'X_train' is not defined

In [None]:
lecun = initializers.lecun_normal(seed=42)
adam = optimizers.adam(lr=0.001)

def sd_pred(y_true, y_pred):
    return K.std(y_pred)

model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy', sd_pred])
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=64, epochs=50)

In [None]:
# Save model to JSON
model_json = model.to_json()
with open("model_EEG1.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
# load json and create model
#json_file = open('model_EEG1.json', 'r')
#loaded_model_json = json_file.read()
#json_file.close()
#loaded_model = model_from_json(loaded_model_json)
# load weights into new model
#loaded_model.load_weights("model.h5")
#print("Loaded model from disk")