# EEG Classification
updated: Aug. 24, 2018

Data: https://www.physionet.org/pn4/eegmmidb/

## 1. Data Downloads

### Warning: Executing these blocks will automatically create directories and download datasets.

In [1]:
import requests
import re
import os
import pathlib
import urllib

import keras.layers
from keras.models import Sequential, model_from_json
from sklearn.preprocessing import OneHotEncoder 

import re

import warnings
warnings.filterwarnings('ignore')
    
import numpy as np
import pandas as pd

from glob import glob

from mne import find_events, Epochs, concatenate_raws, pick_types
from mne.channels import read_montage
from mne.io import read_raw_edf

In [2]:
CONTEXT = 'pn4/'
MATERIAL = 'eegmmidb/'
URL = 'https://www.physionet.org/' + CONTEXT + MATERIAL

USERDIR = '/Users/Jimmy/data/PhysioNet/'

page = requests.get(URL).text
FOLDERS = sorted(list(set(re.findall(r'S[0-9]+', page))))

URLS = [URL+x+'/' for x in FOLDERS]

In [3]:
# Warning: Executing this block will create folders
for folder in FOLDERS:
    pathlib.Path(USERDIR +'/'+ folder).mkdir(parents=True, exist_ok=True)

In [3]:
# Warning: Executing this block will start downloading data
for i, folder in enumerate(FOLDERS):
    page = requests.get(URLS[i]).text
    subs = list(set(re.findall(r'S[0-9]+R[0-9]+', page)))
    
    print('Working on {}, {:.1%} completed'.format(folder, (i+1)/len(FOLDERS)))
    for sub in subs:
        urllib.request.urlretrieve(URLS[i]+sub+'.edf', os.path.join(USERDIR, folder, sub+'.edf'))

NameError: name 'FOLDERS' is not defined

## 2. Raw Data Import

In [88]:
# Get file paths
PATH = '/Users/jimmy/data/PhysioNet/'
SUBS = glob(PATH + 'S[0-9]*')
FNAMES = sorted([x[-4:] for x in SUBS])

# Remove subject #89 with damaged data
FNAMES.remove('S089')

In [79]:
# Event codes mean different actions for two groups of runs
run_type_0 = '02'.split(',')
run_type_1 = '04,08,12'.split(',')
run_type_2 = '06,10,14'.split(',')

In [148]:
def get_data(subj_num=FNAMES, epoch_sec=0.0625):
    """ Import each subject`s trials and make a 3D array
        Output shape: (Trial*Channel*TimeFrames)
        
        Some edf+ files recorded at low sampling rate, 128Hz, are excluded. 
        Majority was sampled at 160Hz.
        
        epoch_sec: time interval for one segment of mashes
        sliding: distance of sliding window moving each time """
    
    # To calculated completion rate
    count = 0
    
    # Initiate X, y
    X = []
    y = []
    
    # fixed numbers
    nChan = 64 
    sfreq = 160
    sliding = epoch_sec/2 
    
    # Sub-function to assign X and X, y
    def append_X(n_segments, old_x):
        new_x = old_x + [data[:, int(sfreq*sliding*n):int(sfreq*sliding*(n+2))] for n in range(n_segments)\
                     if data[:, int(sfreq*sliding*n):int(sfreq*sliding*(n+2))].shape==(nChan, int(sfreq*epoch_sec))]
        return new_x
    
    def append_X_Y(run_type, event, old_x, old_y):
        # Number of sliding windows
        n_segments = int(event[1]/epoch_sec)*2-1
        
        # Instantiate new_x, new_y
        new_y = old_y
        new_x = old_x
        
        # y assignment
        if run_type == 1:
            if event[2] == 'T1':
                new_y = old_y + [1]*n_segments
                new_x = append_X(n_segments, old_x)

            elif event[2] == 'T2':
                new_y = old_y + [2]*n_segments
                new_x = append_X(n_segments, old_x)
        
        if run_type == 2:
            if event[2] == 'T1':
                new_y = old_y + [3]*n_segments
                new_x = append_X(n_segments, old_x)
            
            elif event[2] == 'T2':
                new_y = old_y + [4]*n_segments
                new_x = append_X(n_segments, old_x)
        
        return new_x, new_y
    
    # Iterate over subj_num: S001, S002, S003...
    for subj in subj_num:
        # Return completion rate
        count+=1
        print('working on {}, {:.1%} completed'.format(subj, count/len(subj_num)))

        # Get file names
        fnames = glob(os.path.join(PATH, subj, subj+'R*.edf'))
        fnames = [name for name in fnames if name[-6:-4] in run_type_0+run_type_1+run_type_2]
        
        for i, fname in enumerate(fnames):
            
            # Import data into MNE raw object
            raw = read_raw_edf(fname, preload=True, verbose=False)
            picks = pick_types(raw.info, eeg=True)
            
            if raw.info['sfreq'] != 160:
                print(f'{subj} is sampled at 128Hz so will be excluded.')
                break
            
            # High-pass filtering
            raw.filter(l_freq=1, h_freq=None, picks=picks)
            
            # Get annotation
            events = raw.find_edf_events()
            
            # Get data
            data = raw.get_data(picks=picks)
            
            # Number of this run
            which_run = fname[-6:-4]
            
            """ Assignment Starts """ 
            # run 1 - baseline (eye closed)
            if which_run in run_type_0:

                # Number of sliding windows
                n_segments = int((raw.n_times/(epoch_sec*sfreq))*2-1)
                
                # Append 0`s based on number of windows
                y.extend([0]*n_segments)
                X = append_X(n_segments, X)
                    
            # run 4,8,12 - imagine opening and closing left or right fist    
            elif which_run in run_type_1:
                
                for i, event in enumerate(events):
                    X, y = append_X_Y(run_type=1, event=event, old_x=X, old_y=y)
                        
            # run 6,10,14 - imagine opening and closing both fists or both feet
            elif which_run in run_type_2:
                   
                for i, event in enumerate(events):         
                    X, y = append_X_Y(run_type=2, event=event, old_x=X, old_y=y)
                        
    X = np.stack(X)
    y = np.array(y).reshape((-1,1))
    return X, y

In [149]:
## In order to test MNE raw object
#subj = FNAMES[0]
#fnames = glob(os.path.join(PATH, subj, subj+'R*'+'.edf'))
#raw = read_raw_edf(fnames[0], preload=True, verbose=False)

In [None]:
X,y = get_data(FNAMES)

working on S001, 0.9% completed
working on S002, 1.9% completed
working on S003, 2.8% completed
working on S004, 3.7% completed
working on S005, 4.6% completed
working on S006, 5.6% completed
working on S007, 6.5% completed
working on S008, 7.4% completed
working on S009, 8.3% completed
working on S010, 9.3% completed
working on S011, 10.2% completed
working on S012, 11.1% completed
working on S013, 12.0% completed
working on S014, 13.0% completed
working on S015, 13.9% completed
working on S016, 14.8% completed
working on S017, 15.7% completed
working on S018, 16.7% completed
working on S019, 17.6% completed
working on S020, 18.5% completed
working on S021, 19.4% completed
working on S022, 20.4% completed
working on S023, 21.3% completed
working on S024, 22.2% completed
working on S025, 23.1% completed
working on S026, 24.1% completed
working on S027, 25.0% completed
working on S028, 25.9% completed
working on S029, 26.9% completed
working on S030, 27.8% completed
working on S031, 28.

In [None]:
print(X.shape)
print(y.shape)

## 3. Data Preprocessing

In [31]:
# y backup
ori_y = y

In [32]:
# y encoding
oh = OneHotEncoder()
y = oh.fit_transform(ori_y).toarray()

In [33]:
# Shuffle trials
np.random.seed(42)
trials = X.shape[0]
shuffle_indices = np.random.permutation(trials)
X = X[shuffle_indices]
y = y[shuffle_indices]

In [34]:
# Test set seperation
test_ratio = 0.2
train_size = int(trials*(1-test_ratio))
X_train, X_test, y_train, y_test = X[:train_size,:,:], X[train_size:,:,:],\
                                    y[:train_size,:], y[train_size:,:]

In [14]:
# I will use Z-score scaler to reproduce Zhang2018

# Min-max scaling for X
#train_min = train_X.min(axis=(1,2), keepdims=True)
#train_max = train_X.max(axis=(1,2), keepdims=True)
#train_X = (train_X - train_min)/(train_max-train_min)

#test_min = test_X.min(axis=(1,2), keepdims=True)
#test_max = test_X.max(axis=(1,2), keepdims=True)
#test_X = (test_X - test_min)/(test_max-test_min)

In [35]:
# Z-score normalization
from sklearn.preprocessing import StandardScaler

scalers = {}
for i in range(X_train.shape[1]):
    scalers[i] = StandardScaler()
    X_train[:, i, :] = scalers[i].fit_transform(X_train[:, i, :]) 
    print('train {:.2%} done'.format((i+1)/X_train.shape[1]))

for i in range(X_test.shape[1]):
    X_test[:, i, :] = scalers[i].transform(X_test[:, i, :]) 
    print('test {:.2%} done'.format((i+1)/X_test.shape[1]))

train 1.56% done
train 3.12% done
train 4.69% done
train 6.25% done
train 7.81% done
train 9.38% done
train 10.94% done
train 12.50% done
train 14.06% done
train 15.62% done
train 17.19% done
train 18.75% done
train 20.31% done
train 21.88% done
train 23.44% done
train 25.00% done
train 26.56% done
train 28.12% done
train 29.69% done
train 31.25% done
train 32.81% done
train 34.38% done
train 35.94% done
train 37.50% done
train 39.06% done
train 40.62% done
train 42.19% done
train 43.75% done
train 45.31% done
train 46.88% done
train 48.44% done
train 50.00% done
train 51.56% done
train 53.12% done
train 54.69% done
train 56.25% done
train 57.81% done
train 59.38% done
train 60.94% done
train 62.50% done
train 64.06% done
train 65.62% done
train 67.19% done
train 68.75% done
train 70.31% done
train 71.88% done
train 73.44% done
train 75.00% done
train 76.56% done
train 78.12% done
train 79.69% done
train 81.25% done
train 82.81% done
train 84.38% done
train 85.94% done
train 87.50% don

In [36]:
## Make 2D meshes

# Import one raw EEG data to get electrode locations
subj = FNAMES[0]
fnames = glob(os.path.join(PATH, subj, subj+'R*'+'.edf'))
raw = read_raw_edf(fnames[3], preload=True, verbose=False)
ch_names = raw.info['ch_names'][:-1]

# 'ch_index' is a dictionary - keys: electrodes, vals: column index of electrodes
ch_index = {re.findall("\w+[0-9]?", i)[0]:ch_names.index(i) for i in ch_names}; ch_index

{'Fc5': 0,
 'Fc3': 1,
 'Fc1': 2,
 'Fcz': 3,
 'Fc2': 4,
 'Fc4': 5,
 'Fc6': 6,
 'C5': 7,
 'C3': 8,
 'C1': 9,
 'Cz': 10,
 'C2': 11,
 'C4': 12,
 'C6': 13,
 'Cp5': 14,
 'Cp3': 15,
 'Cp1': 16,
 'Cpz': 17,
 'Cp2': 18,
 'Cp4': 19,
 'Cp6': 20,
 'Fp1': 21,
 'Fpz': 22,
 'Fp2': 23,
 'Af7': 24,
 'Af3': 25,
 'Afz': 26,
 'Af4': 27,
 'Af8': 28,
 'F7': 29,
 'F5': 30,
 'F3': 31,
 'F1': 32,
 'Fz': 33,
 'F2': 34,
 'F4': 35,
 'F6': 36,
 'F8': 37,
 'Ft7': 38,
 'Ft8': 39,
 'T7': 40,
 'T8': 41,
 'T9': 42,
 'T10': 43,
 'Tp7': 44,
 'Tp8': 45,
 'P7': 46,
 'P5': 47,
 'P3': 48,
 'P1': 49,
 'Pz': 50,
 'P2': 51,
 'P4': 52,
 'P6': 53,
 'P8': 54,
 'Po7': 55,
 'Po3': 56,
 'Poz': 57,
 'Po4': 58,
 'Po8': 59,
 'O1': 60,
 'Oz': 61,
 'O2': 62,
 'Iz': 63}

In [37]:
def convert_mesh(X, ch_index=ch_index):
    
    mesh = np.zeros((X.shape[0], X.shape[2], 10, 11))
    
    # 1st line
    mesh[:, :, 0, 4:7] = np.swapaxes(X[:,21:24,:], 1, 2); print('1st finished')
    
    # 2nd line
    mesh[:, :, 1, 3:8] = np.swapaxes(X[:,24:29,:], 1, 2); print('2nd finished')
    
    # 3rd line
    mesh[:, :, 2, 1:10] = np.swapaxes(X[:,29:38,:], 1, 2); print('3rd finished')
    
    # 4th line
    mesh[:, :, 3, 1:10] = np.c_[X[:,ch_index['Ft7'],:].reshape(X.shape[0],-1,1),\
                        np.swapaxes(X[:,0:7,:], 1, 2), X[:, ch_index['Ft8'], :].reshape(X.shape[0],-1,1)]; print('4th finished')
    
    # 5th line
    mesh[:, :, 4, 0:11] = np.swapaxes(np.concatenate((X[:,(ch_index['T9'],ch_index['T7']),:],\
                        X[:,7:14,:], X[:, (ch_index['T8'],ch_index['T10']), :]), axis=1), 1, 2); print('5th finished')
    # 6th line
    mesh[:, :, 5, 1:10] = np.c_[X[:,ch_index['Tp7'],:].reshape(X.shape[0],-1,1),\
                        np.swapaxes(X[:,14:21,:], 1, 2), X[:, ch_index['Tp8'], :].reshape(X.shape[0],-1,1)]; print('6th finished')
               
    # 7th line
    mesh[:, :, 6, 1:10] = np.swapaxes(X[:, 46:55, :], 1, 2); print('7th finished')
    
    # 8th line
    mesh[:, :, 7, 3:8] = np.swapaxes(X[:, 55:60, :], 1, 2); print('8th finished')
    
    # 9th line
    mesh[:, :, 8, 4:7] = np.swapaxes(X[:, 60:63, :], 1, 2); print('9th finished')
    
    # 10th line
    mesh[:, :, 9, 5] = X[:, 63, :]; print('10th finished')
    
    return mesh

In [38]:
X_train, X_test = convert_mesh(X_train), convert_mesh(X_test)

1st finished
2nd finished
3rd finished
4th finished
5th finished
6th finished
7th finished
8th finished
9th finished
10th finished


In [26]:
# Check out the shape of the mesh
np.set_printoptions(precision=2, linewidth=100)
X_train[1][0]

array([[ 0.  ,  0.  ,  0.  ,  0.  , -0.07, -0.26, -0.17,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , -0.06, -0.22, -0.25, -0.35, -0.15,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.03, -0.37, -0.24, -0.28, -0.24, -0.13, -0.04,  0.01, -0.18,  0.  ],
       [ 0.  , -0.45, -0.43, -0.32, -0.33, -0.29, -0.19, -0.21, -0.33, -0.22,  0.  ],
       [-0.58, -0.37, -0.38, -0.31, -0.15, -0.23, -0.15, -0.1 , -0.2 , -0.12,  0.05],
       [ 0.  ,  0.05, -0.41, -0.28, -0.15, -0.12, -0.12, -0.09, -0.07, -0.03,  0.  ],
       [ 0.  , -0.15, -0.15, -0.12,  0.01,  0.07,  0.03,  0.23,  0.12, -0.05,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.08,  0.06,  0.09,  0.22,  0.11,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  , -0.04,  0.13,  0.14,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.07,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ]])

## 4. Modeling - 2D Data Input

In [40]:
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])

# Cascade Architecture
model = Sequential()

#CNN
model.add(layers.Conv2D(filters=32, kernel_size=(3,3), activation='relu',\
                        padding='same', input_shape=input_shape, data_format='channels_first'))
model.add(layers.Dropout(0.2))
model.add(layers.Conv2D(filters=64, kernel_size=(3,3), activation='relu',\
                        padding='same', data_format='channels_first'))
model.add(layers.Dropout(0.2))
model.add(layers.Conv2D(filters=128, kernel_size=(3,3), activation='relu',\
                        padding='same', data_format='channels_first'))
model.add(layers.Dropout(0.2))

#FC
model.add(layers.Reshape((-1, )))
model.add(layers.Dense(1028, activation='relu'))
model.add(layers.Dropout(0.5))

#RNN
model.add(layers.LSTM(64, dropout=0.2, return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(64, dropout=0.2))
model.add(layers.Dropout(0.2))
#FC
model.add(layers.Dense(1028, activation='relu'))
model.add(layers.Dropout(0.5))

#Sofmax
model.add(layers.Dense(4, activation='softmax'))

model.summary()

NameError: name 'X_train' is not defined

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy',
             metrics=['accuracy'])

# checkpoint
filepath="weightBest.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=128, epochs=500, callbacks=callbacks_list)

In [None]:
score = model.evaluate(test_X, test_y, verbose=0)*100
print(f'The loss:{score[0]}')
print(f'The accuracy:{score[1]}')

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("model_EEG1.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
# load json and create model
json_file = open('model_EEG1.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")