# EEG Classification
updated: Aug. 21, 2018

Data: https://www.physionet.org/pn4/eegmmidb/

## 1. Data Downloads

### Warning: Executing these blocks will automatically create directories and download datasets.

In [1]:
import requests
import re
import os
import pathlib
import urllib

In [2]:
# Location of the eegmmidb material on the PhysioNet server
CONTEXT = 'pn4/'
MATERIAL = 'eegmmidb/'
URL = 'https://www.physionet.org/' + CONTEXT + MATERIAL

# Local root directory; the download loop writes one sub-folder per subject here.
# NOTE(review): the import step reads from '/Users/jimmy/...' (lowercase j);
# confirm both spellings resolve to the same directory on this machine.
USERDIR = '/Users/Jimmy/data/PhysioNet/'

# Scrape the index page for subject folder names (pattern S<digits>).
# Fail fast on an HTTP error instead of silently parsing an error page,
# which would leave FOLDERS empty and make the download loop a no-op.
response = requests.get(URL)
response.raise_for_status()
page = response.text
FOLDERS = sorted(set(re.findall(r'S[0-9]+', page)))

# One index URL per subject folder, in the same order as FOLDERS
URLS = [URL + x + '/' for x in FOLDERS]

In [3]:
#for folder in FOLDERS:
#    pathlib.Path(USERDIR + folder).mkdir(parents=True, exist_ok=True)

In [3]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly before using urlretrieve.
import urllib.request

for i, (folder, folder_url) in enumerate(zip(FOLDERS, URLS)):
    # List the recordings (pattern S<digits>R<digits>) linked on this subject's page
    response = requests.get(folder_url)
    response.raise_for_status()
    subs = sorted(set(re.findall(r'S[0-9]+R[0-9]+', response.text)))

    # urlretrieve does not create missing directories, so make sure the
    # subject folder exists before anything is written into it.
    target_dir = os.path.join(USERDIR, folder)
    os.makedirs(target_dir, exist_ok=True)

    print('Working on {}, {:.1%} completed'.format(folder, (i+1)/len(FOLDERS)))
    for sub in subs:
        # Every recording is fetched as a signal file plus its event file
        for ext in ('.edf', '.edf.event'):
            urllib.request.urlretrieve(folder_url + sub + ext,
                                       os.path.join(target_dir, sub + ext))

NameError: name 'FOLDERS' is not defined

## 2. Raw Data Import

In [2]:
import warnings
warnings.filterwarnings('ignore')
    
import numpy as np
import pandas as pd

from glob import glob

from mne import find_events, Epochs, concatenate_raws, pick_types
from mne.channels import read_montage
from mne.io import read_raw_edf

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Get file paths
# NOTE(review): the download step writes to '/Users/Jimmy/...' (capital J);
# confirm both spellings resolve to the same directory on this machine.
PATH = '/Users/jimmy/data/PhysioNet/'
SUBS = glob(PATH + 'S[0-9]*')

# Subject #89 has damaged data and is left out
DAMAGED_SUBJECTS = {'S089'}

# Subject IDs are the last four characters of each folder path.
# Filtering (instead of list.remove) keeps this cell from raising ValueError
# when the damaged subject's folder is not present locally.
FNAMES = sorted(name for name in (x[-4:] for x in SUBS)
                if name not in DAMAGED_SUBJECTS)

In [4]:
# Run numbers (two-digit strings, as they appear in the file names) grouped
# by how their event codes are interpreted: event_0 runs are labelled 0
# throughout, while T1/T2 map to different classes in event_1 and event_2 runs.
event_0 = ['01', '02']
event_1 = ['{:02d}'.format(run) for run in (3, 4, 7, 8, 11, 12)]
event_2 = ['{:02d}'.format(run) for run in (5, 6, 9, 10, 13, 14)]

In [95]:
def _sliding_windows(data, onset, n_segments, sliding, sfreq, shape, fname):
    """Cut up to `n_segments` half-overlapping windows out of `data`.

    Window n spans samples int((onset + n*sliding)*sfreq) up to, but not
    including, int((onset + (n+2)*sliding)*sfreq), where `onset` and
    `sliding` are in seconds. Windows whose shape differs from `shape`
    (for example because they run past the end of the recording) are
    reported and dropped.

    Returns the list of accepted windows, in order.
    """
    windows = []
    for n in range(n_segments):
        start = int((onset + n*sliding)*sfreq)
        stop = int((onset + (n+2)*sliding)*sfreq)
        window = data[:, start:stop]

        # Check out the shape before keeping the window, so nothing has to
        # be removed again afterwards
        if window.shape != shape:
            print(F'shape error!: {fname}, {window.shape}')
            continue
        windows.append(window)
    return windows


def get_data(subj_num=FNAMES, include_rest = False):
    """Import each subject`s trials and make a 3D array.

    Every run of every subject is high-pass filtered at 1 Hz and cut into
    short overlapping windows; each window becomes one sample.
    Some edf+ files recorded at low sampling rate, 128Hz are excluded.
    Majority was sampled at 160Hz.

    Parameters
    ----------
    subj_num : list of str
        Subject folder names to load (defaults to FNAMES).
    include_rest : bool
        When True, windows from the event_0 runs and from 'T0' events are
        kept with label 0; when False they are skipped.

    Returns
    -------
    X : ndarray, shape (n_windows, 64, 10)
        (Trial*Channel*TimeFrames).
    y : ndarray, shape (n_windows, 1)
        Integer label per window: 0 for event_0 runs and 'T0' events,
        1/2 for 'T1'/'T2' in event_1 runs, 3/4 for 'T1'/'T2' in event_2 runs.

    Raises
    ------
    ValueError
        If no window could be extracted at all.
    """
    # Initiate X, y
    X = []
    y = []

    nChan = 64

    # Epoch period
    epoch_sec = 0.065 # i.e. 10 rows for one segment
    sliding = 0.03125 # i.e. 50% of overlapping between segments

    # Frequencies
    sfreq = 160

    # Every accepted window must have exactly this shape
    window_shape = (nChan, int(sfreq*epoch_sec))

    # Annotation -> class label, per group of runs
    labels_event_1 = {'T0': 0, 'T1': 1, 'T2': 2}
    labels_event_2 = {'T0': 0, 'T1': 3, 'T2': 4}

    for count, subj in enumerate(subj_num, start=1):
        print('working on {}, {:.1%} completed'.format(subj, count/len(subj_num)))

        # Sorted so that the sample order does not depend on the order in
        # which the file system happens to list the files
        fnames = sorted(glob(os.path.join(PATH, subj, subj+'R*'+'.edf')))

        for fname in fnames:

            # Import data into MNE raw object
            raw = read_raw_edf(fname, preload=True, verbose=False)
            picks = pick_types(raw.info, eeg=True)

            # Skip the remaining files of a subject not sampled at 160Hz
            if raw.info['sfreq'] != sfreq:
                print(f'{subj} is sampled at 128Hz so will be excluded.')
                break

            # High-pass filtering
            raw.filter(l_freq=1, h_freq=None, picks=picks)

            # Get annotation; each entry is used below as
            # (onset in seconds, duration in seconds, description)
            events = raw.find_edf_events()

            # Get data
            data = raw.get_data(picks=picks)

            # Two-digit run number taken from the file name (...R<run>.edf)
            run = fname[-6:-4]

            # Experiment number 1,2: the whole recording is labelled 0
            if run in event_0 and include_rest:

                # Number of sliding windows
                n_segments = int((raw.n_times/(epoch_sec*sfreq))*2-1)

                windows = _sliding_windows(data, 0, n_segments, sliding,
                                           sfreq, window_shape, fname)
                X.extend(windows)
                y.extend([0]*len(windows))

            # Experiment number 3,4,7,8,11,12
            if run in event_1:
                label_map = labels_event_1
            # Experiment number 5,6,9,10,13,14
            elif run in event_2:
                label_map = labels_event_2
            else:
                continue

            for event in events:

                # An annotation without a known label is skipped entirely so
                # that X and y cannot get out of step
                label = label_map.get(event[2])
                if label is None:
                    continue

                if not include_rest and event[2] == 'T0':
                    continue

                # Number of sliding windows
                n_segments = int((event[1]/(epoch_sec))*2-1)

                # One label per window that was actually kept
                windows = _sliding_windows(data, event[0], n_segments, sliding,
                                           sfreq, window_shape, fname)
                X.extend(windows)
                y.extend([label]*len(windows))

    if not X:
        raise ValueError('No windows were extracted; check subj_num, PATH and include_rest.')

    X = np.stack(X)
    y = np.array(y).reshape((-1,1))
    return X, y

In [88]:
# Load a single recording of the first subject for interactive inspection
subj = FNAMES[0]
fnames = glob(os.path.join(PATH, subj, '{}R*.edf'.format(subj)))
raw = read_raw_edf(fnames[5], verbose=False, preload=True)

In [96]:
# Build the window array and its labels for every subject in FNAMES
X, y = get_data(subj_num=FNAMES)

working on S001, 0.9% completed
working on S002, 1.9% completed
working on S003, 2.8% completed
working on S004, 3.7% completed
working on S005, 4.6% completed
working on S006, 5.6% completed
working on S007, 6.5% completed
working on S008, 7.4% completed
working on S009, 8.3% completed
working on S010, 9.3% completed
working on S011, 10.2% completed
working on S012, 11.1% completed
working on S013, 12.0% completed
working on S014, 13.0% completed
working on S015, 13.9% completed
working on S016, 14.8% completed
working on S017, 15.7% completed
working on S018, 16.7% completed
working on S019, 17.6% completed
working on S020, 18.5% completed
working on S021, 19.4% completed
working on S022, 20.4% completed
working on S023, 21.3% completed
working on S024, 22.2% completed
working on S025, 23.1% completed
working on S026, 24.1% completed
working on S027, 25.0% completed
working on S028, 25.9% completed
working on S029, 26.9% completed
working on S030, 27.8% completed
working on S031, 28.

In [97]:
# Display the shape of the stacked window array returned by get_data
X.shape

(2360480, 64, 10)

In [98]:
# Display the shape of the label array; its first dimension should match X
y.shape

(2360480, 1)

## 3. Data Preparation

In [99]:
from keras.layers import Conv1D, Dense, Flatten, MaxPool1D, AveragePooling1D, Dropout, LSTM, embeddings
from keras.models import Sequential, model_from_json
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import OneHotEncoder 

import re

Using TensorFlow backend.


In [100]:
# y backup: keep a second name for the integer labels. This is an alias, not
# a copy; it stays valid because y is re-bound (not modified in place) by the
# encoding step.
ori_y = y

In [101]:
# One-hot encode the integer labels and turn the sparse result into a
# dense array
oh = OneHotEncoder()
oh.fit(ori_y)
y = oh.transform(ori_y).toarray()

In [102]:
# Apply one seeded random permutation to samples and labels alike, so the
# pairs stay aligned and the order is reproducible
np.random.seed(42)
trials = len(X)
shuffle_indices = np.random.permutation(trials)
X, y = X[shuffle_indices], y[shuffle_indices]

In [103]:
# Test set separation: the first 80% of the shuffled trials are used for
# training, the remaining 20% are held out for testing
test_ratio = 0.2
train_size = int(trials*(1-test_ratio))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [14]:
# Z-score scaling is used (instead of the min-max scaling kept below for reference) to reproduce Zhang2018

# Min-max scaling for X
#train_min = train_X.min(axis=(1,2), keepdims=True)
#train_max = train_X.max(axis=(1,2), keepdims=True)
#train_X = (train_X - train_min)/(train_max-train_min)

#test_min = test_X.min(axis=(1,2), keepdims=True)
#test_max = test_X.max(axis=(1,2), keepdims=True)
#test_X = (test_X - test_min)/(test_max-test_min)

In [None]:
# Z-score normalization
from sklearn.preprocessing import StandardScaler

# One scaler per channel: fitted on the training split only, then reused
# unchanged on the test split. Results are written back in place.
n_train_channels = X_train.shape[1]
scalers = {}
for ch in range(n_train_channels):
    scaler = StandardScaler()
    scalers[ch] = scaler
    X_train[:, ch, :] = scaler.fit_transform(X_train[:, ch, :])
    print('train {:.2%} done'.format((ch+1)/n_train_channels))

n_test_channels = X_test.shape[1]
for ch in range(n_test_channels):
    X_test[:, ch, :] = scalers[ch].transform(X_test[:, ch, :])
    print('test {:.2%} done'.format((ch+1)/n_test_channels))

train 1.56% done
train 3.12% done
train 4.69% done
train 6.25% done
train 7.81% done
train 9.38% done
train 10.94% done
train 12.50% done
train 14.06% done
train 15.62% done
train 17.19% done
train 18.75% done
train 20.31% done
train 21.88% done
train 23.44% done
train 25.00% done
train 26.56% done
train 28.12% done
train 29.69% done
train 31.25% done
train 32.81% done
train 34.38% done
train 35.94% done
train 37.50% done
train 39.06% done
train 40.62% done
train 42.19% done
train 43.75% done
train 45.31% done
train 46.88% done
train 48.44% done


In [None]:
## Make 2D meshes

# Import one raw EEG data to get electrode locations
subj = FNAMES[0]
fnames = glob(os.path.join(PATH, subj, subj+'R*'+'.edf'))
raw = read_raw_edf(fnames[3], preload=True, verbose=False)
# The last entry of the channel list is left out
# NOTE(review): presumably a non-EEG (annotation/stimulus) channel - confirm
ch_names = raw.info['ch_names'][:-1]

# 'ch_index' is a dictionary - keys: electrodes, vals: column index of electrodes
# The key is the leading run of word characters of the channel name, which
# strips any trailing punctuation. A raw string is required for the pattern:
# "\w" in a normal string literal is an invalid escape sequence.
ch_index = {re.findall(r"\w+[0-9]?", name)[0]: idx for idx, name in enumerate(ch_names)}
ch_index

In [None]:
def convert_mesh(X, ch_index=ch_index):
    """Lay the channel axis of X out on a 10 x 11 grid, one grid per time frame.

    X is indexed as (trial, channel, time frame). The result has shape
    (trials, time frames, 10, 11); every grid row receives a fixed run of
    channels starting at a given column, and grid cells that receive no
    channel stay 0. Channels are addressed by position, except for the
    outermost ones of rows 4-6, which are looked up by name in `ch_index`.
    A progress line is printed after each grid row.
    """
    n_trials, n_frames = X.shape[0], X.shape[2]
    mesh = np.zeros((n_trials, n_frames, 10, 11))

    def place(row, first_col, channels, message):
        # Move the channel axis last so it lines up with the grid columns
        last_col = first_col + len(channels)
        mesh[:, :, row, first_col:last_col] = np.swapaxes(X[:, channels, :], 1, 2)
        print(message)

    # Rows 1-3
    place(0, 4, list(range(21, 24)), '1st finished')
    place(1, 3, list(range(24, 29)), '2nd finished')
    place(2, 1, list(range(29, 38)), '3rd finished')

    # Rows 4-6: positional channels framed by channels looked up by name
    place(3, 1, [ch_index['Ft7'], *range(0, 7), ch_index['Ft8']], '4st finished')
    place(4, 0, [ch_index['T9'], ch_index['T7'], *range(7, 14),
                 ch_index['T8'], ch_index['T10']], '5st finished')
    place(5, 1, [ch_index['Tp7'], *range(14, 21), ch_index['Tp8']], '6st finished')

    # Rows 7-10
    place(6, 1, list(range(46, 55)), '7st finished')
    place(7, 3, list(range(55, 60)), '8st finished')
    place(8, 4, list(range(60, 63)), '9st finished')
    place(9, 5, [63], '10st finished')

    return mesh

In [None]:
# Replace both splits by their 2D-mesh representation
X_train = convert_mesh(X_train)
X_test = convert_mesh(X_test)

In [26]:
# Check out the mesh: compact print settings so one grid row fits on one line
np.set_printoptions(precision=2, linewidth=100)

# Display the grid of the first time frame of the second training sample
X_train[1][0]

array([[ 0.  ,  0.  ,  0.  ,  0.  , -0.07, -0.26, -0.17,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , -0.06, -0.22, -0.25, -0.35, -0.15,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.03, -0.37, -0.24, -0.28, -0.24, -0.13, -0.04,  0.01, -0.18,  0.  ],
       [ 0.  , -0.45, -0.43, -0.32, -0.33, -0.29, -0.19, -0.21, -0.33, -0.22,  0.  ],
       [-0.58, -0.37, -0.38, -0.31, -0.15, -0.23, -0.15, -0.1 , -0.2 , -0.12,  0.05],
       [ 0.  ,  0.05, -0.41, -0.28, -0.15, -0.12, -0.12, -0.09, -0.07, -0.03,  0.  ],
       [ 0.  , -0.15, -0.15, -0.12,  0.01,  0.07,  0.03,  0.23,  0.12, -0.05,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.08,  0.06,  0.09,  0.22,  0.11,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  , -0.04,  0.13,  0.14,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.07,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ]])

## 4. Modeling - 2D Data Input

In [None]:
from keras import layers

In [None]:
# Per-sample input: everything but the leading (sample) axis of X_train.
# With data_format='channels_first', axis 1 of X_train acts as the
# convolution channel axis.
input_shape = X_train.shape[1:4]

# Settings shared by all three convolution layers
conv_kwargs = dict(kernel_size=(3,3), activation='relu', padding='same',
                   data_format='channels_first')

# Cascade Architecture: CNN -> FC -> RNN -> FC -> Softmax
cascade = [
    # CNN
    layers.Conv2D(filters=32, input_shape=input_shape, **conv_kwargs),
    layers.Dropout(0.2),
    layers.Conv2D(filters=64, **conv_kwargs),
    layers.Dropout(0.2),
    layers.Conv2D(filters=128, **conv_kwargs),
    layers.Dropout(0.2),

    # FC - the convolution output is first folded into 64 steps so that the
    # recurrent layers below receive a sequence
    # NOTE(review): 1028 units here and below - possibly meant 1024; confirm
    layers.Reshape((64, -1)),
    layers.Dense(1028, activation='relu'),
    layers.Dropout(0.5),

    # RNN
    layers.LSTM(64, dropout=0.2, return_sequences=True),
    layers.LSTM(64, dropout=0.2),

    # FC
    layers.Dense(1028, activation='relu'),
    layers.Dropout(0.5),

    # Softmax over the 5 classes
    layers.Dense(5, activation='softmax'),
]

model = Sequential()
for layer in cascade:
    model.add(layer)

model.summary()

In [None]:
# Train with Adam on the categorical cross-entropy of the one-hot labels.
# The training split is named X_train / y_train by the split cell; the
# previous train_X / train_y are never defined and raised NameError.
model.compile(optimizer='adam', loss='categorical_crossentropy',
             metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=256, epochs=15)

In [None]:
# Evaluate on the held-out split (named X_test / y_test by the split cell).
# With metrics=['accuracy'] the result is indexed as [loss, accuracy]; it
# must not be multiplied as a whole, since `list * 100` repeats the list
# instead of scaling it. Only the accuracy is converted to a percentage.
score = model.evaluate(X_test, y_test, verbose=0)
print(f'The accuracy:{score[1]*100}')

In [None]:
# serialize model to JSON (architecture only, written to model_EEG1.json)
model_json = model.to_json()
with open("model_EEG1.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5 (stored separately from the architecture, in model.h5)
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
# load json and create model
# The context manager closes the file even if reading fails
with open('model_EEG1.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")