In [72]:
import os
import sys
import time
import pandas as pd
import numpy as np
from os import listdir
from generate_epoch import * # make sure to have this file inside the ML folder
from scipy.signal import butter, sosfiltfilt, sosfreqz  # for filtering
from neurodsp import filt
import random
# for models:
from pyriemann.estimation import XdawnCovariances
from pyriemann.tangentspace import TangentSpace
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

# getDataFrame
### !!!Make sure to have consistent naming for both the eeg and event files as well as equal number of eeg and event files!!!

In [2]:
data_folder = 'data'
data_files = os.listdir(data_folder)
data_files = [i for i in data_files if '.csv' in i]
data_files_eeg = [i for i in data_files if 'eeg' in i]
data_files_event = [i for i in data_files if 'event' in i]
data_files_eeg.sort() # Make sure to have consistent naming for both the eeg and event files
data_files_event.sort()
data_eeg = [pd.read_csv('data/'+i) for i in data_files_eeg]
data_event = [pd.read_csv('data/'+i) for i in data_files_event]

total_participant = 1
trial_per_subj = 3
stimulus_per_subj = 120
channels = ['C4','C2','C1','C3']
epoch_s = 0 
epoch_e = 4000 #4 seconds
fs = 250
epoch_len = int((abs(epoch_s) + abs(epoch_e)) * (fs / 1000))
epoch_data = np.empty((0, stimulus_per_subj, len(channels), epoch_len), float)
lowcut = 0.5
highcut = 100
bl_s = -400
bl_e = -300

In [3]:
for i in range(len(data_event)): # Convert event time into 4ms increment while matching with the eeg time
    adjusted_time = []
    for time in data_event[i]['timestamp']:
        index = data_eeg[i].loc[data_eeg[i]['timestamp'] <= time].index.values[-1] # Pick the latest eeg time before the event time
        adjusted_time.append(index * 4)
    data_event[i]['timestamp'] = adjusted_time
    data_event[i] = data_event[i].rename(columns={'timestamp' : 'time'})

for i in range(len(data_eeg)):# Convert eeg time into 4ms increment
    eeg = data_eeg[i]
    eeg['timestamp'] = list(range(0,len(eeg)*4,4))
    data_eeg[i] = data_eeg[i].rename(columns={'timestamp' : 'time'})
    data_eeg[i] = data_eeg[i].rename(columns={'4':'C4', '5':'C2', '6':'C1', '7':'C3'})

In [4]:
# Extract the labels from the events
event_start = [event.loc[event['event'].str.contains('start')] for event in data_event]
for es in event_start:
    es['event'] = [int(i[-1]) for i in np.array(es['event'])]
event_start_label = event_start
event_start_label = [esl.drop('time', axis = 1) for esl in event_start_label]
# Convert the labels into np arrays
# labels = [esl.values.reshape(len(esl),) for esl in event_start_label]
# labels = np.concatenate(labels, axis = None)
for i in range(len(data_eeg)): # add 'EventStart' in eeg data
    es = [0] * len(data_eeg[i])
    sts = event_start[i]['time']
    for st in sts:
        es[round(st/4)] = 1
    data_eeg[i]['EventStart'] = es

# save csv in data/generate_epoch for the epoching step
event_start_label = pd.concat(event_start_label)
event_start_label.to_csv('data/generate_epoch_labels/Labels.csv', index = False, header = None)
for i in range(len(data_eeg)):
    f = 'EEG' + str(i+1) + '.csv'
    data_eeg[i].to_csv('data/generate_epoch/' + f, index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [5]:
data_list_arr = np.array(sorted(listdir('data/generate_epoch')))
data_list_arr = data_list_arr[1:]
data_list_np = np.reshape(data_list_arr, (total_participant,trial_per_subj))

In [6]:
def bandpass_bandstop_filter(data,fs, lowcut, highcut, order = 2):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    sos = butter(order, [low, high], analog = False, btype = 'band', output = 'sos')
    filted_data = sosfiltfilt(sos, data)
    filted_data = filt.filter_signal(filted_data, fs, 'bandstop', (58, 62), n_seconds=1)
    filted_data = filted_data[~np.isnan(filted_data)]
    return filted_data

In [7]:
for participant_id in range(total_participant):
    subject_dir_list = data_list_np[participant_id]
    subject_epoch = np.empty((0, len(channels), epoch_len), float)
    for trial_id in range(trial_per_subj):
        subject_dir = subject_dir_list[trial_id]
        data = generate_epoch(file_path = 'data/generate_epoch/'+subject_dir, channels = channels, \
                              eeg_filter = bandpass_bandstop_filter, fs = fs, epoch_s = epoch_s, \
                              epoch_e = epoch_e, bl_s = bl_s, bl_e = bl_e)
        subject_epoch = np.vstack((subject_epoch, data))
    subject_epoch = np.reshape(subject_epoch, (1, stimulus_per_subj, len(channels), epoch_len))
    epoch_data = np.vstack((epoch_data, subject_epoch))

print('Epoched data shape: '+ str(epoch_data.shape))


Epoched data shape: (1, 120, 4, 1000)


In [28]:
y = pd.read_csv('data/generate_epoch_labels/Labels.csv', header = None)
y = y.values.reshape(len(y),)
predf=np.array(list(zip(y, epoch_data[0]))) # alternatively use dstack to speed up the process

In [29]:
df = pd.DataFrame(data=predf, columns=["Trial_Type", "EEG_data"])

In [31]:
# return df

# getArrays
### TODO: add function for reducing the number of trials while keeping the trial types balanced

In [75]:
# params: ...
# df = getDataframe(...)
# default x_dur = X.shape[-1]
Y = df['Trial_Type'].to_numpy()
X = df['EEG_data'].to_numpy()
X = np.stack(list(X), axis = 0)
x_dur = X.shape[-1] # default

# return X[:,:,x_dur], Y

# generateModel(X, Y)

In [98]:
#Default 4:1 split
#Suffle
temp=list(zip(Y, X))
random.shuffle(temp)
Y, X = zip(*temp)
#Split train/test
X_train = np.array(X[:int(len(X)*4/(4+1))])
X_test = np.array(X[int(len(X)*4/(4+1)):])
Y_train = np.array(Y[:int(len(Y)*4/(4+1))])
Y_test = np.array(Y[int(len(Y)*4/(4+1)):])

class myModel:
    def __init__(self):
        pass
    def fit(X, Y):
        pass
    def predict(X):
        pass
    
class XDawnLRModel(myModel): # XDAWN Covariance Preprocessing + Linear Regression Classifier
    def __init__(self):
        super().__init__()
        self.XC = XdawnCovariances(nfilter = 1) # the number of filters can be changed
        self.logreg = LogisticRegression()
        
    def fit(self, X, Y):
        X_transformed = self.XC.fit_transform(X, Y)
        X_transformed = TangentSpace(metric='riemann').fit_transform(X_transformed)
        self.logreg.fit(X_transformed,Y)
        
    def predict(self, X):
        X_transformed = self.XC.transform(X)
        X_transformed = TangentSpace(metric='riemann').fit_transform(X_transformed)
        return self.logreg.predict(X_transformed)

model = XDawnLRModel()
model.fit(X_train, Y_train)
#return model

In [99]:
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           1       0.43      0.50      0.46         6
           2       0.25      0.33      0.29         6
           3       0.20      0.25      0.22         4
           4       0.25      0.12      0.17         8

    accuracy                           0.29        24
   macro avg       0.28      0.30      0.28        24
weighted avg       0.29      0.29      0.28        24

