## Load Guinea-Bissau EEG data and save as NumPy files

In [1]:
import numpy as np
import pandas as pd
from pandas import merge 
from os import listdir
from numpy import genfromtxt, random

In [2]:
# Build a table (`df`) with one row per recording file, using metadata that is
# encoded in the file name.
datadir = "/media/windows-share/EEGs_Guinea-Bissau_cleaned"
outputdir = "/media/windows-share/EEGs_Guinea-Bissau_np"
# Only .csv names can be parsed below (the group field ends at '.csv').
# Sorting makes the row order of `df` independent of the OS directory order.
filenames = sorted(f for f in listdir(datadir) if f.endswith('.csv'))
D = []
sf = 128  # samples per unit of duration; presumably the sampling rate in Hz - TODO confirm
nc = 14  # size of the last axis of the exported arrays; presumably EEG channels - TODO confirm


def _field(name, start, end, sep=1):
    """Return the part of `name` between the markers `start` and `end`.

    The value begins right after the first occurrence of `start` and stops
    `sep` characters before the first occurrence of `end` (`sep` skips the
    separator character that precedes the next marker).
    """
    return name[name.find(start) + len(start):name.find(end) - sep]


# NOTE: `id` shadows the builtin; the name is kept so that code relying on
# these module-level variables keeps working.
id = [int(_field(f, 'id', 'dur')) for f in filenames]
dur = [int(_field(f, 'dur', 'epoch')) for f in filenames]
epoch = [int(_field(f, 'epoch', 'gro')) for f in filenames]
group = [str(_field(f, 'gro', '.csv', sep=0)) for f in filenames]
protocol = [str(_field(f, 'yes', 'id')) for f in filenames]
mydata = id, dur, epoch, group
# pd.DataFrame.from_items was removed in pandas 1.0; a dict plus an explicit
# column list gives the same table with the same column order.
df = pd.DataFrame(
    {'id': id, 'dur': dur, 'epoch': epoch, 'group': group,
     'filenames': filenames, 'protocol': protocol},
    columns=['id', 'dur', 'epoch', 'group', 'filenames', 'protocol'])

In [3]:
# Split the subject ids into train / validation / test sets for every
# (minimum duration, protocol) condition, load the matching recordings and
# save them as X_<condition>.npy / y_<condition>.npy in `outputdir`.


def getids(x, y, prop, N):
    """Draw N ids without replacement, split between `x` and `y`.

    Parameters
    ----------
    x, y : sequences of ids to draw from.
    prop : fraction of the N ids that is taken from `x`.
    N : total number of ids to draw.

    Returns
    -------
    (chosen, x_rest, y_rest) : the drawn ids (sorted `x` part followed by
    the sorted `y` part) and the ids of `x` and `y` that were not drawn.

    np.random.choice raises ValueError when a pool is too small.
    """
    n_x = int(round(N * prop))
    # Derive the second count from the first so both always add up to N
    # (two independent round() calls can differ from N by one).
    n_y = N - n_x
    ix = np.sort(np.random.choice(x, n_x, replace=False))
    iy = np.sort(np.random.choice(y, n_y, replace=False))
    x_rest = [v for v in x if v not in ix]
    y_rest = [v for v in y if v not in iy]
    return np.concatenate((ix, iy)), x_rest, y_rest


logstructure = []
# Minimum epoch duration. The original comment said "minutes", but the window
# length (mindur * sf samples) and the output names ("seconds") treat it as
# seconds - TODO confirm the unit of the 'dur' field in the file names.
for mindur in [4, 10]:
    for protocol in ['open', 'closed']:
        df2 = df[(df['protocol'] == protocol) & (df['dur'] >= mindur)]
        maxtslength = mindur * sf  # number of samples kept per recording
        # Identify training, test and validation group
        con = np.unique(df2[df2['group'] == 'Control']['id'])
        epi = np.unique(df2[df2['group'] == 'Epilepsy']['id'])
        Nid = len(con) + len(epi)  # number of ids
        prop = len(con) / Nid  # proportion of controls
        random.seed(300)  # same split on every run
        ival, con, epi = getids(con, epi, prop, N=20)  # validation set
        ites, con, epi = getids(con, epi, prop, N=20)  # test set
        itra = np.concatenate((con, epi))  # training set
        subset_ids = {'train': itra, 'valid': ival, 'test': ites}
        # Now use the ids per group to load the data
        for subset in ['train', 'valid', 'test']:
            conditionname = subset+'_'+str(mindur)+'seconds_'+protocol
            # The test subset previously reused `ival`, which made the test
            # set an exact copy of the validation set.
            tmp = df2[df2['id'].isin(subset_ids[subset])]
            if subset != 'train':
                # select first available epoch per id
                tmp = tmp.sort_values(by=['id', 'epoch']).groupby('id').first()
            filenames = tmp['filenames']
            print(conditionname + ' ' + str(len(filenames)))
            epochs = []
            labels = []
            for file, label in zip(tmp['filenames'], tmp['group']):
                path = datadir + '/' + file
                D = pd.read_csv(path, sep=',', header=0,
                                usecols=list(range(1, 15)))
                # A recording with exactly maxtslength rows is long enough
                # (consistent with the `dur >= mindur` filter above).
                if D.shape[0] >= maxtslength:
                    # take first part or should these be a random selection?
                    D = np.array(D[0:maxtslength])
                    epochs.append(np.reshape(D, (1, D.shape[0], D.shape[1])))
                    # Keep the label only for recordings that end up in X,
                    # so X and y always have the same length and order.
                    labels.append(label)
                    logstructure.append([subset, mindur, protocol, file])
            # Stack once after the loop; the empty float array keeps the
            # dtype and the (0, maxtslength, nc) result for an empty subset.
            X = np.vstack([np.zeros((0, maxtslength, nc))] + epochs)
            fnameX = outputdir + '/X_' + conditionname
            fnamey = outputdir + '/y_' + conditionname
            np.save(file=fnameX, arr=X)
            # String dtype (not object) so the file loads without pickle.
            y = np.array(labels, dtype=str)
            np.save(file=fnamey, arr=y)
np.savetxt(outputdir + '/log.csv', logstructure,
           delimiter=",", fmt='%s')

train_4seconds_open 101
valid_4seconds_open 20
test_4seconds_open 20
train_4seconds_closed 137
valid_4seconds_closed 20
test_4seconds_closed 20
train_10seconds_open 78
valid_10seconds_open 20
test_10seconds_open 20
train_10seconds_closed 108
valid_10seconds_closed 20
test_10seconds_closed 20


In [4]:
# Sanity check: show the last pair of output paths written by the export
# loop and read both arrays back from disk.
print(fnameX, fnamey, sep='\n')
testreadX, testready = (np.load(file=saved + '.npy')
                        for saved in (fnameX, fnamey))

/media/windows-share/EEGs_Guinea-Bissau_np/X_test_10seconds_closed
/media/windows-share/EEGs_Guinea-Bissau_np/y_test_10seconds_closed


In [5]:
# Show the dimensions of the reloaded data array and label array.
print(testreadX.shape, testready.shape, sep='\n')

(20, 1280, 14)
(20,)
