# Participant Split

In [1]:
import numpy as np
import pickle
import random
import os
import re
from tqdm import tqdm


# Data selection switches.
#   angle  -> True loads the angle data, False loads the power data.
#   closed -> True loads the eyes-closed condition, False the eyes-open one.
# The participant split was created with angle data, eyes closed.
angle = True
closed = True

# Translate the switches into the tags used to filter the NPZ file names.
if angle:
    domain = "_A_"
else:
    domain = "_P_"

if closed:
    condition = "_closed"
else:
    condition = "_open"


# Participant Split

Note: the participant split has varied from run to run, even though a seed is set. (A likely cause: the unique IDs are collected via a `set`, whose iteration order for strings is not guaranteed to be the same in every Python session, so the seeded permutation is applied to a differently ordered list.) The current split provides a test and a train set with a fairly equal balance of CC and DC participants; running this cell may generate a new split, which might not yield the same results! For safety reasons, I have chosen a different target folder for the generated split than the folders where the currently used splits reside. The split generated here is written to the "Max" folder, while the splits used for P8 and 32-channel classification are in their respective folders (the same split files in both folders).

In [2]:
# Participant split according to 60:40
#
# Collects the participant IDs found in the NPZ file names of the selected
# domain/condition, splits the unique IDs 60:40 with a seeded permutation and
# pickles the two ID lists as 'train_ids' and 'test_ids'.

os.chdir("/home/bpn/Documents/Max/NPZ_200_150/")

# Filter by domain and condition
files = [file for file in os.listdir() if domain in file and condition in file]

# Extract the participant ID ("TVD_" followed by four digits) from every file
# name. Files whose name carries no such ID are skipped.
id_pattern = re.compile(r'TVD_\d{4}')
id_list = []
for file in tqdm(files):
    id_match = id_pattern.search(file)
    if id_match is not None:
        id_list.append(id_match[0])

# Deduplicate AND sort the IDs. Sorting is what makes the split reproducible:
# the iteration order of a set of strings (and the order of os.listdir) can
# differ between Python sessions, so indexing an unsorted list with the seeded
# permutation selected different participants on every run.
id_set = sorted(set(id_list))

# Random permutation of the unique IDs with a fixed seed
permutation = np.random.RandomState(seed=270597).permutation(len(id_set))
n_train = round(0.6 * len(id_set))  # size of the 60% training share
train_ids = [id_set[ind] for ind in permutation[:n_train]]  # 60% for training
test_ids = [id_set[ind] for ind in permutation[n_train:]]  # remaining 40% for testing


# Save train/test IDs
os.chdir("/home/bpn/Documents/Max")

with open('train_ids', 'wb') as fp:
    pickle.dump(train_ids, fp)

with open('test_ids', 'wb') as fp:
    pickle.dump(test_ids, fp)

100%|███████████████████████████████████| 6880/6880 [00:00<00:00, 583779.64it/s]


# Check number of participants per group:

In [3]:
# The split files are read from the P8 folder, which holds a copy of them.
# Reading the copy keeps this check independent of the split cell: an
# accidental re-run of that cell writes its files to a different folder.
os.chdir("/home/bpn/Documents/Max/P8_final")

# Load the pickled ID lists of the train/test split
with open('train_ids', 'rb') as fp:
    train_ids = pickle.load(fp)

with open('test_ids', 'rb') as fp:
    test_ids = pickle.load(fp)

# Participant IDs of the DC and CC groups, built as "TVD_0" + three-digit suffix
dc_list = ["TVD_0" + suffix for suffix in (
    "206", "200", "181", "177", "148", "144", "136", "135", "127", "112",
    "104", "205", "010", "156", "059", "606", "620", "622", "624", "627",
)]

# NOTE(review): "050" appears twice in the CC suffixes below - possibly a typo
# for another participant; confirm. The duplicate does not affect the counts.
cc_list = ["TVD_0" + suffix for suffix in (
    "001", "042", "015", "277", "274", "012", "050", "016",
    "052", "070", "243", "229", "050", "256", "240", "145",
)]

# Count how many participants of each group ended up in each split
print("No. of CCs in trainset: %i"%sum(1 for id_ in train_ids if id_ in cc_list))
print("No. of DCs in trainset: %i"%sum(1 for id_ in train_ids if id_ in dc_list))
print("No. of CCs in testset: %i"%sum(1 for id_ in test_ids if id_ in cc_list))
print("No. of DCs in testset: %i"%sum(1 for id_ in test_ids if id_ in dc_list))

No. of CCs in trainset: 8
No. of DCs in trainset: 12
No. of CCs in testset: 5
No. of DCs in testset: 7


# Extract valid timepoints and split by test- and train-IDs
(This needs to be done only once per domain (Phase and Power) and per condition (closed and open), since the results of this cell are saved as .npy files. If someone else takes over this project, the cell below can be skipped, as the files are already in the correct folders.)

In [None]:
# Extract the volumes of all valid timepoints, split them by the stored
# train/test participant IDs and save them as .npy files (all 32 channels and
# the P8 channel only). Relies on `domain`, `condition`, `cc_list` and
# `dc_list` being defined by earlier cells.
from collections import Counter  # only needed by this cell

# The split files are read from the P8 folder, which holds a copy of them.
# This is done for safety reasons, in case someone re-runs the split cell by
# mistake and generates new splits.
os.chdir("/home/bpn/Documents/Max/P8_final")

# Load IDs for Train-Test split
with open('train_ids', 'rb') as fp:
    train_ids = pickle.load(fp)

with open('test_ids', 'rb') as fp:
    test_ids = pickle.load(fp)


os.chdir("/home/bpn/Documents/Max/NPZ_200_150")

# Filter by domain and condition
files = [file for file in os.listdir() if domain in file and condition in file]

# Timepoint = the three digits in front of ".dd" in the file name. The dot is
# escaped so that only a literal "." is accepted as separator.
timepoint_pattern = re.compile(r'(\d{3})\.\d{2}')
id_pattern = re.compile(r'TVD_\d{4}')

# Parse the timepoint of every file once; files without one are ignored.
file_timepoints = {}
for file in files:
    timepoint_match = timepoint_pattern.search(file)
    if timepoint_match is not None:
        file_timepoints[file] = timepoint_match[1]
timepoints = list(file_timepoints.values())

# Logic behind valid timepoints: keep only those timepoints that occur as often
# as timepoint '002', which serves as the reference for "available for all
# participants" (presumably every participant has second 2 - confirm).
# Seconds 0 and 1 are dropped due to artifacts.
timepoint_counts = Counter(timepoints)
reference_count = timepoint_counts['002']
valid_timepoints = {
    timepoint for timepoint, count in timepoint_counts.items()
    if count == reference_count and timepoint not in ('000', '001')
}
if len(valid_timepoints) < 2:
    raise ValueError("Fewer than two common timepoints found; cannot drop the two latest ones.")
# Remove the latest and second latest common timepoint (due to artifacts).
# The zero-padded strings sort in numeric order.
for _ in range(2):
    valid_timepoints.remove(max(valid_timepoints))


# Read in suitable .npz files
train_tensors = []
train_targets = []
test_tensors = []
test_targets = []

# Sets give constant-time membership tests inside the file loop
cc_ids, dc_ids = set(cc_list), set(dc_list)
train_id_set, test_id_set = set(train_ids), set(test_ids)
skip_reasons = Counter()  # why files with a valid timepoint were left out

# Iterate over NPZ files in directory
for file in tqdm(files):
    # Only process timepoints that are in the valid list
    if file_timepoints.get(file) not in valid_timepoints:
        continue

    # Extract participant ID from the file name
    id_match = id_pattern.search(file)
    id_ = id_match[0] if id_match is not None else None

    # Code target, depending on whether the participant ID is in the CC or DC list
    if id_ in cc_ids:
        target = 1
    elif id_ in dc_ids:
        target = 0
    else:
        skip_reasons["participant not in CC/DC list"] += 1
        continue

    # Decide the destination before loading, so unused volumes are never read
    if id_ in train_id_set:
        tensors, targets = train_tensors, train_targets
    elif id_ in test_id_set:
        tensors, targets = test_tensors, test_targets
    else:
        skip_reasons["participant not in train/test split"] += 1
        continue

    # Load volume, i.e., the NPZ file, and move its first axis to the end.
    # NOTE(review): `iio` is not imported in the cells shown here - presumably
    # `import imageio as iio`; confirm. Only read/shape errors are skipped, so
    # a missing import now raises instead of silently skipping every file.
    try:
        vol = iio.volread(file)
        vol = np.transpose(vol, (1, 2, 0))
    except (OSError, ValueError) as err:
        skip_reasons["could not load volume"] += 1
        print("Skipping %s: %r" % (file, err))
        continue

    tensors.append(vol)
    targets.append(target)

for reason, count in skip_reasons.items():
    print("Skipped %i file(s): %s" % (count, reason))

# Never overwrite the stored data sets with empty arrays
if not train_tensors or not test_tensors:
    raise RuntimeError("No volumes loaded for the train and/or test set; nothing was saved.")

######################################
# Extract and save 32 channel data ###
######################################
os.chdir("/home/bpn/Documents/Max/32_channels")

np.save("Data/32_train%s%s.npy"%(domain, condition),train_tensors)
np.save("Data/32_test%s%s.npy"%(domain, condition),test_tensors)

np.save("Data/target_train%s%s.npy"%(domain, condition),train_targets)
np.save("Data/target_test%s%s.npy"%(domain, condition),test_targets)


##############################
# Extract and save P8 data ###
##############################
os.chdir("/home/bpn/Documents/Max/P8_final")

# P8 is the 28th channel, thus 27 as index (due to 0 indexing in Python)
p8_train = [tensor[:,:,27] for tensor in train_tensors]
p8_test = [tensor[:,:,27] for tensor in test_tensors]

np.save("Data/p8_train%s%s.npy"%(domain, condition),p8_train)
np.save("Data/p8_test%s%s.npy"%(domain, condition),p8_test)

# The targets are stored next to the P8 data as well
np.save("Data/target_train%s%s.npy"%(domain, condition),train_targets)
np.save("Data/target_test%s%s.npy"%(domain, condition),test_targets)
