# BIDSAlign Matlab to Pickle converter

This notebook converts all mat files created with the BIDSAlign library to pickle
files. 

The reasons behind this conversion are multiple:

1. It is much more efficient to load a pickle file in a Python environment 
compared to .mat files. 
2. A dataset summary stored in a json file is created during the conversion. 
The json file contains useful information, such as the number of subjects, 
the number of sessions, the sampling rate, the EEG reference, 
the added channels (interpolated), the EEG template (data row --> EEG channel), 
and the label meaning (number --> condition or task). 
3. Files originally grouped in several folders based on the dataset ID, 
condition or task, and preprocessing pipeline are stored together based only on the 
preprocessing pipeline.

The final result, after running this notebook once for each folder containing the
.mat files, is a directory with 4 subdirectories, one for each preprocessing pipeline.
Each subdirectory will contain all the files preprocessed with that specific pipeline,
where each file contains a dictionary with fields 'data' for the 2d numpy array and
'label' for the label number (a single scalar).

**Additional note:** the only things to set up are the root path to all the files
preprocessed with BIDSAlign and the output path. The datasets path is expected to 
contain a set of subdirectories named

_dsxxxxxLABEL_PIPELINE_ (e.g., ds004362RHI_FILT),

while the output path is expected to contain 4 subdirectories with names 

*raw*, *filt*, *ica*, *icasr*



In [None]:
import glob 
import json
import numpy as np
import os
import pickle
import random
from   scipy.io import loadmat
import sys
import tqdm

In [None]:
# ------------------- INPUT FOLDER -------------------
# Folder with the .mat files to convert. The path must end with the OS path
# separator; it is appended here when it is missing.

path_to_data = '/path/to/data/'
path_to_data = path_to_data if path_to_data.endswith(os.sep) else path_to_data + os.sep

In [None]:
# ------------------- SET OUTPUT PATH -------------------
# Root folder that receives the pickle files. It must already contain one
# subdirectory per preprocessing pipeline, named:
#     raw    --    filt    --    ica    --    icasr
# The path must end with the OS path separator; it is appended here when
# it is missing.
save_path = '/data/delpup/eegpickle/'
save_path = save_path if save_path.endswith(os.sep) else save_path + os.sep

In [None]:
def _numeric_name_key(path):
    """Return the sort key of a .mat path: four integers read from its file name.

    The file name is split on '_'; the first three fields are converted to
    int as they are, while the fourth one is taken from the part of the name
    that precedes the first dot.
    """
    name = path.split(os.sep)[-1]
    fields = name.split('_')
    return (int(fields[0]),
            int(fields[1]),
            int(fields[2]),
            int(name.split('.')[0].split('_')[3]))


# collect every .mat file of the input folder, ordered by the numbers in its name
paths = glob.glob(path_to_data + '*.mat')
paths = sorted(paths, key=_numeric_name_key)

# file name (last path component) of each sorted path
file_names = [path.split(os.sep)[-1] for path in paths]

# name of the input folder: path_to_data ends with a separator, so the folder
# name is the second-to-last element of the split
dataset_dir = path_to_data.split(os.sep)[-2]

# dataset code = first 8 characters of the folder name
ds_code = dataset_dir[:8]

# preprocessing modality = text after the last underscore of the folder name
prepro_mode = dataset_dir.split('_')[-1]

# print both values so they can be checked by eye
print(ds_code)
print(prepro_mode)

In [None]:
# Double check cell: print five randomly chosen (path, file name) pairs to
# verify that the two lists are aligned.
if file_names:
    for _ in range(5):
        idx = random.randint(0, len(file_names) - 1)
        print(paths[idx], file_names[idx])
else:
    # random.randint(0, -1) would raise ValueError on an empty list
    print('No .mat files found: nothing to check')

In [None]:
# Point save_path to the subdirectory of the current preprocessing pipeline.
# The comparison is case-insensitive, so e.g. 'FILT' selects 'filt/'.
pipeline_dir = prepro_mode.casefold()
if pipeline_dir not in ('icasr', 'filt', 'raw', 'ica'):
    # Without this check an unrecognised suffix would leave save_path unchanged
    # and the pickle files would be written into the root output folder.
    raise ValueError(f"Unknown preprocessing pipeline {prepro_mode!r}: "
                     "expected one of 'raw', 'filt', 'ica', 'icasr'")
save_path += pipeline_dir + '/'

# double check that the extracted save path is correct
print(save_path)

In [None]:
# Conversion settings of every supported dataset, keyed by dataset code.
#   summary_dir    : folder receiving template.npy, channels_interpolated.npy
#                    and dataset_info.json (it must already exist)
#   info           : static entries of the json summary, in writing order
#   labels         : label meaning stored in the json summary
#   label_field    : DATA_STRUCT field holding the label string
#                    (default: 'label_group')
#   label_order    : label strings encoded as 0, 1, ...; every other string is
#                    encoded as len(label_order)
#   valid_labels   : accepted label strings; when given, the conversion stops
#                    at the first file with a different label
#   check_pipeline : when True, the conversion stops at the first file whose
#                    DATA_STRUCT['label_pipeline'] differs from prepro_mode
DATASET_SPECS = {
    # Task: Eyes Open - Eyes Closed
    'ds004148': {
        'summary_dir': 'EoecClassification/Summary/',
        'info': {'dataset_id': 2, 'tot_subjects': 60, 'tot_sessions': 2,
                 'sampling_rate': 250, 'reference': 'Common Average'},
        'labels': {0: 'EC', 1: 'EO'},
        'label_order': ('EC',),
    },
    # Task: Alzheimer - Frontotemporal Dementia - Control
    'ds004504': {
        'summary_dir': 'AlzClassification/Summary/',
        'info': {'dataset_id': 10, 'tot_subjects': 88, 'tot_sessions': 1,
                 'sampling_rate': 250, 'reference': 'Common Average'},
        'labels': {0: 'Control (C)',
                   1: 'Frontotemporal Dementia (F)',
                   2: 'Alzheimer (A)'},
        'label_order': ('C', 'F'),
    },
    # Task: Control - Parkinson's disease
    'ds002778': {
        'summary_dir': 'PDClassification/Summary/ds002778/',
        'info': {'dataset_id': 8, 'tot_subjects': 31, 'tot_sessions': 1,
                 'sampling_rate': 250, 'reference': 'Common Average'},
        'labels': {0: 'Control (C)', 1: 'Parkinson (POFF)'},
        'label_order': ('C',),
    },
    # Task: Control - Parkinson's disease
    'ds003490': {
        'summary_dir': 'PDClassification/Summary/ds003490/',
        'info': {'dataset_id': 5, 'tot_subjects': 50, 'tot_sessions': 1,
                 'sampling_rate': 250, 'reference': 'Common Average'},
        'labels': {0: 'Control (C)', 1: 'Parkinson (POFF)'},
        'label_order': ('C',),
    },
    # Task: Left Imagery - Right Imagery
    # NOTE(review): the summary maps 4/5 to LHI/RHI while the pickle files
    # store 0 for LHI and 1 for RHI -- confirm which numbering is intended.
    'ds004362': {
        'summary_dir': 'MIClassification/Summary/',
        'info': {'dataset_id': 25, 'tot_subjects': 106,
                 'tot_sessions': 3, 'subject_excluded': [88, 92, 100],
                 'sampling_rate': 160, 'reference': 'Common Average'},
        'labels': {4: 'Left Hand Imagery (LHI)',
                   5: 'Right Hand Imagery (RHI)'},
        'label_field': 'label_map',
        'label_order': ('LHI',),
        'valid_labels': ('LHI', 'RHI'),
    },
    # Task: First Episode Psychosis
    'ds003947': {
        'summary_dir': 'FEPClassification/Summary/',
        'info': {'dataset_id': 7, 'tot_subjects': 61,
                 'tot_sessions': 2, 'sampling_rate': 250,
                 'reference': 'Common Average'},
        'labels': {0: 'Control (C)',
                   1: 'First Episode Psychosis (PSY)'},
        'label_order': ('C',),
        'valid_labels': ('C', 'PSY'),
        'check_pipeline': True,
    },
    # Task: Sleep Deprivation
    'ds004902': {
        'summary_dir': 'SleepClassification/Summary/',
        'info': {'dataset_id': 20, 'tot_subjects': 71,
                 'tot_sessions': 2, 'sampling_rate': 250,
                 'reference': 'Common Average'},
        'labels': {0: 'Normal Sleep Eyes Open  (NSEO)',
                   1: 'Sleep Deprivation Eyes Open (SDEO)'},
        'label_order': ('NSEO',),
        'valid_labels': ('NSEO', 'SDEO'),
    },
}


def _encode_label(raw_label, label_order):
    """Return the integer code of raw_label.

    The code is the position of raw_label in label_order; a label that is not
    listed gets the next integer, len(label_order).
    """
    for code, name in enumerate(label_order):
        if raw_label == name:
            return code
    return len(label_order)


def _write_summary(spec, first_path):
    """Save template, interpolated channels and json summary of a dataset.

    The template and the pad_file arrays are read from the DATA_STRUCT of
    first_path and written to spec['summary_dir'].
    """
    struct = loadmat(first_path, simplify_cells=True)['DATA_STRUCT']
    template = struct['template']
    pad_file = struct['pad_file']
    summary_dir = spec['summary_dir']

    np.save(summary_dir + 'template.npy', template)
    np.save(summary_dir + 'channels_interpolated.npy', pad_file)

    # static entries first, then the entries extracted from the .mat file
    dataset_info = dict(spec['info'])
    dataset_info['template'] = template.tolist()
    dataset_info['channels_interpolated'] = pad_file.tolist()
    # added channels = template entries whose pad_file value is positive
    dataset_info['added_channels'] = template[pad_file > 0].tolist()
    dataset_info['labels'] = spec['labels']
    with open(summary_dir + 'dataset_info.json', 'w') as json_file:
        json.dump(dataset_info, json_file)


def _convert_files(spec, paths, file_names, save_path, prepro_mode):
    """Store each .mat file as a pickle with fields 'data' and 'label'.

    Each output file is named after the text preceding the first dot of the
    input file name. The loop stops, after printing a message, at the first
    file that fails one of the checks enabled in spec; files converted before
    that point are kept.
    """
    label_field = spec.get('label_field', 'label_group')
    valid_labels = spec.get('valid_labels')
    check_pipeline = spec.get('check_pipeline', False)
    label_order = spec['label_order']

    for path, file_name in zip(tqdm.tqdm(paths), file_names):
        struct = loadmat(path, simplify_cells=True)['DATA_STRUCT']
        if check_pipeline and \
           struct['label_pipeline'].casefold() != prepro_mode.casefold():
            print('Wrong pipeline')
            break
        raw_label = struct[label_field]
        if valid_labels is not None and raw_label not in valid_labels:
            print('Wrong label')
            break
        eeg = {'data': struct['data'],
               'label': _encode_label(raw_label, label_order)}
        with open(save_path + file_name.split('.')[0] + '.pickle', 'wb') as handle:
            pickle.dump(eeg, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Write the summary of the current dataset, then convert all its files.
dataset_spec = DATASET_SPECS.get(ds_code)
if dataset_spec is None:
    print('Dataset ' + ds_code + ' is not supported: nothing was converted')
else:
    _write_summary(dataset_spec, paths[0])
    _convert_files(dataset_spec, paths, file_names, save_path, prepro_mode)