# BIDSAlign Matlab to Pickle converter

This notebook converts all mat files created with the BIDSAlign library to pickle
files. 

We perform this conversion because loading a pickle file in a Python environment is much more efficient than loading a .mat file.

This code is heavily based on the notebook available at the following [link](https://github.com/MedMaxLab/eegprepro/blob/main/MatlabToPickle.ipynb)


In [None]:
import glob 
import json
import numpy as np
import os
import pickle
import random
from   scipy.io import loadmat
import sys
import tqdm

In [None]:
# ------------------- SET PATH TO DATA -------------------
# Folder that holds the .mat files to convert.
# A trailing path separator is appended below when it is missing.

path_to_data = 'path/to/data/'
if not path_to_data.endswith(os.sep):
    path_to_data = path_to_data + os.sep

In [None]:
# ------------------- SET PATH TO SAVE -------------------
# Folder where the pickle files will be written.
# A trailing path separator is appended below when it is missing.

# ------------------------- NOTE -------------------------
# save path must be a directory containing the ica subdirectory
save_path = 'path/to/save/data/'
if not save_path.endswith(os.sep):
    save_path = save_path + os.sep

In [None]:
def _file_sort_key(path):
    """Return the numeric sort key of a .mat file path.

    The part of the file name before the first dot is split on '_' and its
    first four fields are converted to integers, so files are ordered
    numerically rather than lexicographically.
    """
    stem = os.path.basename(path).split('.')[0]
    return tuple(int(field) for field in stem.split('_')[:4])


paths = glob.glob(path_to_data + '*.mat')

# sort paths according to the numeric fields of the file name
paths = sorted(paths, key=_file_sort_key)

# extract file_name (kept aligned with paths, index by index)
file_names = [os.path.basename(path) for path in paths]

# name of the folder containing the .mat files; normpath drops the trailing
# separator and copes with mixed '/' and os.sep usage
data_folder = os.path.basename(os.path.normpath(path_to_data))

# extract ds code (first 8 characters of the folder name)
ds_code = data_folder[:8]

# extract preprocessing modality (everything after the first '_')
prepro_mode = "_".join(data_folder.split('_')[1:])
# startswith is safe on an empty string, unlike indexing with [0]
if prepro_mode.startswith("_"):
    prepro_mode = prepro_mode[1:]

# print ds_code and prepro_mode to double check that everything was extracted correctly
print(ds_code)
print(prepro_mode)

In [None]:
# double check cell. It verifies that file names and paths are aligned
# by printing five randomly chosen (path, file name) pairs.
if file_names:
    for _ in range(5):
        idx = random.randrange(len(file_names))
        print(paths[idx], file_names[idx])
else:
    # nothing to sample from: avoid the ValueError raised on an empty range
    print('no .mat files found')

In [None]:
# pick the save subfolder that matches the preprocessing modality;
# modalities not listed here leave save_path unchanged
subfolder_by_mode = {
    'ica': 'ica/',
    'iir_wica_ica': 'iir_wica_ica/',
    'iir_wica_ica_asr': 'iir_wica_ica/',
}
save_path += subfolder_by_mode.get(prepro_mode.casefold(), '')

# double check that the extracted save path is correct
print(save_path)

In [None]:
def _montage_info(mat_paths):
    """Return the channel-template entries of ``dataset_info``.

    The values are read from the first file in ``mat_paths`` so that they
    never depend on a ``data`` variable left over from a previous run.
    NOTE(review): assumes 'template' and 'pad_file' are identical for every
    file of a dataset -- confirm.

    Raises:
        FileNotFoundError: if ``mat_paths`` is empty.
    """
    if not mat_paths:
        raise FileNotFoundError(
            'No .mat file found: cannot extract the channel template'
        )
    first = loadmat(mat_paths[0], simplify_cells=True)['DATA_STRUCT']
    template = first['template']
    pad_file = first['pad_file']
    return {
        'template': template.tolist(),
        'channels_interpolated': pad_file.tolist(),
        # template entries whose pad_file value is positive
        'added_channels': template[pad_file > 0].tolist(),
    }


def _dump_eeg(data_struct, label, file_name):
    """Pickle the 'data' entry of ``data_struct`` together with ``label``.

    The output file is written in ``save_path`` and named after the part of
    ``file_name`` that precedes the first dot, with a '.pickle' extension.
    """
    eeg = {'data': data_struct['data'], 'label': label}
    with open(save_path + file_name.split('.')[0] + '.pickle', 'wb') as handle:
        pickle.dump(eeg, handle, protocol=pickle.HIGHEST_PROTOCOL)


#Task: Eyes Open - Eyes Closed - Only control
if ds_code == 'ds004148':
    dataset_info = {'dataset_id': 2, 'tot_subjects': 60, 'tot_sessions': 2,
                    'sampling_rate': 250, 'reference': 'Common Average',
                    **_montage_info(paths),
                    'labels': {0: 'EC', 1: 'EO'},
                   }
    for mat_path, file_name in zip(tqdm.tqdm(paths), file_names):
        data = loadmat(mat_path, simplify_cells=True)
        # NOTE(review): every file is stored with label 0; the mapping
        # "0 if label_group == 'EC' else 1" was disabled in the original
        # code -- confirm this is intended.
        label = 0
        _dump_eeg(data['DATA_STRUCT'], label, file_name)

# Task: Control - Parkinson's disease
elif ds_code == 'ds002778':
    dataset_info = {'dataset_id': 8, 'tot_subjects': 31, 'tot_sessions': 1,
                    'sampling_rate': 250, 'reference': 'Common Average',
                    **_montage_info(paths),
                    'labels': {0: 'Control (C)', 1: 'Parkinson (POFF)'},
                   }
    for mat_path, file_name in zip(tqdm.tqdm(paths), file_names):
        data = loadmat(mat_path, simplify_cells=True)
        # group 'C' maps to 0, any other group maps to 1
        label = 0 if data['DATA_STRUCT']['label_group'] == 'C' else 1
        _dump_eeg(data['DATA_STRUCT'], label, file_name)


# Task: Control - Parkinson's disease
elif ds_code == 'ds003490':
    dataset_info = {'dataset_id': 5, 'tot_subjects': 50, 'tot_sessions': 1,
                    'sampling_rate': 250, 'reference': 'Common Average',
                    **_montage_info(paths),
                    'labels': {0: 'Control (C)', 1: 'Parkinson (POFF)'},
                   }
    for mat_path, file_name in zip(tqdm.tqdm(paths), file_names):
        data = loadmat(mat_path, simplify_cells=True)
        # group 'C' maps to 0, any other group maps to 1
        label = 0 if data['DATA_STRUCT']['label_group'] == 'C' else 1
        _dump_eeg(data['DATA_STRUCT'], label, file_name)

# Task: Cognitive - 4 Class Parkinson's disease
elif ds_code == 'ds004584':
    dataset_info = {'dataset_id': 19, 'tot_subjects': 149,
                    'tot_sessions': 1, 'sampling_rate': 250,
                    'reference': 'Common Average',
                    **_montage_info(paths),
                    'labels': {
                        0: 'Control (C)',
                        1: 'Parkinson Disease (PD)',
                        2: 'Parkinson Disease with Mild Cognitive Impairment (PDMCI)',
                        3: 'Parkinson Disease with Dementia (PDD)',
                    },
                   }
    # integer class assigned to each value of subj_info['GROUP']
    group_to_label = {'Control': 0, 'PD': 1, 'PDMCI': 2, 'PDD': 3}
    for mat_path, file_name in zip(tqdm.tqdm(paths), file_names):
        data = loadmat(mat_path, simplify_cells=True)
        group = data['DATA_STRUCT']['subj_info']['GROUP']
        if group not in group_to_label:
            raise ValueError('Wrong Label')
        label = group_to_label[group]
        _dump_eeg(data['DATA_STRUCT'], label, file_name)