### Loading the data in Python

In [None]:
import h5py
import numpy as np
import pandas as pd
import glob
import os
from tqdm import tqdm


In [None]:
# Define a function to load the data from the hdf5 file
def load_h5py_file(file_path):
    """
    Read every trial group of one HDF5 file into a dict of parallel lists.

    Each top-level key of the file is treated as one trial. The dataset
    'input_features' and the attributes 'n_time_steps', 'session',
    'block_num' and 'trial_num' are required. The datasets 'seq_class_ids'
    and 'transcription' and the attributes 'seq_len' and 'sentence_label'
    are optional; a missing one is recorded as None.

    Args:
        file_path: Path of the HDF5 file, opened read-only.

    Returns:
        dict mapping column name -> list with one entry per trial, in the
        order the file yields its keys. The 'transcription' dataset is
        stored under the plural key 'transcriptions'.
    """
    columns = (
        'neural_features',
        'n_time_steps',
        'seq_class_ids',
        'seq_len',
        'transcriptions',
        'sentence_label',
        'session',
        'block_num',
        'trial_num',
    )
    data = {column: [] for column in columns}

    with h5py.File(file_path, 'r') as h5_file:
        # Snapshot the keys up front, then visit one trial group at a time
        for trial_key in list(h5_file.keys()):
            trial = h5_file[trial_key]
            attrs = trial.attrs

            # `[:]` copies the selected data out of the file object, so the
            # values stay usable after the file is closed
            record = {
                'neural_features': trial['input_features'][:],
                'n_time_steps': attrs['n_time_steps'],
                'seq_class_ids': trial['seq_class_ids'][:] if 'seq_class_ids' in trial else None,
                'seq_len': attrs['seq_len'] if 'seq_len' in attrs else None,
                'transcriptions': trial['transcription'][:] if 'transcription' in trial else None,
                'sentence_label': attrs['sentence_label'][:] if 'sentence_label' in attrs else None,
                'session': attrs['session'],
                'block_num': attrs['block_num'],
                'trial_num': attrs['trial_num'],
            }

            # Append only after every field was read successfully
            for column, value in record.items():
                data[column].append(value)
    return data


In [24]:
DATA_PATH = 'data/t15_copyTask_neuralData/hdf5_data_final'

# Recursively search every subdirectory of DATA_PATH for the files of one data split
def get_data_files(data_type='train'):
    """
    List the HDF5 files under DATA_PATH whose name ends in '<data_type>.hdf5'.

    Args:
        data_type: File-name suffix selecting the split, e.g. 'train',
            'valid' or 'test'.
            NOTE(review): the suffix must match the on-disk name exactly;
            confirm the validation files really end in 'valid.hdf5'.

    Returns:
        list of matching paths exactly as glob yields them (no sorting is
        applied); empty when nothing matches.
    """
    file_name_pattern = '*{}.hdf5'.format(data_type)
    # '**' combined with recursive=True matches any depth of subdirectories
    search_pattern = os.path.join(DATA_PATH, '**', file_name_pattern)
    return glob.glob(search_pattern, recursive=True)

In [27]:
# Collect the paths of all 'train' files found under DATA_PATH and display them
train_files = get_data_files('train')
train_files

['data/t15_copyTask_neuralData/hdf5_data_final/t15.2023.11.03/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2023.11.04/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2023.09.01/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2025.03.14/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2024.05.10/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2024.04.28/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2024.03.08/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2024.03.15/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2025.01.10/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2023.10.20/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2023.08.13/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data_final/t15.2023.08.25/data_train.hdf5',
 'data/t15_copyTask_neuralData/hdf5_data

In [28]:
# Load all data into a single DataFrame
def load_data(files):
    """
    Load several HDF5 files and stack their trials into one DataFrame.

    Each file is read with load_h5py_file and turned into its own
    DataFrame. The frames are concatenated once at the end, because calling
    pd.concat inside the loop re-copies all previously loaded rows on every
    iteration.

    Args:
        files: Iterable of HDF5 file paths, loaded in the given order.

    Returns:
        pandas.DataFrame with one row per trial and a fresh 0..n-1 index;
        an empty DataFrame when `files` is empty.
    """
    frames = [
        pd.DataFrame(load_h5py_file(file))
        for file in tqdm(files, desc="Loading data")
    ]

    # pd.concat raises on an empty list, so keep the empty-input result explicit
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


In [29]:
# Read every file listed in train_files and combine the trials into one DataFrame
train_df = load_data(train_files)

Loading data: 100%|██████████| 45/45 [00:57<00:00,  1.28s/it]


In [30]:
# Preview the first rows of the combined DataFrame
train_df.head()

Unnamed: 0,neural_features,n_time_steps,seq_class_ids,seq_len,transcriptions,sentence_label,session,block_num,trial_num
0,"[[-0.23488846, 0.5014211, -0.75813776, -0.5213...",544,"[6, 40, 36, 17, 21, 40, 15, 25, 40, 12, 5, 23,...",14,"[73, 32, 119, 105, 108, 108, 32, 103, 111, 32,...",I will go around.,t15.2023.11.03,1,0
1,"[[-0.23703846, 0.3616628, -0.7747983, 0.937466...",641,"[6, 40, 2, 22, 40, 31, 4, 20, 17, 24, 40, 31, ...",24,"[73, 32, 97, 109, 32, 116, 97, 108, 107, 105, ...",I am talking to my family.,t15.2023.11.03,1,1
2,"[[-0.23900536, -0.8310198, -0.75461406, -0.489...",694,"[17, 31, 40, 17, 38, 40, 21, 33, 20, 17, 24, 4...",22,"[73, 116, 32, 105, 115, 32, 108, 111, 111, 107...",It is looking quite hard.,t15.2023.11.03,1,2
3,"[[-0.24192314, 0.2851043, -0.7366848, -0.48644...",691,"[36, 6, 40, 9, 25, 23, 31, 40, 37, 34, 40, 20,...",19,"[87, 104, 121, 32, 100, 111, 110, 39, 116, 32,...",Why don't you come here.,t15.2023.11.03,1,3
4,"[[-0.24300373, -0.8231351, -0.71545774, -0.482...",918,"[6, 40, 37, 34, 39, 3, 36, 3, 21, 18, 40, 15, ...",29,"[73, 32, 117, 115, 117, 97, 108, 108, 121, 32,...",I usually go home by this time.,t15.2023.11.03,1,4


In [None]:
# Print the array shape of the neural features for the rows labelled 0 through 4
for row in range(5):
    features = np.array(train_df['neural_features'][row])
    print(f"Shape of data item {row}: {features.shape}")

Shape of data item 0: (544, 512)
Shape of data item 1: (641, 512)
Shape of data item 2: (694, 512)
Shape of data item 3: (691, 512)
Shape of data item 4: (918, 512)
