In [2]:

import h5py
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import os

# --- Define the base directory ---
BASE_DIR = '/kaggle/input/brain-to-text-25-data/t15_copyTask_neuralData/hdf5_data_final'

NEURAL_DATA_KEY = 'input_features'
TRANSCRIPTION_KEY = 'transcription'

def load_metadata_from_hdf5(file_path):
    """
    Correctly loads metadata based on the now-known HDF5 structure.
    """
    metadata = []
    try:
        with h5py.File(file_path, 'r') as f:
            # The top-level keys ARE the trials.
            for trial_key in f.keys():
                trial_group = f[trial_key]
                
                # Check if the group contains the correct dataset names
                if isinstance(trial_group, h5py.Group) and NEURAL_DATA_KEY in trial_group and TRANSCRIPTION_KEY in trial_group:
                    
                    num_time_bins = trial_group[NEURAL_DATA_KEY].shape[0]
                    
                    # The transcription is an array of integers, not a string.
                    # We will load it as a list of numbers for now.
                    transcription_ids = list(trial_group[TRANSCRIPTION_KEY][()])
                    
                    metadata.append({
                        'trial_id': trial_key,
                        'num_time_bins': num_time_bins,
                        'transcription_ids': transcription_ids,
                        # We can't get num_words directly yet, so we'll estimate from the length of the ID list.
                        # This might not be perfect but is a good start.
                        'num_words_estimate': len(transcription_ids) 
                    })
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        import traceback
        traceback.print_exc()
        
    return metadata

def load_test_metadata_from_hdf5(file_path):
    """ Loads test data which only has input_features. """
    metadata = []
    try:
        with h5py.File(file_path, 'r') as f:
            for trial_key in f.keys():
                trial_group = f[trial_key]
                if isinstance(trial_group, h5py.Group) and NEURAL_DATA_KEY in trial_group:
                    num_time_bins = trial_group[NEURAL_DATA_KEY].shape[0]
                    metadata.append({
                        'trial_id': trial_key,
                        'num_time_bins': num_time_bins
                    })
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return metadata


# --- Main Loading Loop ---
all_metadata = []
session_dirs = sorted([d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d))])

for session in tqdm(session_dirs, desc="Processing Sessions"):
    session_path = os.path.join(BASE_DIR, session)
    for split in ['train', 'val', 'test']:
        file_name = f'data_{split}.hdf5'
        file_path = os.path.join(session_path, file_name)
        
        if os.path.exists(file_path):
            if split == 'test':
                session_metadata = load_test_metadata_from_hdf5(file_path)
            else:
                session_metadata = load_metadata_from_hdf5(file_path)
            
            # Add session and split info to the found trials
            for item in session_metadata:
                item['session'] = session
                item['split'] = split
            all_metadata.extend(session_metadata)

df = pd.DataFrame(all_metadata)

# --- Final Verification and Display ---
print(f"Loaded a total of {len(df)} trials.")
if not df.empty:
    print(f"Data splits:\n{df['split'].value_counts()}")
    # We no longer have 'sentence_text', so display the new columns
    display(df.head())
else:
    print("DataFrame is still empty. This indicates a very unusual issue.")


Processing Sessions:   0%|          | 0/45 [00:00<?, ?it/s]

Loaded a total of 10948 trials.
Data splits:
split
train    8072
test     1450
val      1426
Name: count, dtype: int64


Unnamed: 0,trial_id,num_time_bins,transcription_ids,num_words_estimate,session,split
0,trial_0000,321,"[66, 114, 105, 110, 103, 32, 105, 116, 32, 99,...",500.0,t15.2023.08.11,train
1,trial_0001,481,"[77, 121, 32, 102, 97, 109, 105, 108, 121, 32,...",500.0,t15.2023.08.11,train
2,trial_0002,480,"[87, 104, 97, 116, 32, 100, 111, 32, 116, 104,...",500.0,t15.2023.08.11,train
3,trial_0003,502,"[72, 111, 119, 32, 105, 115, 32, 116, 104, 97,...",500.0,t15.2023.08.11,train
4,trial_0004,402,"[78, 101, 101, 100, 32, 104, 101, 108, 112, 32...",500.0,t15.2023.08.11,train


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df.head()

In [3]:
import h5py
import sys

def load_h5py_file(file_path):
    data = {
        'neural_features': [],
        'n_time_steps': [],
        'seq_class_ids': [],
        'seq_len': [],
        'transcriptions': [],
        'sentence_label': [],
        'session': [],
        'block_num': [],
        'trial_num': [],
        'corpus': [],
    }
    # Open the hdf5 file for that day
    with h5py.File(file_path, 'r') as f:

        keys = list(f.keys())

        # For each trial in the selected trials in that day
        for key in keys:
            g = f[key]

            neural_features = g['input_features'][:]
            n_time_steps = g.attrs['n_time_steps']
            seq_class_ids = g['seq_class_ids'][:] if 'seq_class_ids' in g else None
            seq_len = g.attrs['seq_len'] if 'seq_len' in g.attrs else None
            transcription = g['transcription'][:] if 'transcription' in g else None
            sentence_label = g.attrs['sentence_label'][:] if 'sentence_label' in g.attrs else None
            session = g.attrs['session']
            block_num = g.attrs['block_num']
            trial_num = g.attrs['trial_num']

            # match this trial up with the csv to get the corpus name
            year, month, day = session.split('.')[1:]
            date = f'{year}-{month}-{day}'

            data['neural_features'].append(neural_features)
            data['n_time_steps'].append(n_time_steps)
            data['seq_class_ids'].append(seq_class_ids)
            data['seq_len'].append(seq_len)
            data['transcriptions'].append(transcription)
            data['sentence_label'].append(sentence_label)
            data['session'].append(session)
            data['block_num'].append(block_num)
            data['trial_num'].append(trial_num)
    return data

#Generate all referencable file names in the main dataset, split into train, test, and val.
def generate_file_names(BASE_DIR):
    file_names_train = []
    file_names_test = []
    file_names_val = []
    for folder in os.listdir(BASE_DIR):
        folder_name = BASE_DIR + '/' + folder
        for file in os.listdir(folder_name):
            file_name = BASE_DIR + '/' + folder + '/' + file
            if file == 'data_train.hdf5':
                file_names_train.append(file_name)
            elif file == 'data_test.hdf5':
                file_names_test.append(file_name)
            elif file == 'data_val.hdf5':
                file_names_val.append(file_name)
            else:
                raise Exception('Unrecognized file name')
            

    return file_names_train, file_names_test, file_names_val


file_names_train, file_names_test, file_names_val = generate_file_names(BASE_DIR)

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from kaggle_secrets import UserSecretsClient
from peft import IA3Config, get_peft_model
import os
import h5py
import re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

NEURAL_DATA_KEY = 'input_features'
TRANSCRIPTION_KEY = 'transcription'

# Metadata loading function provided by competition host
def load_metadata_from_hdf5(file_path):
    """
    Correctly loads metadata based on the now-known HDF5 structure.
    """
    metadata = []
    try:
        with h5py.File(file_path, 'r') as f:
            # The top-level keys ARE the trials.
            for trial_key in f.keys():
                trial_group = f[trial_key]
                
                # Check if the group contains the correct dataset names
                if isinstance(trial_group, h5py.Group) and NEURAL_DATA_KEY in trial_group and TRANSCRIPTION_KEY in trial_group:
                    
                    num_time_bins = trial_group[NEURAL_DATA_KEY].shape[0]
                    
                    # The transcription is an array of integers, not a string.
                    # We will load it as a list of numbers for now.
                    transcription_ids = list(trial_group[TRANSCRIPTION_KEY][()])
                    
                    metadata.append({
                        'trial_id': trial_key,
                        'num_time_bins': num_time_bins,
                        'transcription_ids': transcription_ids,
                        # We can't get num_words directly yet, so we'll estimate from the length of the ID list.
                        # This might not be perfect but is a good start.
                        'num_words_estimate': len(transcription_ids) 
                    })
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        import traceback
        traceback.print_exc()
        
    return metadata

# Metadata loading function provided by competition host
def load_test_metadata_from_hdf5(file_path):
    """ Loads test data which only has input_features. """
    metadata = []
    try:
        with h5py.File(file_path, 'r') as f:
            for trial_key in f.keys():
                trial_group = f[trial_key]
                if isinstance(trial_group, h5py.Group) and NEURAL_DATA_KEY in trial_group:
                    num_time_bins = trial_group[NEURAL_DATA_KEY].shape[0]
                    metadata.append({
                        'trial_id': trial_key,
                        'num_time_bins': num_time_bins
                    })
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return metadata

def decode_transcription(ids: np.ndarray) -> str:
    """
    Converts a NumPy array of ASCII character IDs to a string, 
    stopping at the first null (0) character.
    """
    if not isinstance(ids, np.ndarray):
        ids = np.array(ids, dtype=np.uint8)

    zero_indices = np.where(ids == 0)[0]
    end = zero_indices[0] if zero_indices.size > 0 else len(ids)
    return "".join(map(chr, ids[:end]))


class b2tDataset(Dataset):
    
    '''
    Provides a dataset for accessing training, validation, and test data.
    
    sets: [train|val|test] specifies the split held by the dataset
    dir:  [str] directory to search for hdf5 data files
    '''
    def __init__(self, 
                 sets="train",
                 dir="/kaggle/input/brain-to-text-25-data/t15_copyTask_neuralData/hdf5_data_final/",
                 tokenizer = None
                ):
        
        self.set_type = sets
        self.files = []
        self.session_counts = []
        self.all_metadata = []

        '''if tokenizer != None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")'''
    
        #Get a list of files for the datasplit
        for dirname, _, filenames in os.walk(dir):
            for filename in filenames:
                if self.set_type in filename:
                    self.files.append(os.path.join(dirname, filename))

        # Get metadata for the files in the dataset
        for file_path in tqdm(self.files, desc="Processing sessions"):
            if os.path.exists(file_path):
                if self.set_type == 'test':
                    session_metadata = load_test_metadata_from_hdf5(file_path)
                else:
                    session_metadata = load_metadata_from_hdf5(file_path)
                
                self.session_counts.append(len(session_metadata))
                self.all_metadata.extend(session_metadata) 
    
    def __len__(self):
        return sum(self.session_counts)

    def __getitem__(self, idx):
        total = 0
        for i,count in enumerate(self.session_counts):
            total += count
            if total > idx:
                break
        file = self.files[i]
        index = idx - total + count

        target = None
        with h5py.File(file, 'r') as f:
            inputs = torch.from_numpy(f[list(f.keys())[index]]['input_features'][:])
            if self.set_type != "test":
                target = f[list(f.keys())[index]]['transcription'][:]

        target = decode_transcription(target)
        #target = self.tokenizer.encode(target, return_tensors="pt")
        
        return (inputs, target)

    def __getiteminputs__(self,idx):
        total = 0
        for i,count in enumerate(self.session_counts):
            total += count
            if total > idx:
                break
        file = self.files[i]
        index = idx - total + count

        with h5py.File(file, 'r') as f:
            inputs = torch.from_numpy(f[list(f.keys())[index]]['input_features'][:])
       
        return inputs 
    

# Custom collation function to handle variable seq lengths
def b2tCollate_fn(batch):
    inputs, targets = zip(*batch)

    max_input_len = max([inp.shape[0] for inp in inputs])
    max_target_len = max([target.shape[1] for target in targets])

    # Compute attention-masks
    attn_masks = [torch.ones(inp.shape[0]) for inp in inputs]
    attn_masks = [F.pad(mask, (0,max_input_len-mask.shape[0])) for mask in attn_masks]
    attn_masks = torch.stack(attn_masks)
    
    # Pad and stack the input signals
    inputs = [F.pad(inp, (0,0,0,max_input_len-inp.shape[0])) for inp in inputs]
    inputs = torch.stack(inputs)
    
    # Pad and stack the target sentences
    targets = [F.pad(target, (0,max_target_len-target.shape[1])) for target in targets]
    targets = torch.stack(targets).squeeze()

    return (inputs, attn_masks, targets)


2025-08-05 15:53:20.644985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754409200.867223      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754409200.933634      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Generate datasets used to create the graphs.

What questions are interesting for variance?
1. Variance across the whole dataset in terms of length
2. Variance across each file (day)
3. Variance in terms of particpant (is this the same as the day?)
   

In [6]:
import numpy as np

training_data = b2tDataset(sets="train")
lengths_vs_target = []
for i in range(training_data.__len__()):
    #look at dataloader, see if it's faster
    input_features,target = training_data.__getitem__(i)
    lengths_vs_target.append((input_features.shape[0], target))

lengths_arr = np.array([], dtype=int)
targets_arr = np.array([], dtype=str)
targets_length_arr = np.array([], dtype=int)
for i in lengths_vs_target:
    lengths_arr = np.append(lengths_arr,int(i[0]))
    targets_arr = np.append(targets_arr,str(i[1]))
    targets_length_arr = np.append(targets_length_arr,len(i[1]))

Processing sessions:   0%|          | 0/45 [00:00<?, ?it/s]

In [7]:
print("Mean timeseries length: " + str(lengths_arr.mean()))
print("Max timeseries length: " + str(lengths_arr.max()))
print("Min timeseries length: " + str(lengths_arr.min()))
print("Variance across all timeseries: " + str(np.var(lengths_arr))) #change to stdev

Mean timeseries length: 874.8405599603568
Max timeseries length: 2475
Min timeseries length: 138
Variance across all timeseries: 95035.90309720873


Variance is high!

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr

corr, _ = pearsonr(lengths_arr, targets_length_arr)

plt.scatter(lengths_arr, targets_length_arr)
plt.title(f"Scatter plot showing correlation between timeseries length and target length with correlation = {corr:.2f}")
plt.xlabel("Timeseries Length")
plt.ylabel("Target Length")
plt.grid(True)
plt.show()

This shows that there is a fairly strong correlation between timeseries length and target sentence length, with the Pearson's Correlation Coefficient of 0.71. But you may be wondering, what are those outliers? What are the long sentences? And which sentences relate to the longest timeseries? Let's take a look at that.

In [None]:
print("Sentences with Length Greater than 80 Characters: \n")
count = 0

for i in lengths_vs_target:
    if len(i[1]) > 80:
        print("Sentence " + str(count) + ": " + i[1])
        count += 1

Interestingly, the sentences above are all testing sentences it seems, potentially before the participant was ready. Should we throw these out? I feel like there's a chance they will throw off the data.

One thing I am intrigued by is whether the longest timeseries length sentences relate to more complex sentences. Let's look at the longest timeseries.

In [None]:
print("Sentences with Timeseries Length greater than 2000")

count = 0
for i in lengths_vs_target:
    if int(i[0]) > 2000:
        print("Sentence " + str(count) + ": " + i[1] + " Timeseries length: " + str(i[0]))
        count += 1

Interestingly, some of these sentences seem to relate to complex ideas: "This is not a suitable method to measure and rank the value of coins." and "The Dallas Cowboys are going to have a problem if their quarterback gets hurt." while some seem to relate to very simple things "I like how they tell it." and "If they work at it a little bit." Note to check in on how the study was conducted, how did they decide how long to pull in timeseries data? The brain likely doesn't just go blank once the thought is complete, so was it up to the study director to decide when to stop recording?

In [None]:
print("Sentences with Timeseries Length less than 300: \n")

count = 0
for i in lengths_vs_target:
    if int(i[0]) < 300:
        print("Sentence " + str(count) + ": " + i[1] + " Timeseries length: " + i[0])
        count += 1

What is the meaning of those [DO NOTHING] and [RIGHT HAND - CLOSE]? And what is up with "What's his whim to decide it should be two months?" That seems to be a strangely long sentence for the timeseries length, as well as being a strange phrasing. It doesn't seem like incredibly natural speech to me, might need to throw that piece of data out too.

Might need to tweak the model so that it relies more on the single words than the context, because the sentence correlations are not strong. 