## make_stim_resp.ipynb
This notebook creates the stim and resp matrices that serve as our STRF model inputs. It also separates them into training and validation sets according to sentence ID: 80% of our sentences serve as a training set, while the remaining 20% are held out for validation. This data is saved out to an hdf5 file for access by `fit_mtrf.ipynb`, the notebook where we fit our models.

Each hdf5 file corresponds to an individual subject, with each key corresponding to a different portion of the dataset: `['tStim','tResp','vStim','vResp']`:

* `tStim`, `vStim`: shape of (samps x features). These arrays are _not_ delayed. To make them delayed, use the function `mtrf_utils.make_delayed()`.
* `tResp`, `vResp`: shape of (samps x chs). These arrays are _not_ z-scored. To z-score them, use the function `mtrf_utils.z_score()`.

You will need to index these matrices according to which model you are fitting, as they contain the superset of the features used across all models (i.e., the features of `model2`, so if you're fitting `model2` then no extra indexing is necessary).

In [2]:
# Paths - Update locally!
# Root of the local clone of the code repository; event files, textgrids and
# the saved hdf5 model inputs are all addressed relative to it.
git_path = "/path/to/git/kurteff2024_code/"
# Root of the BIDS dataset that holds the per-subject recordings.
data_path = "/path/to/bids/dataset/"

In [1]:
# data loading / manip
import mne
import re
import numpy as np
import os
import csv
import h5py
import warnings
from tqdm.notebook import tqdm
import sys
sys.path.append(os.path.join(git_path,"preprocessing","events","textgrids"))
import textgrid

In [3]:
# Subjects that are dropped from the analysis / that lack imaging.
exclude = ["TCH8"]
no_imaging = ["S0010"]

# Every entry of the event-csv folder whose name contains one of the two
# subject-ID prefixes is a subject, unless it is listed in `exclude`.
# Order follows os.listdir(), i.e. whatever the filesystem returns.
subjs = [
    name for name in os.listdir(os.path.join(git_path,"preprocessing","events","csv"))
    if ("TCH" in name or "S0" in name) and name not in exclude
]


def _blocks_with_sentence_events(subj):
    """Return the block suffixes of `subj` that have a speaker sentence event file.

    A block folder is any entry of the subject's csv folder whose name contains
    "<subj>_B"; it is kept only if "<folder>_spkr_sn_all.txt" exists inside it.
    The returned label is the part of the folder name after the last "_".
    """
    subj_dir = os.path.join(git_path,"preprocessing","events","csv",subj)
    found = []
    for entry in os.listdir(subj_dir):
        if f"{subj}_B" not in entry:
            continue
        if os.path.isfile(os.path.join(subj_dir,entry,f"{entry}_spkr_sn_all.txt")):
            found.append(entry.split("_")[-1])
    return found


# subject -> list of block labels
blocks = {subj: _blocks_with_sentence_events(subj) for subj in subjs}

In [4]:
# Phones that make up the 'syllabic' class. The same phones, in the same order,
# open the 'voiced' and 'sonorant' classes, so they are written out only once.
_vowels = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'eh', 'ey', 'ih', 'ix', 'iy',
           'ow', 'oy', 'uh', 'uw', 'ux']

# Feature name -> list of phone labels carrying that feature.
# Insertion order matters: it fixes the order of `features` below.
features_dict = dict(
    dorsal=['y', 'w', 'k', 'kcl', 'g', 'gcl', 'eng', 'ng'],
    coronal=['ch', 'jh', 'sh', 'zh', 's', 'z', 't', 'tcl', 'd', 'dcl', 'n', 'th', 'dh', 'l', 'r'],
    labial=['f', 'v', 'p', 'pcl', 'b', 'bcl', 'm', 'em', 'w'],
    high=['uh', 'ux', 'uw', 'iy', 'ih', 'ix', 'ey', 'eh', 'oy'],
    front=['iy', 'ih', 'ix', 'ey', 'eh', 'ae', 'ay'],
    low=['aa', 'ao', 'ah', 'ax', 'ae', 'aw', 'ay', 'axr', 'ow', 'oy'],
    back=['aa', 'ao', 'ow', 'ah', 'ax', 'ax-h', 'uh', 'ux', 'uw', 'axr', 'aw'],
    plosive=['p', 'pcl', 't', 'tcl', 'k', 'kcl', 'b', 'bcl', 'd', 'dcl', 'g', 'gcl', 'q'],
    fricative=['f', 'v', 'th', 'dh', 's', 'sh', 'z', 'zh', 'hh', 'hv', 'ch', 'jh'],
    syllabic=list(_vowels),
    nasal=['m', 'em', 'n', 'en', 'ng', 'eng', 'nx'],
    voiced=_vowels + ['w', 'y', 'el', 'l', 'r', 'dh', 'z', 'v', 'b', 'bcl', 'd', 'dcl', 'g', 'gcl',
                      'm', 'em', 'n', 'en', 'eng', 'ng', 'nx', 'q', 'jh', 'zh'],
    obstruent=['b', 'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'f', 'g', 'gcl', 'hh', 'hv', 'jh', 'k',
               'kcl', 'p', 'pcl', 'q', 's', 'sh', 't', 'tcl', 'th', 'v', 'z', 'zh', 'q'],
    sonorant=_vowels + ['w', 'y', 'el', 'l', 'r', 'm', 'n', 'ng', 'eng', 'nx', 'en', 'em'],
)

# Ordered feature names; a feature's position here is its row index in the stimulus matrices.
features = list(features_dict)
print(features)
print(len(features))

['dorsal', 'coronal', 'labial', 'high', 'front', 'low', 'back', 'plosive', 'fricative', 'syllabic', 'nasal', 'voiced', 'obstruent', 'sonorant']
14


### Step 1. Load raw

In [5]:
# raws[subject][block] -> preloaded mne Raw object
# ch_names[subject]    -> channel names taken from that subject's recordings
raws = {}
ch_names = {}
for s in tqdm(subjs):
    subj_raws = {}
    raws[s] = subj_raws
    for b in blocks[s]:
        blockid = f"{s}_{b}"
        raw_fpath = os.path.join(
            data_path, f"sub-{s}", b, blockid, "HilbAA_70to150_8band", "ecog_hilbAA70to150.fif"
        )
        this_raw = mne.io.read_raw_fif(raw_fpath, preload=True, verbose=False)
        subj_raws[b] = this_raw
        # Reassigned on every block, so the last block read is the one that is kept.
        ch_names[s] = this_raw.info['ch_names']

  0%|          | 0/16 [00:00<?, ?it/s]

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  return str(d.tostring().decode('latin1', 'ignore'))
  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')]

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring

  raws[s][b] = mne.io.read_raw_fif(raw_fpath,preload=True,verbose=False)
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  ch_name = ch_name[:np.argmax(ch_name == 

  ch_name = ch_name[:np.argmax(ch_name == b'')].tostring()
  return str(d.tostring().decode('latin1', 'ignore'))


### Step 2. Read events from eventfiles

In [6]:
def _read_phoneme_events(event_fpath, fs):
    """Load a tab-delimited phoneme event file with onset/offset converted to samples.

    Parameters
    ----------
    event_fpath : str
        Path to the event file. Column index 3 of every row is dropped before
        conversion; all remaining columns must parse as floats.
    fs : float
        Sampling rate used to convert the first two columns (onset and offset,
        in seconds) to sample indices, rounded to the nearest sample.

    Returns
    -------
    np.ndarray
        Float array with one row per event: [onset_samp, offset_samp, label, ...].
        An empty (0, 3) array is returned if the file contains no rows.
    """
    rows = []
    with open(event_fpath,'r') as f:
        for row in csv.reader(f, delimiter="\t"):
            if row:  # csv.reader yields [] for blank lines
                rows.append(row[:3]+row[4:])
    if not rows:
        return np.empty((0, 3))
    events = np.array(rows,dtype=float)
    events[:,:2] = np.round(events[:,:2]*fs)
    return events


# spkr_events[subject][block] / mic_events[subject][block] -> int array of phoneme events
spkr_events, mic_events = dict(), dict()
for s in tqdm(subjs):
    # Sampling rate is taken from the subject's first loaded block
    fs = raws[s][list(raws[s].keys())[0]].info['sfreq']
    spkr_events[s], mic_events[s] = dict(), dict()
    for b in blocks[s]:
        blockid = "_".join([s,b])
        # Both event files are read from the same per-block folder as every other
        # event file in this notebook (preprocessing/events/csv).
        csv_dir = os.path.join(git_path,"preprocessing","events","csv",s,blockid)
        tg_dir = os.path.join(git_path,"preprocessing","events","textgrids",s,blockid)
        # Spkr events
        spkr_events[s][b] = _read_phoneme_events(
            os.path.join(csv_dir, f"{blockid}_spkr_ph_all.txt"), fs).astype(int)
        # Mic events (need non-task times removed)
        mic_all = _read_phoneme_events(os.path.join(csv_dir, f"{blockid}_mic_ph_all.txt"), fs)
        # The textgrid is opened from the same folder that is listed to find it
        # (the filename match is case-insensitive).
        tg_fname = [f for f in os.listdir(tg_dir)
                    if f'{blockid}_mic.textgrid'.lower() in f.lower()][0]
        with open(os.path.join(tg_dir, tg_fname)) as r:
            tg = textgrid.TextGrid(r.read())
        # Must be a fresh list for every block: it is turned into an array below.
        task_times = []
        for row in tg.tiers[2].simple_transcript:
            if row[2] == 'task':
                # builtin float: the np.float alias no longer exists in current NumPy
                task_times.append([float(row[0]), float(row[1])])
        # Convert it to an array of [start, stop] sample indices (truncated to int)
        task_times = (np.array(task_times).reshape(-1, 2)*fs).astype(int)
        # Keep only events whose onset AND offset fall inside a task interval.
        # Intervals are half-open, [start, stop), matching np.arange(start, stop).
        trial_events = []
        ev_onsets, ev_offsets = mic_all[:,0], mic_all[:,1]
        for t_start, t_stop in task_times:
            in_task = ((ev_onsets >= t_start) & (ev_onsets < t_stop) &
                       (ev_offsets >= t_start) & (ev_offsets < t_stop))
            trial_events.append(mic_all[in_task])
        if trial_events:
            kept = np.vstack(trial_events)
        else:
            kept = np.empty((0, mic_all.shape[1]))
        mic_events[s][b] = kept.astype(int)

  0%|          | 0/16 [00:00<?, ?it/s]

### Step 3. Make `stim` and `resp` matrix
Array shapes:

* `stim`: samps x feats
* `resp`: chs x samps

In [None]:
        # NOTE(review): this cell has no execution count (In [None]) and is an indented
        # fragment with no enclosing loop, so it cannot run on its own. It reads s, blockid,
        # fs and nsamps, none of which are defined here. It loads the `_mic_sn_el.txt` and
        # `_mic_sn_sh.txt` files, which the Step 3 stim/resp cell also reads -- this looks
        # like a leftover draft; consider deleting it.
        # Mic events, consistent (el) condition
        el_sh_stim = np.zeros((2, nsamps))
        el_evs = np.loadtxt(os.path.join(git_path,"preprocessing","events","csv",s,
                                         blockid,f"{blockid}_mic_sn_el.txt"), delimiter="\t", dtype=str)
        # If statement for blocks that don't have el condition
        if len(el_evs) > 0:
            # Columns 0 and 1 are multiplied by fs and truncated to integer sample indices
            mic_el_onsets = (el_evs[:,0].astype(float)*fs).astype(int)
            mic_el_offsets = (el_evs[:,1].astype(float)*fs).astype(int)
        else:
            mic_el_onsets, mic_el_offsets = [], []
        # One row per event: [onset, offset]
        mic_el_times = np.vstack((mic_el_onsets,mic_el_offsets)).T
        # Mic events, inconsistent (sh) condition
        sh_evs = np.loadtxt(os.path.join(git_path,"preprocessing","events","csv",s,
                                         blockid,f"{blockid}_mic_sn_sh.txt"), delimiter="\t", dtype=str)
        # If statement for blocks that don't have sh condition (e.g., S0023_B2)
        if len(sh_evs) > 0:
            mic_sh_onsets = (sh_evs[:,0].astype(float)*fs).astype(int)
            mic_sh_offsets = (sh_evs[:,1].astype(float)*fs).astype(int)
        else:
            mic_sh_onsets, mic_sh_offsets = [], []
        # One row per event: [onset, offset]
        mic_sh_times = np.vstack((mic_sh_onsets,mic_sh_offsets)).T

In [7]:
def _load_condition_times(event_fpath, fs):
    """Return an (n, 2) int array of [onset, offset] sample indices from an event file.

    Parameters
    ----------
    event_fpath : str
        Tab-delimited event file whose first two columns are onset and offset in seconds.
    fs : float
        Sampling rate used to convert seconds to (truncated) sample indices.

    Returns
    -------
    np.ndarray
        Shape (n_events, 2); shape (0, 2) when the file is empty, which happens
        for blocks that lack the condition.
    """
    with warnings.catch_warnings():
        # np.loadtxt warns on empty input; an empty file is an expected case here
        warnings.simplefilter("ignore", UserWarning)
        # ndmin=2 keeps a single-row file two-dimensional so column slicing works
        evs = np.loadtxt(event_fpath, delimiter="\t", dtype=str, ndmin=2)
    if evs.size == 0:
        return np.empty((0, 2), dtype=int)
    return (evs[:, :2].astype(float)*fs).astype(int)


def _mark_phoneme_events(events, el_times, sh_times, phn_stim, feat_stim, el_sh_stim):
    """Set impulses at each event onset in the given stimulus arrays (modified in place).

    For every event (row: [onset, offset, phoneme index, ...]):
      * el_sh_stim[0, onset] / el_sh_stim[1, onset] is set to 1 when the onset lies
        inside any [onset, offset] interval (inclusive) of el_times / sh_times;
      * phn_stim[phoneme index, onset] is set to 1;
      * feat_stim[fi, onset] is set to 1 for every feature whose phone list in
        `features_dict` contains the phoneme label (lower-cased, digits removed).
    Reads the module-level `phonemes`, `features` and `features_dict`.
    """
    for ev in events:
        onset, phn_label = ev[0], ev[2]
        if np.any((el_times[:,0] <= onset) & (onset <= el_times[:,1])):
            el_sh_stim[0,onset] = 1
        if np.any((sh_times[:,0] <= onset) & (onset <= sh_times[:,1])):
            el_sh_stim[1,onset] = 1
        phn_stim[phn_label,onset] = 1
        phn_stripped = re.sub(r'\d+', '', phonemes[phn_label].lower())
        for fi, f in enumerate(features):
            if phn_stripped in features_dict[f]:
                feat_stim[fi,onset] = 1


stims, resps = dict(), dict()
# One phoneme label per line; the line number is the index used in the event files.
# Lines are read verbatim as text so the labels are not altered by np.loadtxt's
# numeric conversion or delimiter/comment handling.
with open(os.path.join(git_path,"preprocessing","events","csv","phonemes.txt"),'r') as f:
    phonemes = np.array([line.strip() for line in f if line.strip()])
nphones = len(phonemes); nfeats = len(features)
for s in tqdm(subjs):
    stims[s], resps[s] = dict(), dict()
    for b in blocks[s]:
        blockid = "_".join([s,b])
        csv_dir = os.path.join(git_path,"preprocessing","events","csv",s,blockid)
        # Use this recording's own sampling rate rather than whatever value of
        # `fs` was left over from a previous cell.
        fs = raws[s][b].info['sfreq']
        # Make the resp (chs x samps)
        resps[s][b] = raws[s][b].get_data(); nsamps = resps[s][b].shape[1]
        # Get timing of consistent (el) and inconsistent (sh) playback
        # row 0 = el, row 1 = sh; shared by spkr and mic events
        el_sh_stim = np.zeros((2, nsamps))
        # Perception (spkr), consistent / inconsistent
        spkr_el_times = _load_condition_times(os.path.join(csv_dir, f"{blockid}_spkr_sn_el.txt"), fs)
        spkr_sh_times = _load_condition_times(os.path.join(csv_dir, f"{blockid}_spkr_sn_sh.txt"), fs)
        # Production (mic), consistent / inconsistent
        mic_el_times = _load_condition_times(os.path.join(csv_dir, f"{blockid}_mic_sn_el.txt"), fs)
        mic_sh_times = _load_condition_times(os.path.join(csv_dir, f"{blockid}_mic_sn_sh.txt"), fs)
        # Make spkr stim
        phn_stim_spkr = np.zeros((nphones, nsamps)); feat_stim_spkr = np.zeros((nfeats, nsamps))
        _mark_phoneme_events(spkr_events[s][b], spkr_el_times, spkr_sh_times,
                             phn_stim_spkr, feat_stim_spkr, el_sh_stim)
        # Make mic stim
        phn_stim_mic = np.zeros((nphones, nsamps)); feat_stim_mic = np.zeros((nfeats, nsamps))
        _mark_phoneme_events(mic_events[s][b], mic_el_times, mic_sh_times,
                             phn_stim_mic, feat_stim_mic, el_sh_stim)
        # Concatenate stimulus features, then transpose to samps x feats.
        # Column layout: nfeats combined (spkr + mic) features, nfeats spkr features,
        # nfeats mic features, 1 spkr phoneme-onset column, 1 mic phoneme-onset column,
        # 2 condition columns (el, sh).
        stims[s][b] = np.vstack(((feat_stim_spkr + feat_stim_mic), feat_stim_spkr, feat_stim_mic,
            np.atleast_2d(phn_stim_spkr.sum(0)), np.atleast_2d(phn_stim_mic.sum(0)), el_sh_stim)).T

  0%|          | 0/16 [00:00<?, ?it/s]

  sh_evs = np.loadtxt(spkr_sh, dtype=str, delimiter="\t")
  sh_evs = np.loadtxt(mic_sh, dtype=str, delimiter="\t")
  sh_evs = np.loadtxt(spkr_sh, dtype=str, delimiter="\t")
  sh_evs = np.loadtxt(mic_sh, dtype=str, delimiter="\t")
  sh_evs = np.loadtxt(spkr_sh, dtype=str, delimiter="\t")
  sh_evs = np.loadtxt(mic_sh, dtype=str, delimiter="\t")
  sh_evs = np.loadtxt(spkr_sh, dtype=str, delimiter="\t")
  sh_evs = np.loadtxt(mic_sh, dtype=str, delimiter="\t")


### Step 4. Split stim and resp into training and validation sets
**This step takes the longest.**

We need a stimulus matrix that is split by sentence ID to avoid double dipping. We can take the sentence IDs/timing from the sentence event files and compare timing to the phoneme event files to match individual phonemes with the sentence IDs. Then we can split different sentence IDs into training/validation dicts.

If you plan on regenerating hdf5 files locally, you'll need to update this cell with your desired paths. When running this notebook on Linux I save them locally to my git folder (but don't push them).

Array shapes:

* `tStim`, `vStim`: samps x feats
* `tResp`, `vResp`: samps x chs (transposition of `resp` in the last step)

In [8]:
def _read_sentence_events(event_fpath, fs):
    """Read a tab-delimited sentence event file.

    Each row holds onset (s), offset (s) and an integer sentence ID in its first
    three columns. Returns three parallel lists: onsets and offsets as truncated
    sample indices (seconds * fs), and the sentence IDs.
    """
    onsets, offsets, ids = [], [], []
    with open(event_fpath,'r') as f:
        for row in csv.reader(f, delimiter="\t"):
            if not row:  # csv.reader yields [] for blank lines
                continue
            onsets.append(int(float(row[0])*fs)); offsets.append(int(float(row[1])*fs))
            ids.append(int(row[2]))
    return onsets, offsets, ids


random_seed = 6655321; force_overwrite = False
tStims, vStims, tResps, vResps = dict(), dict(), dict(), dict()
for s in tqdm(subjs):
    # Update this path if you're saving/loading h5 files locally
    h5_fpath = os.path.join(git_path, "analysis", "mtrf", "h5", "model_inputs", f"{s}_model_inputs.hdf5")
    if not os.path.isfile(h5_fpath) or force_overwrite:
        print(f"Splitting stim/resp into training/validation sets for {s}")
        tStims[s], vStims[s], tResps[s], vResps[s] = dict(), dict(), dict(), dict()
        for b in blocks[s]:
            blockid = "_".join([s,b])
            csv_dir = os.path.join(git_path, "preprocessing", "events", "csv", s, blockid)
            # Use this recording's own sampling rate rather than whatever value of
            # `fs` was left over from a previous cell.
            fs = raws[s][b].info['sfreq']
            # Read event files (spkr first, then mic)
            onsets, offsets, ids = [], [], []
            for source in ("spkr", "mic"):
                sn_on, sn_off, sn_ids = _read_sentence_events(
                    os.path.join(csv_dir, f"{blockid}_{source}_sn_all.txt"), fs)
                onsets += sn_on; offsets += sn_off; ids += sn_ids
            # Make id relative to the lowest value: each id becomes its rank among the unique ids
            unique_ids, ids = np.unique(ids, return_inverse=True)
            nsentences = unique_ids.shape[0]
            # Split stim/resp sentence-by-sentence. Every event contributes the samples
            # onset..offset (inclusive), clipped to the recording, in event order.
            nsamps = resps[s][b].shape[1]
            resp_dict, stim_dict = dict(), dict()
            for this_sentence in range(nsentences):
                sn_resps, sn_stims = [], []
                for i in np.flatnonzero(ids == this_sentence):
                    start = min(max(onsets[i], 0), nsamps)
                    stop = min(max(offsets[i] + 1, start), nsamps)
                    sn_resps.append(resps[s][b][:, start:stop].T)  # -> samps x chs
                    sn_stims.append(stims[s][b][start:stop])       # samps x feats
                resp_dict[this_sentence] = np.vstack(sn_resps)
                stim_dict[this_sentence] = np.vstack(sn_stims)
            # Split stim/resp into training/validation sets along sentence boundaries:
            # one seeded permutation, first 80% of sentences train, remainder validate.
            tv_split = int(nsentences*0.8)
            np.random.seed(random_seed)
            sn_order = np.random.permutation(nsentences)
            train_sn_ids, val_sn_ids = sn_order[:tv_split], sn_order[tv_split:]
            tStims[s][b] = np.vstack([stim_dict[i] for i in train_sn_ids])
            vStims[s][b] = np.vstack([stim_dict[i] for i in val_sn_ids])
            tResps[s][b] = np.vstack([resp_dict[i] for i in train_sn_ids])
            vResps[s][b] = np.vstack([resp_dict[i] for i in val_sn_ids])
            print(f"""{blockid}: training on {tStims[s][b].shape[0]} samples,
            validating on {vStims[s][b].shape[0]} samples.
            Raw contained {stims[s][b].shape[0]} samples.""")
            if tStims[s][b].shape[0] != tResps[s][b].shape[0]:
                raise ValueError("Stim and resp do not have the same shape! (training)")
            if vStims[s][b].shape[0] != vResps[s][b].shape[0]:
                raise ValueError("Stim and resp do not have the same shape! (validation)")
        # Concatenate across blocks
        all_tStim = np.vstack(list(tStims[s].values())); all_tResp = np.vstack(list(tResps[s].values()))
        all_vStim = np.vstack(list(vStims[s].values())); all_vResp = np.vstack(list(vResps[s].values()))
        # Save the split to hdf5 file (and upload to box!)
        os.makedirs(os.path.dirname(h5_fpath), exist_ok=True)
        # 'w' starts from an empty file, so force_overwrite can replace datasets
        # that already exist (create_dataset raises on an existing name).
        with h5py.File(h5_fpath,'w') as f:
            f.create_dataset('tStim', data=all_tStim); f.create_dataset('tResp', data=all_tResp)
            f.create_dataset('vStim', data=all_vStim); f.create_dataset('vResp', data=all_vResp)
    else:
        print(f"Stim/resp for {s} already split, skipping this subject...")

  0%|          | 0/16 [00:00<?, ?it/s]

Splitting stim/resp into training/validation sets for S0004
S0004_B9: training on 16229 samples,
            validating on 4066 samples.
            Raw contained 51112 samples.
S0004_B10: training on 15486 samples,
            validating on 4704 samples.
            Raw contained 51036 samples.
S0004_B14: training on 18187 samples,
            validating on 4767 samples.
            Raw contained 51053 samples.
S0004_B3: training on 16300 samples,
            validating on 3958 samples.
            Raw contained 63787 samples.
S0004_B12: training on 16504 samples,
            validating on 4651 samples.
            Raw contained 48117 samples.
Splitting stim/resp into training/validation sets for S0006
S0006_B5: training on 16428 samples,
            validating on 5056 samples.
            Raw contained 54249 samples.
S0006_B11: training on 15061 samples,
            validating on 3903 samples.
            Raw contained 46255 samples.
S0006_B2: training on 16906 samples,
            v