## Bootstrap linear encoding models
**Warning: this notebook is very computationally intensive.** You may wish to run it on your lab server instead of a local machine. Another alternative is to split the subjects up into smaller subsets, then run those subsets instead of holding all the subjects' models in memory at the same time.

In [None]:
import itertools as itools
import pickle
import random
import sys
from glob import glob

import h5py
import mne
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Project-local helpers live outside the notebook folder.
sys.path.append('../preprocessing/utils/')
import strf  # NOTE(review): assumed to be the project module in ../preprocessing/utils/ that provides load_model_inputs — confirm
from ridge import eigridge_corr

In [None]:
# Local paths -- edit all three before running the notebook on your machine.
# Each one is concatenated directly with file names, so keep the trailing slash.

# Root of the cloned analysis repository.
git_path = "/path/to/git/speaker_induced_suppression_EEG/"
# Folder holding the output of train_linear_model.ipynb (run that notebook first).
h5_path = "/path/to/h5/"
# EEG dataset root; downloadable from OSF: https://doi.org/10.17605/OSF.IO/FNRD9
eeg_data_path = "/path/to/dataset/"

In [None]:
# Analysis parameters shared by every cell below.
models = ['model1']
# The loading, bootstrap and saving cells all refer to a single `model_number`;
# define it here so the notebook survives Restart Kernel -> Run All.
model_number = models[0]
tmin, tmax = -0.3, 0.5 # delay window passed to np.floor/np.ceil below (presumably seconds -- confirm)
nboots_shuffle = 100 # number of bootstrap iterations per subject
exclude = ['OP0001','OP0002','OP0004','OP0017','OP0020']
# Subject IDs are the last six characters of each entry in eventfiles/ that contains 'OP0'.
subjs = np.sort([s[-6:] for s in glob(f'{git_path}eventfiles/*') if 'OP0' in s and s[-6:] not in exclude])
if len(subjs) == 0:
    # Without this check every later loop is silently a no-op (e.g. git_path still a placeholder).
    raise FileNotFoundError(f"No subjects found under {git_path}eventfiles/ -- check git_path")
# Integer delays covering [tmin, tmax) scaled by 128
# NOTE(review): 128 is presumably the sampling rate (Hz) of the model inputs -- confirm.
delays = np.arange(np.floor(tmin*128),np.ceil(tmax*128),dtype=int)
chunklen = len(delays)*3 # data randomized in chunks

In [None]:
# Load the training (t) and validation (v) stimulus/response matrices for every subject from hdf5
tStims, tResps, vStims, vResps = dict(), dict(), dict(), dict()
pbar = tqdm(subjs)
for s in pbar:
    pbar.set_description(f"Loading model inputs for {s} {model_number}")
    # Update this file location accordingly on your local machine!
    model_input_h5_fpath = f"{h5_path}{s}_model_inputs.hdf5"
    tStims[s], tResps[s], vStims[s], vResps[s] = strf.load_model_inputs(model_input_h5_fpath, model_number)
    print(s, model_number, "t/v stim:", tStims[s].shape, vStims[s].shape, "||",
          "t/v resp:", tResps[s].shape, vResps[s].shape)

# Load the fitted models' correlations and best alphas from the results csv.
# Both are stored per subject as 1-D arrays ordered like the channel names in the
# subject's first-block BrainVision header.
corrs, best_alphas = dict(), dict()
results_csv_fpath = f"{git_path}stats/lme_results.csv"
df = pd.read_csv(results_csv_fpath)
pbar = tqdm(subjs)
for s in pbar:
    pbar.set_description(f"Loading model outputs for {s} {model_number}")
    blockid = f"{s}_B1"
    ch_names = mne.io.read_raw_brainvision(f"{eeg_data_path}{s}/{blockid}/{blockid}_cca.vhdr",
                                           preload=False,verbose=False).info['ch_names']
    subj_corrs, subj_best_alphas = np.zeros(len(ch_names)), np.zeros(len(ch_names))
    for i,ch in enumerate(ch_names):
        tgt_row = df[(df['subject']==s) & (df['model']==model_number) & (df['channel']==ch)]
        # Exactly one csv row must describe this subject/model/channel. Checking here gives a
        # readable error instead of a NumPy failure when assigning a Series to an array element.
        if len(tgt_row) != 1:
            raise ValueError(f"Expected exactly one row in {results_csv_fpath} for "
                             f"subject={s}, model={model_number}, channel={ch}; found {len(tgt_row)}")
        # Pull out plain scalars; implicit conversion of a length-1 Series to a scalar
        # is deprecated in recent NumPy releases.
        subj_corrs[i] = tgt_row['r_value'].iloc[0]
        subj_best_alphas[i] = tgt_row['best_alpha'].iloc[0]
    corrs[s] = subj_corrs
    best_alphas[s] = subj_best_alphas

In [None]:
# Bootstrap data
# For each subject, refit the model nboots_shuffle times on deliberately mismatched data:
# stimulus rows come from randomly chosen chunks, response rows are the first rows in their
# original order. The resulting correlations form the null distribution for the p-values.
#
# nboots_shuffle and chunklen are taken from the parameter cell; they are intentionally not
# redefined here so that changing them there actually takes effect.
boot_seed = 0 # fixed seed so that re-running this cell reproduces the same p-values
random.seed(boot_seed)
pvals = dict()
for s in subjs:
    nsamps, nelecs = tResps[s].shape
    # Each bootstrap uses 20% of the training samples, rounded down to whole chunks
    nchunks = int(np.floor(0.2*nsamps/chunklen))
    if nchunks < 1:
        raise ValueError(f"{s}: 20% of the {nsamps} training samples is less than one "
                         f"chunk of {chunklen} samples; cannot bootstrap")
    # Consecutive, non-overlapping chunks of sample indices (a trailing partial chunk is dropped).
    # Built once per subject; every bootstrap shuffles its own copy.
    indchunks = list(zip(*[iter(range(nsamps))]*chunklen))
    boot_corrs = []
    # Run the bootstrap
    pbar = tqdm(np.arange(nboots_shuffle))
    for n in pbar:
        pbar.set_description(f'{s} {model_number} Bootstrap {n}/{nboots_shuffle}')
        boot_chunks = list(indchunks)
        random.shuffle(boot_chunks)
        shuff_inds = list(itools.chain(*boot_chunks[:nchunks]))
        # Indexing with a list already returns a new array, so no full-array copy is needed
        tStim_shuff = tStims[s][shuff_inds,:]
        # Copy only the slice that is used, so the fit never sees the stored training array itself
        tResp_shuff = tResps[s][:len(shuff_inds),:].copy()
        # NOTE(review): only the first channel's alpha is passed to the refit; presumably all
        # channels of a subject share one best alpha -- confirm.
        boot_corr = eigridge_corr(tStim_shuff, vStims[s], tResp_shuff, vResps[s],
                                  [best_alphas[s][0]], corrmin = 0.05)
        boot_corrs.append(boot_corr)
    boot_corrs = np.vstack(boot_corrs)
    # Compare bootstrap corrs to STRF corrs
    # Is the correlation of the model greater than the shuffled correlation for random data?
    strf_corrs = corrs[s]
    h_val = strf_corrs > boot_corrs # broadcasts the per-channel corrs against every bootstrap row
    print(h_val.shape) # Should be nboots x nchans
    # Count the number of times out of nboots_shuffle that the correlation is greater than
    # random, subtract from 1 to get the bootstrapped p_val (one per electrode)
    pvals[s] = 1-h_val.sum(0)/nboots_shuffle

In [None]:
# Write the bootstrapped p-values back into the results csv (the file is overwritten in place)
df = pd.read_csv(results_csv_fpath)
pbar = tqdm(subjs)
for s in pbar:
    pbar.set_description(f"Saving pvals for {s} {model_number} to csv")
    blockid = f"{s}_B1"
    vhdr_fpath = f"{eeg_data_path}{s}/{blockid}/{blockid}_cca.vhdr"
    raw = mne.io.read_raw_brainvision(vhdr_fpath, preload=False, verbose=False)
    ch_names = raw.info['ch_names']
    # Rows of this subject and model; narrowed to one channel at a time below
    subj_model_rows = (df['subject']==s) & (df['model']==model_number)
    for i, ch in enumerate(ch_names):
        # pvals[s] is indexed by the channel's position in ch_names
        df.loc[subj_model_rows & (df['channel']==ch), 'p_value'] = pvals[s][i]
df.to_csv(results_csv_fpath, index=False)