# 4. Speech-selective model. 

## Goals.

1. Make new design matrices, conditioned on the presence of speech.
2. Generate two predictions, using the initial parameters estimated by the CSS model for both the speech and nonspeech design matrices.
3. Find the linear combination of these two predictions that best explains the data.
4. Use these beta-weights to test the predictions on the left out data. 

### Imports

In [None]:
%load_ext autoreload
%autoreload 2
import h5py
from funcs import h5_make, convolve, create_hrf, HCP_subject, masked_vert, h5_dump2
from cfhcpy.base import AnalysisBase
from prfpy.stimulus import PRFStimulus1Dn
from prfpy.model import CSS_Iso1DGaussianModel
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

### Read in data, as before

In [2]:
# Set up the analysis object for the 'late' subject of the movie experiment.
late = AnalysisBase()

late.startup(subject='late', experiment_id='movie', yaml_file='/tank/hedger/software/hcp_movie/config.yml')


late.subject_base_dir='/tank/hedger/DATA/HCP_temp/late'

# Wrap the analysis object in the project's subject helper; everything below works on `latesub`.
latesub=HCP_subject(late)

# NOTE(review): the recorded output shows an interactive prompt ("After ref date?") during this
# setup, so the cell presumably cannot run unattended - confirm which call issues it.
latesub.prep_data()

dtype='main' # Use the independent data, thereby chopping off the 'test sequence'
standardise=True # Standardise the design matrix
zaxis=1 # presumably the axis along which the standardisation is applied - TODO confirm
filt=False # Don't filter the design matrix; we will filter the predictions instead.

# Per the recorded output this reads the data, creates the design matrices and builds the
# training / test folds.
latesub.import_data(dtype,standardise,filt,zaxis)

Starting analysis of subject late on romulus with settings 
{
 "identifier": "node230",
 "base_dir": "/scratch/2019/visual/hcp_{experiment}/",
 "code_dir": "/tank/hedger/scripts/HCP_tonotopy",
 "threads": 40
}


Prompt : After ref date? 1=True or 0=False 1


  0%|          | 0/4 [00:00<?, ?it/s]

Reading in data


100%|██████████| 4/4 [00:06<00:00,  1.69s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Creating design matrices



  0%|          | 0/921 [00:00<?, ?it/s][A
100%|██████████| 921/921 [00:00<00:00, 7036.04it/s][A

  0%|          | 0/921 [00:00<?, ?it/s][A
100%|██████████| 921/921 [00:00<00:00, 7059.35it/s][A
 25%|██▌       | 1/4 [00:02<00:08,  2.67s/it]
  0%|          | 0/918 [00:00<?, ?it/s][A
100%|██████████| 918/918 [00:00<00:00, 7203.14it/s][A

  0%|          | 0/918 [00:00<?, ?it/s][A
100%|██████████| 918/918 [00:00<00:00, 7162.59it/s][A
 50%|█████     | 2/4 [00:05<00:05,  2.66s/it]
  0%|          | 0/915 [00:00<?, ?it/s][A
100%|██████████| 915/915 [00:00<00:00, 7213.26it/s][A

  0%|          | 0/915 [00:00<?, ?it/s][A
100%|██████████| 915/915 [00:00<00:00, 7126.00it/s][A
 75%|███████▌  | 3/4 [00:07<00:02,  2.66s/it]
  0%|          | 0/901 [00:00<?, ?it/s][A
100%|██████████| 901/901 [00:00<00:00, 7105.92it/s][A

  0%|          | 0/901 [00:00<?, ?it/s][A
100%|██████████| 901/901 [00:00<00:00, 7287.27it/s][A
100%|██████████| 4/4 [00:10<00:00,  2.64s/it]
  0%|          | 0/4 [00:00

making training and test folds


100%|██████████| 4/4 [00:02<00:00,  1.96it/s]


### Function for making speech design matrices. 

In [3]:
def make_speech_dm(fold,cut=True,captionpath='/tank/hedger/DATA/HCP_temp/Resources/captions',sub=None):
    """Build binary speech / non-speech indicator vectors for one run.

    Parameters
    ----------
    fold : int
        Index of the run. It selects both the caption CSV (the `fold`-th entry of the
        alphabetically sorted directory listing) and the run duration.
    cut : bool, optional
        If True, drop the final `test_duration` samples from the indicator.
    captionpath : str, optional
        Directory holding one ';'-delimited caption CSV per run, with the columns
        'Start time in milliseconds' and 'End time in milliseconds'.
    sub : object, optional
        Subject whose `ab.experiment_dict` ('run_durations', 'test_duration') and
        `dm_test` are used. Defaults to the notebook-level `latesub`, which is what
        this function always used before the parameter existed.

    Returns
    -------
    (event, other) : tuple of 1-D numpy arrays
        `event` is 1 wherever a caption is active and 0 elsewhere; `other` is its
        complement, additionally forced to 0 wherever the first row of
        `sub.dm_test[fold]` equals 0.
    """
    if sub is None:
        sub=latesub # Backward-compatible default: the notebook's global subject.

    captionfiles=sorted(os.listdir(captionpath)) # Path to the speech csvs.
    frame=pd.read_csv(os.path.join(captionpath,captionfiles[fold]),delimiter=';')

    # Convert to seconds and truncate to whole numbers so the times can be used as indices
    # (assumes one design-matrix sample per second - TODO confirm).
    starts=np.array((frame['Start time in milliseconds']/1000).astype(int))
    ends=np.array((frame['End time in milliseconds']/1000).astype(int))

    event=np.zeros(sub.ab.experiment_dict['run_durations'][fold])

    # Code every captioned interval as 1.
    for start,end in zip(starts,ends):
        event[start:end]=1

    if cut:
        event=event[:-sub.ab.experiment_dict['test_duration']]

    # Non-speech is the complement of speech, but samples where the first row of the
    # test design matrix is 0 must not be counted as non-speech.
    other=1-event
    other[sub.dm_test[fold][0]==0]=0

    # The original returned the undefined name `empty` here, which raised a NameError.
    return event,other

### Construct all speech and nonspeech design matrices, organize them into the same fold combinations.

In [4]:
def make_speech_dms(sub):
    """Attach speech / non-speech design matrices for all four folds to `sub`.

    For every fold the existing training and test design matrices (`sub.dm_train`,
    `sub.dm_test`) are copied twice. One copy keeps only the speech samples, the
    other only the non-speech samples; all remaining columns are set to 0. The
    results are stored as lists (one entry per fold) on `sub.speechdms_train`,
    `sub.nspeechdms_train`, `sub.speechdms_test` and `sub.nspeechdms_test`.
    Returns None.
    """

    def _zero_where_absent(dm,indicator):
        # Copy so the subject's original design matrix is left untouched, then blank
        # every column whose indicator value is 0.
        masked=np.copy(dm)
        masked[:,indicator==0]=0
        return masked

    sp_train,nsp_train,sp_test,nsp_test=[],[],[],[]

    for fold in range(4):

        # Row 0 = speech indicator, row 1 = non-speech indicator. The training indicator
        # is the concatenation over the runs that make up this fold's training set.
        train_ind=np.hstack([make_speech_dm(run) for run in sub.folds[fold]])
        test_ind=make_speech_dm(fold)

        # Non-speech was defined as 1-speech, so silent periods must be blanked
        # explicitly to make sure silence stays silent in both matrices.
        sp_train.append(_zero_where_absent(sub.dm_train[fold],train_ind[0]))
        nsp_train.append(_zero_where_absent(sub.dm_train[fold],train_ind[1]))
        sp_test.append(_zero_where_absent(sub.dm_test[fold],test_ind[0]))
        nsp_test.append(_zero_where_absent(sub.dm_test[fold],test_ind[1]))

    sub.speechdms_train,sub.nspeechdms_train=sp_train,nsp_train
    sub.speechdms_test,sub.nspeechdms_test=sp_test,nsp_test

    return

In [5]:
# Attach the per-fold speech / non-speech design matrices to the late subject.
make_speech_dms(latesub)

### Now Load in the CSS parameters estimated in notebook 2.

In [8]:
# Load in the mask that defines the tonotopic vertices.
frame=pd.read_csv('/tank/hedger/DATA/HCP_temp/OUTPUTS/CSVS/sframe.csv')
mask=np.array(frame['mask'])
nvertices=mask.shape[0]

# Read the per-fold CSS parameters for each subject into memory.
# The files are opened with context managers so the handles are released as soon as the
# arrays have been copied, even if a read fails. `latef` / `earlyf` stay bound (as closed
# files) after the `with` blocks.
late=h5_make('/tank/hedger/DATA/HCP_temp/late','AUDITORY_FITS_CSS_FULL')
with h5py.File(late, "r") as latef:
    late_foldp=np.array(latef['wb_log_fold_params_CSS'])


early=h5_make('/tank/hedger/DATA/HCP_temp/early','AUDITORY_FITS_CSS_FULL')
with h5py.File(early, "r") as earlyf:
    early_foldp=np.array(earlyf['wb_log_fold_params_CSS'])

filename already exists
filename already exists


In [7]:
# The parameters have already been copied into NumPy arrays, so the HDF5 handles can be released.
latef.close()
earlyf.close()

Simple function for assessing the predictions of the speech model against data. 

In [8]:
def test_pred(tseries_raw,speechpred,nspeechpred,betas):
    yhat=betas[-1]+(speechpred*betas[0])+(nspeechpred*betas[1])
    rsq = 1-(yhat-tseries_raw).var(0)/tseries_raw.var(0)
    return yhat,rsq

### Define function for performing the fitting for the speech selective model. 

In [11]:
def fit_speech_model(sub,mask,fold,foldp,log=True):
    """Fit and cross-validate the speech-selective model for one fold.

    For every vertex selected by `mask`, the previously estimated CSS parameters are
    used to generate one prediction from the speech design matrix and one from the
    non-speech design matrix. The training data are then regressed on these two
    predictions plus an intercept, and the resulting weights are evaluated on the
    left-out test data.

    Parameters
    ----------
    sub : subject object
        Must provide `ab.experiment_dict`, `folds`, `frequencies`, `data_train`,
        `data_test` and the `speechdms_*` / `nspeechdms_*` lists.
    mask : array
        Vertices with value 1 are fitted.
    fold : int
        Index of the fold to fit.
    foldp : array
        CSS parameters indexed as [fold, vertex, parameter]. All but the last
        parameter are passed to the model (the last is presumably a fit statistic
        rather than a model parameter - TODO confirm).
    log : bool, optional
        If true, the model is defined over log frequency; otherwise the raw
        frequencies are used.

    Returns
    -------
    R2 : in-sample R-squared, expanded to the full vertex set via `masked_vert`.
    xval : out-of-sample R-squared, expanded the same way.
    betas : array with one row per masked vertex and the columns
        (speech weight, non-speech weight, intercept).
    """

    indices=np.where(mask==1)[0]

    # Length of each training run after removing the test sequence, and the run label of
    # every training sample (passed to the stimulus as block indices).
    blengths=np.array(sub.ab.experiment_dict['run_durations'])[np.array(sub.folds[fold])]-sub.ab.experiment_dict['test_duration']
    blockarr=np.repeat(sub.folds[fold],blengths)

    # `frequencies` used to be left undefined when log was False (NameError).
    frequencies=np.log(sub.frequencies) if log else sub.frequencies

    # Use the same filter for every model.
    fparams = {"window_length":201,"polyorder": 3}

    def _css_model(dm,**stim_kwargs):
        # Build the pRF stimulus for one design matrix and wrap it in a CSS model.
        stim=PRFStimulus1Dn(dm,frequencies,TR=1,**stim_kwargs)
        model=CSS_Iso1DGaussianModel(stim,normalise_RFs=False,filter_predictions=True,filter_type='sg',filter_params=fparams)
        model.func='cart'
        return model

    speechmod_train=_css_model(sub.speechdms_train[fold],block_inds=blockarr)
    nspeechmod_train=_css_model(sub.nspeechdms_train[fold],block_inds=blockarr)
    speechmod_test=_css_model(sub.speechdms_test[fold])
    nspeechmod_test=_css_model(sub.nspeechdms_test[fold])

    # The per-vertex parameter lists are identical for all four models, so build them once.
    vertex_params=[list(foldp[fold,index,:][:-1]) for index in indices]

    def _predict(model):
        # One prediction per masked vertex from the stored CSS parameters.
        return [model.return_prediction(*params)[0] for params in vertex_params]

    # Make the predictions based on the CSS model for each design matrix.
    speechpreds=_predict(speechmod_train)
    nspeechpreds=_predict(nspeechmod_train)
    speechpreds_test=_predict(speechmod_test)
    nspeechpreds_test=_predict(nspeechmod_test)

    betass,rsqs=[],[]

    # Now do some fitting.
    for i,index in enumerate(tqdm(indices)):

        # Make design matrix (speech prediction, nspeech prediction, intercept).
        dm=np.vstack([speechpreds[i],nspeechpreds[i],np.repeat(1,nspeechpreds[i].shape)])

        # Get the data; NaNs in data or predictors are replaced before solving.
        tseries_raw=np.nan_to_num(sub.data_train[fold][:,index])
        dm=np.nan_to_num(dm)

        # Solve the regression equation. rcond=None selects NumPy's current default
        # cutoff explicitly and avoids the FutureWarning raised when it is omitted.
        betas, _, _, _ = np.linalg.lstsq(dm.T, tseries_raw.T, rcond=None)
        yhat = np.dot(betas.T, dm)
        rsq = 1-(yhat-tseries_raw).var(0)/tseries_raw.var(0)

        # Save the fit quality and the betas.
        rsqs.append(rsq)
        betass.append(betas)

    # Make into array the same shape as the full dataset.
    R2=masked_vert(np.array(rsqs),mask)

    betas=np.array(betass)

    # Test the predictions on the left out data: the out-of-sample prediction is the linear
    # combination of the test-set speech and non-speech predictions using the weights just
    # estimated, mirroring the cross-validation strategy used for the CSS model.
    # NOTE(review): unlike the training step, the test data and predictions are not passed
    # through nan_to_num - confirm this asymmetry is intended.
    perf=[]
    for i,index in enumerate(tqdm(indices)):
        res=test_pred(sub.data_test[fold][:,index],speechpreds_test[i],nspeechpreds_test[i],betass[i])
        perf.append(res[1])

    xval=masked_vert(np.array(perf),mask)

    # Return the R2, out of sample performance and beta weights.
    return R2,xval,betas
    
    

### Perform the fitting.

In [None]:
# Fit the speech-selective model to each of the four folds of the late subject.
# Each entry is the (R2, xval, betas) tuple returned by fit_speech_model.
late_sms=[]

for i in tqdm(range(4)):
    late_sms.append(fit_speech_model(latesub,mask,i,late_foldp))

### Save out the outcomes

In [13]:
# Out-of-sample R2 of the speech model minus the last stored CSS fold parameter
# (presumably the CSS model's cross-validated R2 - TODO confirm).
late_diff=np.array([late_sms[i][1]-late_foldp[i,:,-1] for i in range(4)])
# Out-of-sample R2 of the speech model, one row per fold.
late_sp_xval=np.array([late_sms[i][1] for i in range(4)])
# Beta weights (speech, non-speech, intercept), expanded back to the full vertex set.
late_sp_betas=np.array([masked_vert(late_sms[i][2][:,0],mask) for i in range(4)])
late_nsp_betas=np.array([masked_vert(late_sms[i][2][:,1],mask) for i in range(4)])
late_intercepts=np.array([masked_vert(late_sms[i][2][:,2],mask) for i in range(4)])

In [17]:
# Resolve the late subject's output file and store each outcome under its own key.
# NOTE(review): the outcomes are written with a 'ub_' prefix whereas the CSS parameters were
# read from 'wb_log_fold_params_CSS' - confirm the prefix is intended.
late=h5_make('/tank/hedger/DATA/HCP_temp/late','AUDITORY_FITS_CSS_FULL')
h5_dump2(late,late_diff,'ub_log_fold_sp_diff')
h5_dump2(late,late_sp_xval,'ub_log_fold_sp_xval')
h5_dump2(late,late_sp_betas,'ub_log_fold_sp_betas')
h5_dump2(late,late_nsp_betas,'ub_log_fold_nsp_betas')
h5_dump2(late,late_intercepts,'ub_log_fold_intercepts')

filename already exists


'ub_log_fold_intercepts'

### Repeat for the early subject.

In [None]:
# Set up the analysis object for the 'early' subject, mirroring the late-subject setup.
early = AnalysisBase()

early.startup(subject='early', experiment_id='movie', yaml_file='/tank/hedger/software/hcp_movie/config.yml')


early.subject_base_dir='/tank/hedger/DATA/HCP_temp/early'

# Wrap the analysis object in the project's subject helper; everything below works on `earlysub`.
earlysub=HCP_subject(early)

earlysub.prep_data()

dtype='main' # Use the independent data, thereby chopping off the 'test sequence'
standardise=True # Standardise the design matrix
zaxis=1 # presumably the axis along which the standardisation is applied - TODO confirm
filt=False # Don't filter the design matrix; we will filter the predictions instead.

earlysub.import_data(dtype,standardise,filt,zaxis)

In [19]:
# Attach the per-fold speech / non-speech design matrices to the early subject.
# NOTE(review): make_speech_dm takes run durations, test duration and dm_test from the global
# `latesub`, so these indicators are built from the late subject's settings - confirm they
# are identical for both subjects.
make_speech_dms(earlysub)

In [None]:
# Fit the speech-selective model to each of the four folds of the early subject.
# Each entry is the (R2, xval, betas) tuple returned by fit_speech_model.
early_sms=[]

for i in tqdm(range(4)):
    early_sms.append(fit_speech_model(earlysub,mask,i,early_foldp))

In [22]:
# Out-of-sample R2 of the speech model minus the last stored CSS fold parameter
# (presumably the CSS model's cross-validated R2 - TODO confirm).
early_diff=np.array([early_sms[i][1]-early_foldp[i,:,-1] for i in range(4)])
# Out-of-sample R2 of the speech model, one row per fold.
early_sp_xval=np.array([early_sms[i][1] for i in range(4)])
# Beta weights (speech, non-speech, intercept), expanded back to the full vertex set.
early_sp_betas=np.array([masked_vert(early_sms[i][2][:,0],mask) for i in range(4)])
early_nsp_betas=np.array([masked_vert(early_sms[i][2][:,1],mask) for i in range(4)])
early_intercepts=np.array([masked_vert(early_sms[i][2][:,2],mask) for i in range(4)])

In [23]:
# Resolve the early subject's output file and store each outcome under its own key.
# NOTE(review): the outcomes are written with a 'ub_' prefix whereas the CSS parameters were
# read from 'wb_log_fold_params_CSS' - confirm the prefix is intended.
early=h5_make('/tank/hedger/DATA/HCP_temp/early','AUDITORY_FITS_CSS_FULL')

h5_dump2(early,early_diff,'ub_log_fold_sp_diff')
h5_dump2(early,early_sp_xval,'ub_log_fold_sp_xval')
h5_dump2(early,early_sp_betas,'ub_log_fold_sp_betas')
h5_dump2(early,early_nsp_betas,'ub_log_fold_nsp_betas')
h5_dump2(early,early_intercepts,'ub_log_fold_intercepts')

filename already exists


'ub_log_fold_intercepts'

In [72]:
# Closes the late-subject parameter file handle. An earlier cell already closes it, so this
# is presumably a leftover from interactive use.
latef.close()

In [73]:
# Closes the early-subject parameter file handle. An earlier cell already closes it, so this
# is presumably a leftover from interactive use.
earlyf.close()