# Hands-on Tutorial for Encoding Analysis (Predicting fMRI responses from DNN features)

This notebook provides a hands-on tutorial for encoding analysis, which predicts fMRI responses from DNN features.

## Environment setup

Run the following code block to install Python packages required for this notebook.

In [None]:
!curl -O https://raw.githubusercontent.com/KamitaniLab/feature-encoding/main/requirements.txt
!pip install -r requirements.txt

Import required modules.

In [None]:
import os
from itertools import product

import bdpy
from bdpy.dataform import Features, DecodedFeatures, load_array, save_array
from bdpy.evals.metrics import profile_correlation
from bdpy.ml import ModelTraining, ModelTest
from fastl2lir import FastL2LiR
import matplotlib.pyplot as plt
import numpy as np

## Data preparation

Download the dataset used in this notebook.
Skip this cell if you have already downloaded the dataset.

In [None]:
!mkdir data
!curl -O https://raw.githubusercontent.com/KamitaniLab/feature-encoding/main/data/download.py
!curl -O https://raw.githubusercontent.com/KamitaniLab/feature-encoding/main/data/files.json
!python download.py fmri_deeprecon_sub-03_fmriprep_hcpvc
!python download.py features_imagenet_training_vgg19_random5000
!python download.py features_imagenet_test_vgg19_random5000
!mv fmri data/
!mv features data/

Check the downloaded files.

In [None]:
!ls -la data/

Define data settings.

In [None]:
# Data setting

# Subject and regions of interest (ROIs) to analyze
subject = "sub-03"
rois = ["V1", "V2", "V3", "V4", "VentralVC"]

# DNN and the layers whose features are used as model inputs
network = "caffe/VGG19"
layers = ["conv1_2", "fc8"]

# Inputs: fMRI data
training_fmri_path = f"./data/fmri/Deeprecon/{subject}_ImageNetTraining_fmriprep_volume_native_hcpvc.h5"
test_fmri_path = f"./data/fmri/Deeprecon/{subject}_ImageNetTest_fmriprep_volume_native_hcpvc.h5"

# Inputs: DNN features
training_feature_path = f"./data/features/ImageNetTraining/{network}_random5000"
test_feature_path = f"./data/features/ImageNetTest/{network}_random5000"

# Outputs: trained encoding models and predicted (encoded) fMRI responses
encoding_model_path = f"./data/feature_encoders/handson/{network}_random5000"
encoded_fmri_path = f"./data/encoded_fmri/handson/{network}_random5000"

# Make sure both output directories exist before anything is written to them
for output_path in (encoding_model_path, encoded_fmri_path):
    os.makedirs(output_path, exist_ok=True)

## Training of encoding models

Define parameters of the prediction model.
We use [FastL2LiR](https://github.com/KamitaniLab/PyFastL2LiR), fast ridge regression with relevant feature selection, as the prediction model.
The model has two parameters.

- `alpha`: Regularization parameter for ridge regression.
- `n_feat`: Number of relevant features selected by the model (set from the `num_features` variable below).

In [None]:
# Parameters of the encoding model

# L2 regularization parameter
alpha = 100

# Number of features to select
num_features = 500

Load fMRI data and DNN features for training as `bdpy.BData` and `bdpy.dataform.Features` instances, respectively.  

In [None]:
# Load training data (fMRI and features)

train_fmri = bdpy.BData(training_fmri_path)
train_features = Features(training_feature_path)

Define a helper function to train the encoding model for a given layer and ROI.

In [None]:
# Helper function to train the encoding model

def train_model(layer, roi):
    """Train an encoding model that predicts fMRI responses in `roi` from `layer` features.

    Relies on the notebook-level variables `train_fmri`, `train_features`,
    `subject`, `encoding_model_path`, `alpha`, and `num_features`.

    The normalization statistics (`x_mean.mat`, `x_norm.mat`, `y_mean.mat`,
    `y_norm.mat`) are written to
    `<encoding_model_path>/<layer>/<subject>/<roi>/model`; the same directory
    is given to `ModelTraining` as its save path.

    Parameters
    ----------
    layer : str
        Name of the DNN layer whose features are used as the model input.
    roi : str
        ROI name; the fMRI data are selected with the expression `hcp180_<roi>`.

    Raises
    ------
    ValueError
        If a stimulus label of the fMRI samples has no matching feature label.
    """
    print("----------------------------------------")
    print(f"Encoding: {layer} features -> {roi} fMRI")

    # Output directory
    model_dir = os.path.join(encoding_model_path, layer, subject, roi, "model")
    os.makedirs(model_dir, exist_ok=True)

    # Extract fMRI data in the ROI
    brain = train_fmri.select(f"hcp180_{roi}")
    brain_labels = train_fmri.get_labels("stimulus_name")
    print("The shape of fMRI data array: ", brain.shape)

    # Extract features
    feat = train_features.get(layer)
    feat_labels = train_features.labels
    print("The shape of feature data array: ", feat.shape)

    # Normalization statistics, computed along axis 0 of each array
    brain_mean = np.mean(brain, axis=0)
    brain_norm = np.std(brain, axis=0)
    feat_mean = np.mean(feat, axis=0)
    feat_norm = np.std(feat, axis=0)

    # Save the statistics next to the model so that they can be reloaded at prediction time
    save_array(os.path.join(model_dir, "x_mean.mat"), feat_mean, key="x_mean", dtype=np.float32, sparse=False)
    save_array(os.path.join(model_dir, "x_norm.mat"), feat_norm, key="x_norm", dtype=np.float32, sparse=False)
    save_array(os.path.join(model_dir, "y_mean.mat"), brain_mean, key="y_mean", dtype=np.float32, sparse=False)
    save_array(os.path.join(model_dir, "y_norm.mat"), brain_norm, key="y_norm", dtype=np.float32, sparse=False)

    # Index to align features with fMRI samples: for each fMRI sample, the
    # row of `feat` that has the same stimulus label. The label -> row lookup
    # is built once instead of re-scanning all feature labels per fMRI sample.
    # NOTE(review): assumes feature labels are unique; if a label is repeated,
    # its first row is used.
    feat_row = {}
    for row, label in enumerate(feat_labels):
        feat_row.setdefault(label, row)

    missing = [bl for bl in brain_labels if bl not in feat_row]
    if missing:
        raise ValueError(
            f"No '{layer}' features found for {len(missing)} fMRI sample(s); "
            f"first missing stimulus label: {missing[0]!r}"
        )
    feat_index = np.array([feat_row[bl] for bl in brain_labels], dtype=int)

    # Setup model
    model = FastL2LiR()
    model_param = {
        'alpha':  alpha,
        'n_feat': num_features,
        'dtype':  np.float32
    }

    # Setup model learner
    train = ModelTraining(model, feat, brain)
    train.model_parameters = model_param

    train.X_normalize = {'mean': feat_mean,  'std': feat_norm}
    train.Y_normalize = {'mean': brain_mean, 'std': brain_norm}
    train.X_sort = {'index': feat_index}

    train.dtype = np.float32
    train.save_format = 'bdmodel'
    train.save_path = model_dir

    # Training
    train.run()


Try the training function for layer "fc8" and ROI "V4".

In [None]:
# Train model for fc8 and V4

train_model("fc8", "V4")

The trained model is saved in `./data/feature_encoders/handson/caffe/VGG19_random5000/<layer>/<subject>/<roi>/model`.
The `model` directory should contain the following files.

- `W.mat`: Weight matrix of the model.
- `b.mat`: Bias vector of the model.
- `x_mean.mat`, `x_norm.mat`, `y_mean.mat`, `y_norm.mat`: Mean and standard deviation of the input and output data.
- `info.yaml`: Runtime information of the model training.

In [None]:
!ls -la ./data/feature_encoders/handson/caffe/VGG19_random5000/fc8/sub-03/V4/model/

Train encoding models for all layers and ROIs defined above.

In [None]:
# Train models for all layers and ROIs

for layer, roi in product(layers, rois):
    train_model(layer, roi)

## Prediction of fMRI responses

Load DNN features for testing.

In [None]:
# Load test data (DNN features)

test_features = Features(test_feature_path)

Run the following code block to predict fMRI responses for all layers and ROIs defined above.

In [None]:
# Predict fMRI responses

for layer, roi in product(layers, rois):
    print("----------------------------------------")
    print(f"Encoding: {layer} features -> {roi} fMRI")

    # Output directory for the predicted fMRI responses
    output_dir = os.path.join(encoded_fmri_path, layer, subject, roi)
    os.makedirs(output_dir, exist_ok=True)

    # Directory of the trained encoding model
    model_dir = os.path.join(encoding_model_path, layer, subject, roi, "model")

    # Extract features and flatten each sample into a vector (column-major order)
    feat = test_features.get(layer=layer)
    feat = feat.astype(np.float32)
    feat = feat.reshape(feat.shape[0], -1, order='F')
    feat_labels = test_features.labels
    print("The shape of feature data array: ", feat.shape)

    # Normalize features by the mean and SD of the training features
    feat_mean = load_array(os.path.join(model_dir, 'x_mean.mat'), key='x_mean')  # feature-wise mean
    feat_norm = load_array(os.path.join(model_dir, 'x_norm.mat'), key='x_norm')  # feature-wise SD

    feat = (feat - feat_mean) / feat_norm

    # Prediction
    model = FastL2LiR()
    test = ModelTest(model, feat)
    test.model_format = 'bdmodel'
    test.model_path = model_dir
    test.dtype = np.float32

    brain_pred = test.run()

    print("The shape of predicted fMRI data array: ", brain_pred.shape)

    # De-normalize predicted fMRI data with the mean and SD of the training fMRI data
    brain_mean = load_array(os.path.join(model_dir, 'y_mean.mat'), key='y_mean')  # voxel-wise mean
    brain_norm = load_array(os.path.join(model_dir, 'y_norm.mat'), key='y_norm')  # voxel-wise SD

    brain_pred = brain_pred * brain_norm + brain_mean

    # Save the predicted fMRI responses, one file per sample
    for i, label in enumerate(feat_labels):
        # Predicted fMRI signal for sample i, with a leading axis of length 1
        _brain = np.array([brain_pred[i]])

        # Save file name
        save_file = os.path.join(output_dir, f'{label}.mat')

        # Save
        save_array(save_file, _brain, key='fmri', dtype=np.float32, sparse=False)

    print(f'Saved {output_dir}')


The predicted fMRI responses are saved in `./data/encoded_fmri/handson/caffe/VGG19_random5000/<layer>/<subject>/<roi>/`.
The directory contains one MAT-file (`.mat`) per sample, each holding the predicted fMRI response for that sample.

In [None]:
!ls ./data/encoded_fmri/handson/caffe/VGG19_random5000/conv1_2/sub-03/V1/

## Evaluation

Load the predicted and ground-truth fMRI responses.

In [None]:
predicted_fmri = DecodedFeatures(encoded_fmri_path, file_key="fmri")
# Note: `DecodedFeatures` class is designed to handle decoded features, but it can also be used to load predicted fMRI signals here
# because the predicted fMRI signals are stored in the same format as decoded features.

test_fmri = bdpy.BData(test_fmri_path)

Calculate correlation coefficients between the predicted and ground-truth fMRI responses as prediction accuracy.

In [None]:
# Prediction accuracy for every (layer, subject, roi) combination
prediction_accuracies = {}

# If True, average the ground-truth fMRI signals across trials of the same stimulus
average_sample = True

# The stimulus labels of the measured fMRI samples do not depend on the layer
# or ROI, so they are extracted once, outside the loop.
all_gt_labels = np.array(test_fmri.get_label("stimulus_name"))
unique_gt_labels = np.unique(all_gt_labels)

for layer, roi in product(layers, rois):
    # Extract predicted and ground truth fMRI signals as arrays
    pred_fmri = predicted_fmri.get(layer=layer, subject=subject, roi=roi)
    pred_labels = np.array(predicted_fmri.selected_label)

    gt_fmri = test_fmri.select(f"hcp180_{roi}")
    gt_labels = all_gt_labels

    # [Optional] average GT fMRI signals across trials
    if average_sample:
        gt_fmri = np.stack([np.mean(gt_fmri[all_gt_labels == label], axis=0) for label in unique_gt_labels])
        gt_labels = unique_gt_labels

    # Sort pred_fmri as gt_fmri: look up the prediction row of each GT label.
    # NOTE(review): assumes predicted labels are unique; if a label is
    # repeated, its first row is used.
    pred_row = {}
    for row, label in enumerate(pred_labels):
        pred_row.setdefault(label, row)

    missing = [label for label in gt_labels if label not in pred_row]
    if missing:
        raise ValueError(
            f"No predicted fMRI found for {len(missing)} sample(s) "
            f"({layer}, {roi}); first missing stimulus label: {missing[0]!r}"
        )
    sort_index = np.array([pred_row[label] for label in gt_labels], dtype=int)
    pred_fmri = pred_fmri[sort_index, :]

    # Calculate prediction accuracy (voxel-wise correlation between predicted and ground truth fMRI signals; profile correlation)
    r_prof = profile_correlation(pred_fmri, gt_fmri)

    prediction_accuracies[(layer, subject, roi)] = r_prof



Plot the prediction accuracy.

In [None]:
# Make plots

fig = plt.figure(figsize=(16, 12))

# One x position per ROI
xpos = np.arange(len(rois))

subjects = [subject]

# Subplot grid: 3 x 3 while the layers fit into it; rows are added when
# there are more than nine layers.
n_cols = 3
n_rows = max(3, -(-len(layers) // n_cols))  # ceiling division

for i, layer in enumerate(layers):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)

    handles = []
    for sub in subjects:
        # One flattened array of prediction accuracies per ROI
        y = [prediction_accuracies[(layer, sub, roi)].ravel() for roi in rois]

        p = ax.violinplot(y, positions=xpos, widths=0.2, showmeans=True, showextrema=False)
        handles.append(p["bodies"][0])

    # Dashed reference line at zero correlation
    ax.plot([-1, len(rois)], [0, 0], 'k--', linewidth=1)

    ax.set_title(layer, loc="left")
    ax.set_xlim([-0.5, len(rois) - 0.5])
    ax.set_xticks(xpos)
    ax.set_xticklabels(rois)
    ax.set_ylabel("Prediction accuracy (correlation)")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Legend is attached to the last subplot and placed just outside its upper-right corner
plt.legend(handles, subjects, loc='upper left', bbox_to_anchor=(1, 1))

plt.show()