<a href="https://colab.research.google.com/github/MinyuChan-vem/NMA_course-content/blob/main/HCP_language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The HCP dataset comprises task-based fMRI from a large sample of human subjects. The NMA-curated dataset includes time series data that has been preprocessed and spatially-downsampled by aggregating within 360 regions of interest.

In order to use this dataset, please electronically sign the HCP data use terms at ConnectomeDB. Instructions for this are on pp. 24-25 of the HCP Reference Manual.

In this notebook, NMA provides code for downloading the data and doing some basic visualisation and processing.

For a detailed description of the tasks have a look pages 45-54 of the HCP reference manual.

In [1]:
# @title Install dependencies
!pip install nilearn --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.6/10.6 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
#@title Figure settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use("https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle")

In [4]:
# The data shared for NMA projects is a subset of the full HCP dataset
N_SUBJECTS = 100

# The data have already been aggregated into ROIs from the Glasser parcellation
N_PARCELS = 360

# The acquisition parameters for all tasks were identical
TR = 0.72  # Time resolution, in seconds

# The parcels are matched across hemispheres with the same order
HEMIS = ["Right", "Left"]

# Each experiment was repeated twice in each subject
RUNS   = ['LR','RL']
N_RUNS = 2

# There are 7 tasks. Each has a number of 'conditions'
# TIP: look inside the data folders for more fine-graned conditions

EXPERIMENTS = {
    'MOTOR'      : {'cond':['lf','rf','lh','rh','t','cue']},
    'WM'         : {'cond':['0bk_body','0bk_faces','0bk_places','0bk_tools','2bk_body','2bk_faces','2bk_places','2bk_tools']},
    'EMOTION'    : {'cond':['fear','neut']},
    'GAMBLING'   : {'cond':['loss','win']},
    'LANGUAGE'   : {'cond':['math','story']},
    'RELATIONAL' : {'cond':['match','relation']},
    'SOCIAL'     : {'cond':['ment','rnd']}
}

Downloading data

In [5]:
# @title Download data file
import os, requests

fname = "hcp_task.tgz"
url = "https://osf.io/2y3fw/download"

if not os.path.isfile(fname):
  try:
    r = requests.get(url)
  except requests.ConnectionError:
    print("!!! Failed to download data !!!")
  else:
    if r.status_code != requests.codes.ok:
      print("!!! Failed to download data !!!")
    else:
      with open(fname, "wb") as fid:
        fid.write(r.content)

In [6]:
# The download cells will store the data in nested directories starting here:
HCP_DIR = "./hcp_task"

# importing the "tarfile" module
import tarfile

# open file
with tarfile.open(fname) as tfile:
  # extracting file
  tfile.extractall('.')

subjects = np.loadtxt(os.path.join(HCP_DIR, 'subjects_list.txt'), dtype='str')

In [7]:
def load_single_timeseries(subject, experiment, run, remove_mean=True):
  """Load timeseries data for a single subject and single run.

  Args:
    subject (str):      subject ID to load
    experiment (str):   Name of experiment
    run (int):          (0 or 1)
    remove_mean (bool): If True, subtract the parcel-wise mean (typically the mean BOLD signal is not of interest)

  Returns
    ts (n_parcel x n_timepoint array): Array of BOLD data values

  """
  bold_run  = RUNS[run]
  bold_path = f"{HCP_DIR}/subjects/{subject}/{experiment}/tfMRI_{experiment}_{bold_run}"
  bold_file = "data.npy"
  ts = np.load(f"{bold_path}/{bold_file}")
  if remove_mean:
    ts -= ts.mean(axis=1, keepdims=True)
  return ts

Code to concatenate language data (2 runs, 316 frames)

In [8]:
from pathlib import Path
import os

def concatenate_task_data():
   """
   Concatenate data.npy files for all subjects for each task.
   For each task, concatenate LR and RL runs for each subject, then stack across subjects.

   Returns:
       task_data (dict): Dictionary mapping task names to arrays of shape (n_subjects, n_parcels, n_timepoints_total)
   """
   task_data = {task: [] for task in EXPERIMENTS}

   # Iterate through subjects
   for subject in subjects:
       subject_folder = Path(HCP_DIR) / "subjects" / subject
       if not subject_folder.is_dir():
           print(f"Subject directory {subject_folder} does not exist. Skipping.")
           continue

       # Iterate through tasks
       for task in EXPERIMENTS:
           task_folder = subject_folder / task
           if not task_folder.is_dir():
               print(f"Task directory {task_folder} does not exist for subject {subject}. Skipping.")
               continue

           # Load and concatenate LR and RL runs
           subject_task_data = []
           for run_idx, run_suffix in enumerate(RUNS):
               try:
                   ts = load_single_timeseries(subject, task, run_idx, remove_mean=False)
                   subject_task_data.append(ts)
               except Exception as e:
                   print(f"Error loading data.npy for {subject}/{task}/tfMRI_{task}_{run_suffix}: {e}")
                   subject_task_data.append(None)

           # Check if any run failed to load
           if any(x is None for x in subject_task_data):
               print(f"Skipping subject {subject} for task {task} due to missing run data.")
               continue

           # Concatenate LR and RL runs along the time axis (axis=1)
           try:
               concatenated_ts = np.concatenate(subject_task_data, axis=1)
               task_data[task].append(concatenated_ts)
           except Exception as e:
               print(f"Error concatenating runs for {subject}/{task}: {e}")
               continue

   # Stack subject arrays for each task
   result = {}
   for task in task_data:
       if not task_data[task]:
           print(f"No data concatenated for task {task}.")
           continue
       try:
           # Stack along a new axis (0) to get (n_subjects, n_parcels, n_timepoints_total)
           task_array = np.stack(task_data[task], axis=0)
           result[task] = task_array
           print(f"Task {task}: Concatenated array shape {task_array.shape}")
       except Exception as e:
           print(f"Error stacking subjects for task {task}: {e}")

   return result

task_data = concatenate_task_data()
language_data = task_data["LANGUAGE"]

Task MOTOR: Concatenated array shape (100, 360, 568)
Task WM: Concatenated array shape (100, 360, 810)
Task EMOTION: Concatenated array shape (100, 360, 352)
Task GAMBLING: Concatenated array shape (100, 360, 506)
Task LANGUAGE: Concatenated array shape (100, 360, 632)
Task RELATIONAL: Concatenated array shape (100, 360, 464)
Task SOCIAL: Concatenated array shape (100, 360, 548)


In [9]:
# prompt: print list of ROIs

filename = os.path.join(HCP_DIR, 'MMP_parcellation_updated.txt')
roi_info = pd.read_csv(filename, sep='\t')
roi_info

FileNotFoundError: [Errno 2] No such file or directory: './hcp_task/MMP_parcellation_updated.txt'

In [16]:
import pandas as pd

# === STEP 1: Load Excel ===
df = pd.read_excel('Glasser_2016_Table.xlsx', sheet_name='Sheet1', skiprows=1)
# print(df.columns.tolist())


# Replace with your actual column name
roi_column = 'Area\xa0Description'

# === STEP 2: Brain Regions from Literature ===
literature_regions = [
    "superior temporal",       # catches STG, posterior STG
    "supramarginal",
    "wernicke",
    "middle temporal",         # MTG
    "inferior temporal",       # ITG
    "angular",                 # angular gyrus
    "pars orbitalis",          # IFGpo
    "inferior frontal",        # catches IFG as well
    "dorsomedial prefrontal",
    "pars triangularis",       # IFGpt
    "pars opercularis",        # IFGop
    "inferior frontal sulcus", # IFS
    "broca",
    "sensorimotor",
    "visual word"
]

# === STEP 3: Matching Function ===
def match_region(roi_name, region_list):
    roi_name = str(roi_name).lower()
    return any(region.lower() in roi_name for region in region_list)

# === STEP 4: Filter Matching ROIs ===
matched_df = df[df[roi_column].apply(lambda x: match_region(x, literature_regions))]

# === STEP 5: Output ===
print("Matched ROIs:")
print(matched_df)

# Save if needed
matched_df.to_excel("matched_rois_from_literature.xlsx", index=False)


['Parcel\nIndex', 'Area\nName', 'Area\xa0Description', 'New?', 'Sections', 'Other\xa0Names', 'Key\xa0Studies']
Matched ROIs:
    Parcel\nIndex Area\nName                Area Description New?  Sections  \
22             23         MT           Middle Temporal\nArea   No      5,15   
27             28        STV  Superior Temporal\nVisual Area  Yes  11,15,17   

    Other Names                                        Key Studies  
22  hOC5, hOC5d  Abdollahi et al 2014, Kolster et al 2010, Mali...  
27          NaN                                                NaN  


In [23]:
import pandas as pd

# === STEP 1: Load Excel and clean headers ===
df = pd.read_excel('Glasser_2016_Table.xlsx', sheet_name='Sheet1', skiprows=1)

# Clean column names: remove leading/trailing spaces and \n
df.columns = df.columns.str.strip().str.replace('\n', ' ', regex=True)

# Clean "Area Description" entries
df['Area\xa0Description'] = df['Area\xa0Description'].astype(str).str.strip().str.replace('\n', ' ', regex=True)

# Define the column to match on
roi_column = 'Area\xa0Description'

# === STEP 2: Brain Regions from Literature ===
literature_regions = [
    # Full names
    "superior temporal", "posterior superior temporal", "supramarginal",
    "wernicke", "middle temporal", "inferior temporal", "angular",
    "pars orbitalis", "inferior frontal", "dorsomedial prefrontal",
    "pars triangularis", "pars opercularis", "inferior frontal sulcus",
    "broca", "sensorimotor", "visual word",

    # Common abbreviations
    "STG", "MTG", "ITG", "IFGpo", "IFGpt", "IFGop", "SMG", "AG", "IFS", "DMPFC", "VWFA"
]

# === STEP 3: Matching Function (case-insensitive substring match) ===
def match_region(roi_name, region_list):
    roi_name = str(roi_name).lower()
    return any(region.lower() in roi_name for region in region_list)

# === STEP 4: Filter Matching ROIs ===
matched_df = df[df[roi_column].apply(lambda x: match_region(x, literature_regions))]

# === STEP 5: Output Results ===
print("Matched ROIs:")
print(matched_df[[roi_column, 'Other\xa0Names', 'Key\xa0Studies']])

# Save cleaned matched ROIs
matched_df.to_excel("matched_rois_from_literature_cleaned.xlsx", index=False)


Matched ROIs:
                      Area Description        Other Names  \
1        Medial Superior Temporal Area  MSTv, hOC5, hOC5v   
22                Middle Temporal Area        hOC5, hOC5d   
24           PeriSylvian Language Area                NaN   
25      Superior Frontal Language Area                NaN   
27       Superior Temporal Visual Area                NaN   
80                           Area IFSp                NaN   
81                           Area IFSa                NaN   
111  Anterior Agranular Insula Complex           Iai, Ial   
122                          Area STGa                NaN   

                                           Key Studies  
1    Abdollahi et al 2014, Kolster et al 2010, Mali...  
22   Abdollahi et al 2014, Kolster et al 2010, Mali...  
24                                                 NaN  
25                                                 NaN  
27                                                 NaN  
80                               