1. Process Label files

In [5]:
import pandas as pd

label = pd.read_csv('/Users/celery/Research/dataset/ABCD/abcd_labels.csv')

# Normalise subject IDs by deleting every underscore (literal match, not a regex).
# This drops the separator after the ID prefix (presumably 'NDAR_' — confirm);
# note that any other underscore inside an ID is removed as well.
label['src_subject_id'] = label['src_subject_id'].str.replace('_', '', regex=False)

# Keep only rows whose eventname starts with 'baseline'. na=False treats a
# missing eventname as "not baseline"; without it the mask would contain NaN,
# which pandas refuses to use for boolean indexing.
label = label[label['eventname'].str.startswith('baseline', na=False)]

# Write the cleaned table without the DataFrame index column.
label.to_csv('/Users/celery/Research/dataset/ABCD/abcd_labels_cleaned.csv', index=False)

2. Split the FC .mat file into 4199 individual 68x68 .mat files, one per subject ID

In [None]:
import h5py
import numpy as np
import scipy.io as sio
import os

def _arr_to_string(arr):
    arr = np.asarray(arr)
    if arr.size == 0:
        return ''
    if arr.dtype.kind in ('u', 'i'):
        vals = arr.flatten()
        vals = vals[vals != 0]
        try:
            return ''.join(chr(int(v)) for v in vals)
        except Exception:
            try:
                vals16 = arr.astype('uint16').flatten()
                vals16 = vals16[vals16 != 0]
                return ''.join(chr(int(v)) for v in vals16)
            except Exception:
                return ''.join(map(str, vals))
    if arr.dtype.kind in ('S', 'U'):
        parts = []
        for e in arr.flatten():
            if isinstance(e, bytes):
                try:
                    parts.append(e.decode('utf-8'))
                except Exception:
                    parts.append(e.decode('latin1', errors='ignore'))
            else:
                parts.append(str(e))
        return ''.join(parts)
    return str(arr)

def read_id_ndar_qc(mat_path, id_key='id_ndar_qc'):
    """Read the entries stored under ``id_key`` in an HDF5 file as strings.

    The dataset is loaded, flattened, and each element is converted with
    ``_arr_to_string``. Elements may be arrays, HDF5 object references,
    names of other objects in the file, or plain values.

    Parameters
    ----------
    mat_path : path of the file, opened read-only with ``h5py.File``.
    id_key : name of the dataset holding the entries.

    Returns
    -------
    list
        One item per flattened element, in order. An item is ``None`` when
        the element could not be decoded (a warning is printed in that case).
    """
    ids = []
    with h5py.File(mat_path, 'r') as f:
        entries = f[id_key][:].reshape(-1)
        for i, r in enumerate(entries):
            if isinstance(r, np.ndarray):
                # Integer or string arrays can be converted directly.
                if r.dtype.kind in ('u', 'i', 'S', 'U'):
                    ids.append(_arr_to_string(r))
                    continue
                if r.size != 1:
                    # Multi-element array of another dtype: try it as a whole.
                    try:
                        whole = _arr_to_string(r)
                    except Exception:
                        pass
                    else:
                        ids.append(whole)
                        continue
                # Either a one-element array or the whole-array attempt failed:
                # continue with the first element only.
                r = r.reshape(-1)[0]

            try:
                if isinstance(r, h5py.Reference) or type(r).__name__ == 'Reference':
                    # Follow the reference and convert the dataset it points to.
                    text = _arr_to_string(f[r][:])
                elif isinstance(r, (bytes, str)):
                    name = r.decode('utf-8') if isinstance(r, bytes) else r
                    # A name present in the file is read as a dataset;
                    # otherwise the text itself is taken as the ID.
                    text = _arr_to_string(f[name][:]) if name in f else name
                else:
                    try:
                        text = _arr_to_string(np.array(r))
                    except Exception:
                        print(f'Warning: could not decode element {i} (type={type(r)}).')
                        text = None
            except Exception as err:
                print(f'Warning: failed to dereference element {i}: {err!r}')
                text = None
            ids.append(text)
    return ids

# --- match IDs with RSFC and save ---
matfile = '/Users/celery/Research/dataset/ABCD/abcd_DK_RSFC.mat'
output_dir = '/Users/celery/Research/dataset/ABCD/indv_matfile'
os.makedirs(output_dir, exist_ok=True)

# Extract the IDs stored under the default 'id_ndar_qc' key.
ids = read_id_ndar_qc(matfile)

# Read the full RSFC array into memory; its first axis is indexed per ID below.
with h5py.File(matfile, 'r') as f:
    rsfc = f['abcd_RSFC_DK'][:]  # expected shape (subjects, 68, 68) — TODO confirm

# IDs and matrices are paired purely by position, so the counts must agree.
if len(ids) != rsfc.shape[0]:
    raise ValueError(f"Number of IDs ({len(ids)}) does not match RSFC subjects ({rsfc.shape[0]})")

# Save each subject. Track the distinct files written and the entries skipped
# so the summary reflects what is actually on disk (a repeated ID overwrites
# the same file and must not be counted twice).
written_paths = set()
n_skipped = 0
for i, subj_id in enumerate(ids):
    if subj_id is None or subj_id.strip() == "":
        n_skipped += 1
        continue  # skip invalid IDs
    filename = f"{subj_id}_dk_correlation.mat"
    filepath = os.path.join(output_dir, filename)
    sio.savemat(filepath, {"correlation": rsfc[i]})
    written_paths.add(filepath)

print(f"Saved {len(written_paths)} files to {output_dir}")
if n_skipped:
    print(f"Skipped {n_skipped} entries with missing or blank IDs")


Saved 4199 files to /Users/celery/Research/dataset/ABCD/indv_matfile
