# Convert CamCAN dataset to BIDS format

There were some empty folders in the dataset, so I pruned them before running the conversion using `find . -type d -empty -delete`.

For the most part, the changes here just copy files and write out metadata with sorted keys and indentation (for readability). However, the field maps needed substantial metadata correction, and the functional scans needed the "TaskName" field added.

In [1]:
import re
import json
import os.path as op
from os import makedirs
from glob import glob
from shutil import copyfile
from datetime import datetime

import pandas as pd

In [2]:
# Root of the CamCAN download, the pseudo-BIDS release inside it, and the
# folder the converted dataset is written to.
in_dir = '/home/tsalo006/CamCAN'
pseudo_bids_dir = op.join(in_dir, 'cc700/mri/pipeline/release004/BIDS_20190411/')
out_dir = op.join(in_dir, 'dset')

# Participant labels are the names of the sub-* folders found under epi_movie,
# in sorted order.
participant_ids = [
    op.basename(sub_path)
    for sub_path in sorted(glob(op.join(pseudo_bids_dir, 'epi_movie', 'sub-*')))
]

In [3]:
# Make top-level files
# The output root may not exist on a fresh run, and none of the writers below
# create it themselves.
makedirs(out_dir, exist_ok=True)

# participants.tsv: rename the CamCAN columns, recode sex as F/M, prefix the
# IDs with "sub-", and keep only the participants found in participant_ids.
participants_file = op.join(in_dir, 'dataman/useraccess/processed/Taylor_Salo_298/standard_data.csv')
participants = pd.read_csv(participants_file)
participants = participants.rename(columns={
    'CCID': 'participant_id',
    'Age': 'age',
    'Sex': 'sex',
    'Hand': 'handedness_score'})
# Values other than FEMALE/MALE become NaN here and are written as 'n/a' below.
participants['sex'] = participants['sex'].map({'FEMALE': 'F', 'MALE': 'M'})
participants['participant_id'] = 'sub-' + participants['participant_id']
participants = participants.fillna('n/a')
participants = participants[['participant_id', 'age', 'sex', 'handedness_score']]
participants = participants.loc[participants['participant_id'].isin(participant_ids)]
participants.to_csv(op.join(out_dir, 'participants.tsv'), sep='\t', index=False)

# dataset_description.json: the BIDS field is spelled "License"; an all-caps
# "LICENSE" key is not the field BIDS tools look for.
dd_file = op.join(out_dir, 'dataset_description.json')
dd_dict = {'Name': 'CamCAN film-viewing dataset',
           'BIDSVersion': '1.2.1',
           'License': 'DO NOT SHARE!'}
with open(dd_file, 'w') as fo:
    json.dump(dd_dict, fo, sort_keys=True, indent=4)

with open(op.join(out_dir, 'README'), 'w') as fo:
    fo.write('The CamCAN dataset. Not to be shared.')

# NOTE(review): BIDS asks for CHANGES to follow the CPAN Changelog convention
# (a "version date" header followed by indented entries). This single line does
# not follow that layout -- confirm whether it matters for this dataset.
date = datetime.today().strftime('%Y/%m/%d')
with open(op.join(out_dir, 'CHANGES'), 'w') as fo:
    fo.write('{}: Convert CamCAN-provided dataset to BIDS format.'.format(date))

In [4]:
# Copy anat, func and fmap files into the BIDS tree, fixing names and metadata
# on the way, and build `df`: one row per written file, indexed by input path
# ('in') with the output path in column 'out'.
keep_mods = ['fmap_movie', 'epi_movie', 'anat']
mod_dict = {'fmap_movie': 'fmap', 'epi_movie': 'func', 'anat': 'anat'}

# Only these file types are converted; anything else is ignored and not logged.
_KEEP_EXTS = ('.json', '.nii.gz')
# Single-digit echo label in the CamCAN file names, e.g. "..._echo3.nii.gz".
_ECHO_PATTERN = re.compile(r'_echo([0-9])\.')

# Input path -> output path. Collected in a dict and turned into a DataFrame
# once at the end instead of growing the DataFrame one row at a time.
renamed = {}


def _load_json(in_file):
    """Return the parsed contents of the JSON file *in_file*."""
    with open(in_file, 'r') as fo:
        return json.load(fo)


def _write_json(metadata, out_file):
    """Write *metadata* to *out_file* with sorted keys and 4-space indent."""
    with open(out_file, 'w') as fo:
        json.dump(metadata, fo, sort_keys=True, indent=4)


for pid in participant_ids:
    for mod in keep_mods:
        files = sorted(glob(op.join(pseudo_bids_dir, mod, pid, '*', '*')))
        if not files:
            # Nothing to convert for this participant/modality. Skipping here
            # also avoids creating an empty output folder and indexing into
            # an empty list in the fmap branch.
            continue

        out_sub_dir = op.join(out_dir, pid, mod_dict[mod])
        makedirs(out_sub_dir, exist_ok=True)

        if mod == 'anat':
            # Anatomical files keep their names; sidecars are only reformatted.
            for f in files:
                if not f.endswith(_KEEP_EXTS):
                    continue
                out_file = op.join(out_sub_dir, op.basename(f))
                if f.endswith('.json'):
                    _write_json(_load_json(f), out_file)
                else:
                    copyfile(f, out_file)
                renamed[f] = out_file
        elif mod == 'epi_movie':
            for f in files:
                if not f.endswith(_KEEP_EXTS):
                    continue
                fn = op.basename(f)
                sid = fn.split('_')[0]
                echo_nums = _ECHO_PATTERN.findall(fn)
                if len(echo_nums) != 1:
                    raise ValueError(
                        'expected exactly one echo label in {}, found {}'.format(f, len(echo_nums)))
                echo_num = echo_nums[0]

                out_file = op.join(out_sub_dir, '{0}_task-movie_echo-{1}_bold'.format(sid, echo_num))
                if f.endswith('.json'):
                    out_file = out_file + '.json'
                    metadata = _load_json(f)
                    # The functional sidecars need the "TaskName" field.
                    metadata['TaskName'] = 'film viewing'
                    _write_json(metadata, out_file)
                else:
                    out_file = out_file + '.nii.gz'
                    copyfile(f, out_file)
                renamed[f] = out_file
        else:
            # fmaps are in M1, M2, PD format, but are not labeled as such
            sid = op.basename(files[0]).split('_')[0]
            json_files = [f for f in files if f.endswith('.json')]

            # Every field map is linked to echoes 1-5 of the movie run.
            intended_fors = ['func/{0}_task-movie_echo-{1}_bold.nii.gz'.format(sid, e) for e in range(1, 6)]

            # Read each sidecar once; the unique echo times, sorted ascending,
            # decide which image is magnitude1 and which is magnitude2.
            sidecars = [(f, _load_json(f)) for f in json_files]
            echo_times = sorted({metadata['EchoTime'] for _, metadata in sidecars})

            # determine new names and metadata
            for f, metadata in sidecars:
                if 'P' in metadata['ImageType']:
                    if len(echo_times) < 2:
                        raise ValueError(
                            'need two distinct echo times for the phasediff of {}, found {}'.format(
                                pid, echo_times))
                    suff = 'phasediff'
                    metadata['EchoTime1'] = echo_times[0]
                    metadata['EchoTime2'] = echo_times[1]
                else:
                    echo_num = echo_times.index(metadata['EchoTime']) + 1
                    if 'EchoNumber' not in metadata:
                        metadata['EchoNumber'] = echo_num
                    suff = 'magnitude{}'.format(echo_num)
                metadata['IntendedFor'] = intended_fors

                # write out files
                out_file = '{}_acq-movie_{}'.format(sid, suff)
                out_json_file = op.join(out_sub_dir, out_file + '.json')
                out_nii_file = op.join(out_sub_dir, out_file + '.nii.gz')
                in_json_file = f
                # Swap only the trailing extension; str.replace would also
                # touch a '.json' appearing earlier in the path.
                in_nii_file = f[:-len('.json')] + '.nii.gz'

                copyfile(in_nii_file, out_nii_file)
                _write_json(metadata, out_json_file)
                renamed[in_nii_file] = out_nii_file
                renamed[in_json_file] = out_json_file

df = pd.DataFrame({'out': list(renamed.values())},
                  index=pd.Index(list(renamed), name='in'))

In [5]:
# Turn the 'in' index into an ordinary column so that both the input and the
# output path of every file end up in the saved table.
df = df.reset_index(drop=False)
df.to_csv(path_or_buf='camcan_renamed_files.csv', index=False)