# NIMS-to-BIDS converter v.2
Written by Natalia Vélez, 9/17

Changes in this version:

* Change to file structure: raw data are now stored in `$PI_SCRATCH`
* Incorporate changes to protocol file, including tracking sequence numbers 
* Add support for fieldmaps
* Remove redundancies in the code

Load dependencies:

In [55]:
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import

from builtins import input
from builtins import open
from builtins import str
from future import standard_library
standard_library.install_aliases()
print("Importing Libraries...\n")

import numpy as np
import pandas as pd
import os
import re
from shutil import copyfile
import json
import sys
import subprocess
import glob
from os.path import join as opj # Helper function

Importing Libraries...



Set input and output directories:

In [56]:
home_dir = os.environ['PI_HOME']
scratch_dir = os.environ['PI_SCRATCH']

#project_name =  str(sys.argv[1]).strip(' ') # Uncomment for production
project_name = 'SwiSt'
project_dir = opj(home_dir, project_name)
NIMS = opj(scratch_dir, project_name, 'NIMS_data')
BIDS = opj(project_dir, 'BIDS_data')
BIDS_filename = glob.glob(opj(project_dir, '*BIDS_info*.xlsx'))

#Make sure there's only one bids file
assert len(BIDS_filename) == 1, 'This folder does not have a BIDS_info file or it has more than one info file' 
xls = pd.ExcelFile(BIDS_filename[0])

print('Input: %s' % NIMS)
print('Output: %s' % BIDS)
print('BIDS_info file: %s' % BIDS_filename[0])

Input: /scratch/PI/hyo/SwiSt/NIMS_data
Output: /share/PI/hyo/SwiSt/BIDS_data
BIDS_info file: /share/PI/hyo/SwiSt/BIDS_info.xlsx


Load participant data:

In [57]:
participants = xls.parse('participants')
participants.participant_id = ['sub-%02d' % int(n) for n in participants.participant_id]
participants.head()

Unnamed: 0,participant_id,nims_title,sex,age
0,sub-04,20170709_1311_15436,F,25
1,sub-05,20170710_1918_15441,F,20
2,sub-06,20170712_1910_15454,M,19
3,sub-07,20170713_1949_15463,F,23
4,sub-08,20170714_1812_15473,M,29


Load protocol data:

In [58]:
protocol = xls.parse('protocol', convert_float=False).iloc[1:,]
protocol = protocol[~pd.isnull(protocol.sequence_type)] # Remove columns with missing BIDS data types
protocol.head()

Unnamed: 0,nims_title,sequence_no,NIMS_scan_title,BIDS_scan_title,TaskName,run_number,sequence_type,RepetitionTime,IntendedFor
4,default,4,spiral_fieldmap,fieldmap,,,fmap,,5 6
5,default,5,BOLD_EPI_29mm_2sec_SwiSt,task-swist,Switch/Stay (main task),1.0,func,2.0,
6,default,6,BOLD_EPI_29mm_2sec_SwiSt,task-swist,Switch/Stay (main task),2.0,func,2.0,
7,default,7,spiral_fieldmap,fieldmap,,,fmap,,8 9
8,default,8,BOLD_EPI_29mm_2sec_SwiSt,task-swist,Switch/Stay (main task),3.0,func,2.0,


Find input (NIMS-formatted) files and specify output (BIDS-formatted) files:

In [None]:
session_IDs = participants.nims_title
custom_protocols = np.unique(protocol.nims_title)
custom_protocols = custom_protocols[custom_protocols != 'default']
copy_job = pd.DataFrame(columns = ['session', 'input', 'output', 'info'])

# Helper function: Searches for matching input files
def input_path(row):
    d = row.to_dict()
    
    # Templates: Build search string based on sequence number and type
    input_fname = '*fieldmap.nii.gz' if d['sequence_type'] == 'fmap' else '*_1.nii.gz'
    input_template = opj(NIMS, row['nims_title'], '*_%i_1_%s', input_fname)
    input_search = input_template % (d['sequence_no'], d['NIMS_scan_title'])
    
    # Matches: Find matching sequences
    input_matches = glob.glob(input_search)
    path = input_matches[0] if input_matches else np.nan
    
    return path
    
# Helper function: Builds path for output files
def output_path(row, participant_id):
    d = row.to_dict()
    output_run = '_run-%02d' % d['run_number'] if ~np.isnan(d['run_number']) else ''
    output_bold = '_bold' if d['sequence_type'] == 'func' else ''
    output_filename = '%s_%s%s%s.nii.gz' % (participant_id, d['BIDS_scan_title'], output_run, output_bold)
    output_path = opj(BIDS, participant_id, d['sequence_type'], output_filename)
    
    return output_path
    
# Helper function: Prepares JSON file keys
def output_keys(row, participant_id, session_protocol):
    # Fields common to all sequences
    standard_fields = ['nims_title', 'sequence_no', 'NIMS_scan_title',
                       'BIDS_scan_title', 'run_number', 'sequence_type']
    
    # Remove standard fields and NA'sfrom row 
    # (only custom fields related to the current sequence remain)
    row = row.drop(standard_fields)
    row = row.dropna()
    row_dict = row.to_dict()
    
    # If dictionary contains an IntendedFor field (for fieldmaps), replace the sequence numbers
    # with BIDS-formatted filenames
    if 'IntendedFor' in row_dict:
        # Subject directory
        subj_dir = opj(BIDS, participant_id)
        
        # Get matching functional files
        target_runs_str = row_dict['IntendedFor'].split(' ')
        target_runs = [int(r) for r in target_runs_str]
        target_protocol = session_protocol[session_protocol['sequence_no'].isin(target_runs)]

        # Get BIDS output for each file
        get_target_path = lambda row: output_path(row, participant_id)
        get_rel_path = lambda path: os.path.relpath(path, subj_dir)
        target_full_path = target_protocol.apply(get_target_path, axis = 1)
        target_full_path = target_full_path.tolist()
        target_path = [get_rel_path(path) for path in target_full_path]
        
        # Replace IntendedFor with properly formatted paths
        row_dict['IntendedFor'] = target_path
        
    # Convert row_dict to JSON string
    row_json = json.dumps(row_dict)
    return row_json
    
for session in session_IDs[:1]:
    # Get participant ID
    participant_id = participants[participants.nims_title == session]['participant_id'][0]
    
    # Get correct protocol
    is_custom = session in custom_protocols
    protocol_type = 'CUSTOM' if is_custom else 'DEFAULT'
    protocol_ref = session if is_custom else 'default'
    session_protocol = protocol[protocol.nims_title == protocol_ref]
    
    # Assemble copy_job
    input_files = session_protocol.apply(input_path, axis=1)
    output_files = session_protocol.apply(lambda row: output_path(row, participant_id), axis=1)
    output_info = session_protocol.apply(lambda row: output_keys(row, participant_id, session_protocol), axis=1)
    
    print(input_files)

AttributeError: ("'float' object has no attribute 'split'", 'occurred at index 24')

Scratch space:

In [None]:
protocol = protocol.dropna(axis=0, thresh=3) #get rid of items that don't have a bids equivalent
protocol.run_str = ['_run-%02d' % n if not np.isnan(n) else '_' for n in protocol.run_number]
#protocol.run_number = protocol.run_number.astype('str').str.strip('.0').str.zfill(2) #Convert run int to string

#Create "bold" portion of filename
protocol['bold_filename'] = ''
protocol.loc[protocol['ANAT_or_FUNC'] == 'func', 'bold_filename'] = '_bold'

#Concatanate filepath and clean
protocol.subj_dirs = opj(BIDS, 'subj-###')
protocol["BIDS_scan_title_path"] = BIDS + "/sub-###/" + protocol.ANAT_or_FUNC + "/sub-###_" + protocol.BIDS_scan_title + protocol.run_str + protocol.bold_filename + ".nii.gz"
#protocol.BIDS_scan_title_path = protocol.BIDS_scan_title_path.str.replace('_run-nan', '') #For items that don't have runs

#Create list for NIMS -> bids conversion
NIMS_protocol_filenames = protocol.NIMS_scan_title.tolist() #Convert protocol scan titles to list
NIMS_BIDS_conversion = protocol[["NIMS_scan_title","BIDS_scan_title_path"]]

def check_against_protocol(participants,protocol): 
	
	all_files_correct = True
	
	for index, row in participants.iterrows():

		try:
			pdb.set_trace()
			NIMS_participant = opj(NIMS, row.nims_title)
			NIMS_participant_subdirs = [opj(NIMS_participant, d) for d in os.listdir(NIMS_participant)]
			NIMS_participant_subdirs = [d for d in NIMS_participant_subdirs if os.path.isdir(d)]

			for item in set(NIMS_protocol_filenames):
				protocol_dirs = [d for d in NIMS_participant_subdirs if item in d]
				directory_filenames = []
				for d in protocol_dirs:
					protocol_search = glob.glob(opj(d, "*.nii.gz"))
					directory_filenames.append(protocol_search[0])

				protocol_filenames = NIMS_BIDS_conversion[NIMS_BIDS_conversion.NIMS_scan_title.str.contains(item)]
				protocol_filenames = protocol_filenames.iloc[:,1].tolist()

				if len(directory_filenames) < len(protocol_filenames):
					print('{} : sub-{} : << {} {} files in folder {} files in protocol\n'.                    format(str(row.nims_title), str(row.participant_id), item.rjust(20), len(directory_filenames), len(protocol_filenames)))

				elif len(directory_filenames) > len(protocol_filenames):
					print('{} : sub-{} : >> {} {} files in folder {} files in protocol\n'.                    format(str(row.nims_title), str(row.participant_id), item.rjust(20), len(directory_filenames), len(protocol_filenames)))
					all_files_correct = False
					
				elif len(directory_filenames) == len(protocol_filenames):
					print('{} : sub-{} : == {} {} files in folder {} files in protocol\n'.                    format(str(row.nims_title), str(row.participant_id), item.rjust(20), len(directory_filenames), len(protocol_filenames)))

			print("------------")
		
		except:
			all_files_correct = False
			print("sub-" + str(row.participant_id) + " : -- ERROR - folder is missing \n------------")


	return all_files_correct


# In[ ]:

def write_text_files(participants, protocol): 
	
	def to_file(filename, content):
		file_path = opj(BIDS, filename + '.json') 
		with open(file_path, "w") as text_file:
			text_file.write(content)
	
	#Data Description
	dataset_description = json.dumps({"BIDSVersion": "1.0.0",                                    "License": "",                                    "Name": "dummy task name",                                   "ReferencesAndLinks": ""})
	to_file(str("dataset_description"), str(dataset_description))
	

	#Task Description
	for item in set(protocol.loc[protocol.ANAT_or_FUNC == "func", 'BIDS_scan_title']):
		full_task_name = protocol.loc[protocol.BIDS_scan_title == item, 'full_task_name']
		full_task_name = full_task_name.reset_index(drop=True)[0] #Gets first instance of RT
		
		repetition_time = protocol.loc[protocol.BIDS_scan_title == item, 'repetition_time']
		repetition_time = repetition_time.reset_index(drop=True)[0] #Gets first instance of RT
		task_json = json.dumps({"RepetitionTime": repetition_time, "TaskName" : full_task_name})

		to_file(str(item + "_bold"), str(task_json))

	#TSV
	participant_tsv = participants.loc[:, ['participant_id', 'sex', 'age']]
	participant_tsv.loc[:, 'participant_id'] = "sub-" + participant_tsv.loc[:, 'participant_id'].apply(str)
	#Had to write csv and then change it due to python 2/3 incompatability
	participant_tsv.to_csv(opj(BIDS, 'participants.tsv'), index=False)
	# Read in the file
	with open(opj(BIDS, 'participants.tsv'), 'r') as file :
		filedata = file.read()

	# Replace the target string
	filedata = filedata.replace(',', '\t')

	# Write the file out again
	with open(opj(BIDS, 'participants.tsv'), 'w') as file:
		file.write(filedata)
	
	
# In[ ]:

def convert_to_bids(participants, protocol):
	
	print("Comparing Folders to Protocol...\n")
	
	if check_against_protocol(participants,protocol): #Function returns true is everything matches
		
		print("Creating BIDS_data folder\n")
		#Make BIDS Folder
		makefolder(BIDS)
		participants.participant_id.apply(lambda x: makefolder(opj(BIDS, 'sub-' + str(x), 'anat')))
		participants.participant_id.apply(lambda x: makefolder(opj(BIDS, 'sub-' + str(x), 'func')))
		
		for index, row in participants.iterrows():
			NIMS_participant = opj(NIMS, row.nims_title)
			NIMS_participant_subdirs = [opj(NIMS_participant, d) for d in os.listdir(NIMS_participant)]
			NIMS_participant_subdirs = [d for d in NIMS_participant_subdirs if os.path.isdir(d)]

			for item in set(NIMS_protocol_filenames):
				print(item)
				if item == 'T1w_9mm_sag':
					pdb.set_trace()
				protocol_dirs = [d for d in NIMS_participant_subdirs if item in d]
				directory_filenames = []
				for d in protocol_dirs:
					protocol_search = glob.glob(opj(d, "*.nii.gz"))
					directory_filenames.append(protocol_search[0])

				protocol_filenames = NIMS_BIDS_conversion[NIMS_BIDS_conversion.NIMS_scan_title.str.contains(item)]
				protocol_filenames = protocol_filenames.iloc[:,1].tolist()

				for index, item in enumerate(directory_filenames):
					oldpath = directory_filenames[index]
					newpath = (protocol_filenames[index].replace("###", str(row.participant_id)))
					copyfile(oldpath, newpath)

					print("sub-" + str(row.participant_id) + ": ++ "+ os.path.basename(newpath).rjust(20))
			print("------------")

		print("\nCreating JSON and .tsv Files")
		
		write_text_files(participants, protocol)
	   
		print("\nDone!")


# In[ ]:

def reorient_and_skullstrip(participants, protocol):
	
	print("Comparing Folders to Protocol...\n")
	
	if check_against_protocol(participants,protocol): #Function returns true is everything matches
		
		print("Creating BIDS_data folder\n")
		#Make BIDS Folder
		makefolder(BIDS)
		participants.participant_id.apply(lambda x: makefolder(BIDS + 'sub-' + str(x) + "/anat"))
		participants.participant_id.apply(lambda x: makefolder(BIDS + 'sub-' + str(x) + "/func"))
		
		for index, row in participants.iterrows():
			#Get files
			NIMS_participant_filenames = opj(NIMS, row.nims_title)

			#Delete all non-nii.gz files from list
			NIMS_participant_filenames = [x for x in NIMS_participant_filenames if ".nii.gz"  in x]

			for item in set(NIMS_protocol_filenames):
				directory_filenames = [x for x in NIMS_participant_filenames if item in x]
				protocol_filenames = NIMS_BIDS_conversion[NIMS_BIDS_conversion.NIMS_scan_title.str.contains(item)]
				protocol_filenames = protocol_filenames.iloc[:,1].tolist()

				for index, item in enumerate(directory_filenames):
					oldpath = (NIMS + row.nims_title + "/" + directory_filenames[index])
					newpath = (protocol_filenames[index].replace("###", str(row.participant_id)))
					copyfile(oldpath, newpath)

					print("sub-" + str(row.participant_id) + ": ++ "+ os.path.basename(newpath))
			print("------------")

		print("\nCreating JSON and .tsv Files")
		
		write_text_files(participants, protocol)
	   
		print("\nDone!")

# Run conversion
#check_against_protocol(participants, protocol)
convert_to_bids(participants, protocol)

In [98]:
os.path.relpath('/share/PI/hyo/SwiSt/BIDS_data/sub-04/func/sub-04_task-swist_run-01_bold.nii.gz', '/share/PI/hyo/SwiSt/BIDS_data/sub-04')

'func/sub-04_task-swist_run-01_bold.nii.gz'