# Evaluate completion of fMRIPrep on datasets

In [1]:
import os.path as op
from glob import glob
from shutil import rmtree

import pandas as pd

In [2]:
# Constants
project_dir = "/home/data/nbc/misc-projects/Salo_PowerReplication/"


def _expected_outputs(task_entities, n_echoes):
    """Build the list of per-subject files expected under derivatives/fmriprep.

    Parameters
    ----------
    task_entities : str
        Entities placed between the subject label and the echo entity,
        e.g. "task-rest" or "task-rest_run-1".
    n_echoes : int
        Number of echoes; one partially preprocessed BOLD file is expected
        for each echo index from 1 to ``n_echoes``.

    Returns
    -------
    list of str
        The HTML report template followed by one BOLD template per echo.
        Each entry still contains a literal ``{sub}`` placeholder to be
        filled in later with ``str.format(sub=...)``.
    """
    # Doubled braces keep "{sub}" as a placeholder after this first format pass.
    bold_template = (
        "{{sub}}/func/{{sub}}_{entities}_echo-{echo}"
        "_space-scanner_desc-partialPreproc_bold.nii.gz"
    )
    echo_files = [
        bold_template.format(entities=task_entities, echo=echo)
        for echo in range(1, n_echoes + 1)
    ]
    return ["{sub}.html"] + echo_files


# Dataset folder name -> templates of the files every subject should have.
dsets = {
    "dset-camcan": _expected_outputs("task-movie", 5),
    "dset-cambridge": _expected_outputs("task-rest", 4),
    "dset-dupre": _expected_outputs("task-rest_run-1", 3),
    "dset-dalenberg": _expected_outputs("task-images", 3),
    "dset-cohen": _expected_outputs("task-bilateralfingertapping", 4),
}

In [3]:
# Find failed subjects
# A subject is flagged as failed when at least one of the files expected for
# its dataset is absent from derivatives/fmriprep.
for dset, target_files in dsets.items():
    print(dset)
    dset_dir = op.join(project_dir, dset)
    deriv_dir = op.join(dset_dir, "derivatives/fmriprep/")

    if op.isdir(deriv_dir):
        # participants.tsv supplies the subject IDs to check.
        participants_file = op.join(dset_dir, "participants.tsv")
        participants_df = pd.read_table(participants_file)
        subject_list = participants_df["participant_id"].tolist()

        failed_subjects = [
            sub_id
            for sub_id in subject_list
            if any(
                not op.isfile(op.join(deriv_dir, template.format(sub=sub_id)))
                for template in target_files
            )
        ]

        if failed_subjects:
            print("\t" + "\n\t".join(failed_subjects))

        # Keep the CamCAN failures around; later cells use them for clean-up.
        # This is only assigned when the CamCAN derivatives folder exists.
        if dset == "dset-camcan":
            camcan_failed = failed_subjects
    else:
        print("\tDataset not yet processed.")

dset-camcan
	sub-CC120409
	sub-CC120469
	sub-CC120470
	sub-CC120550
	sub-CC120640
	sub-CC120727
	sub-CC120764
	sub-CC120795
	sub-CC221585
	sub-CC410243
	sub-CC410248
	sub-CC410251
	sub-CC410284
	sub-CC410287
	sub-CC410289
	sub-CC410297
	sub-CC410323
	sub-CC510076
	sub-CC510086
	sub-CC510115
	sub-CC510161
	sub-CC510163
	sub-CC510208
	sub-CC510220
	sub-CC510226
	sub-CC510473
	sub-CC510474
	sub-CC510486
	sub-CC510534
	sub-CC510548
	sub-CC510551
	sub-CC510568
	sub-CC510609
	sub-CC610631
	sub-CC610658
	sub-CC610671
	sub-CC620005
	sub-CC620026
	sub-CC620044
	sub-CC620073
	sub-CC620085
	sub-CC620821
	sub-CC620919
	sub-CC621184
	sub-CC621199
	sub-CC621248
	sub-CC621284
	sub-CC621642
	sub-CC710037
	sub-CC710088
	sub-CC710099
	sub-CC710131
	sub-CC710154
	sub-CC710176
	sub-CC710313
	sub-CC710342
	sub-CC710350
	sub-CC710382
	sub-CC710416
	sub-CC710429
	sub-CC710446
	sub-CC710462
	sub-CC710486
	sub-CC710494
	sub-CC710518
	sub-CC710548
	sub-CC710551
	sub-CC710566
	sub-CC710591
	sub-CC710664
	sub-CC7

In [4]:
# Find the missing files
# For every processed dataset, print the basename of each expected derivative
# that is absent, one per line under the dataset name. Unlike the failed-subject
# summary, this shows which specific files each subject is missing.
for dset, target_files in dsets.items():
    print(dset)
    dset_dir = op.join(project_dir, dset)
    deriv_dir = op.join(dset_dir, "derivatives/fmriprep/")
    if not op.isdir(deriv_dir):
        print("\tDataset not yet processed.")
        continue

    participants_file = op.join(dset_dir, "participants.tsv")
    participants_df = pd.read_table(participants_file)
    subject_list = participants_df["participant_id"].tolist()
    # No result list is collected here: this cell only reports, and it must not
    # rebind `failed_subjects` computed by the failed-subject cell.
    for sub in subject_list:
        for temp in target_files:
            tf = op.join(deriv_dir, temp.format(sub=sub))
            if not op.isfile(tf):
                print("\t{}".format(op.basename(tf)))

dset-camcan
	sub-CC120409.html
	sub-CC120409_task-movie_echo-1_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120409_task-movie_echo-2_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120409_task-movie_echo-3_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120409_task-movie_echo-4_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120409_task-movie_echo-5_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120469.html
	sub-CC120469_task-movie_echo-1_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120469_task-movie_echo-2_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120469_task-movie_echo-3_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120469_task-movie_echo-4_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120469_task-movie_echo-5_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120470.html
	sub-CC120470_task-movie_echo-1_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC120470_task-movie_echo-2_space-scanner_desc-partialPreproc_bold.ni

	sub-CC610631.html
	sub-CC610631_task-movie_echo-1_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610631_task-movie_echo-2_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610631_task-movie_echo-3_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610631_task-movie_echo-4_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610631_task-movie_echo-5_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610658.html
	sub-CC610658_task-movie_echo-1_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610658_task-movie_echo-2_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610658_task-movie_echo-3_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610658_task-movie_echo-4_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610658_task-movie_echo-5_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610671.html
	sub-CC610671_task-movie_echo-1_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC610671_task-movie_echo-2_space-scanner_desc-partialPreproc_bold.nii.gz
	sub-CC

dset-dupre
dset-dalenberg
dset-cohen


## Clean up the working directory for CamCAN

In [5]:
# Reload the CamCAN participants.tsv so the cells below do not depend on
# whichever dataset the earlier loops happened to read last.
camcan_dir = op.join(project_dir, "dset-camcan")
participants_file = op.join(camcan_dir, "participants.tsv")
participants_df = pd.read_table(participants_file)

In [6]:
# Number of CamCAN subjects missing at least one expected fMRIPrep output
# (`camcan_failed` is set by the failed-subject loop for "dset-camcan").
len(camcan_failed)

133

In [7]:
# Successful subjects are all participant IDs that were not flagged as failed,
# sorted so the clean-up below runs in a stable order.
subs = participants_df["participant_id"].tolist()
camcan_successful = sorted(set(subs).difference(camcan_failed))
len(camcan_successful)

509

In [9]:
# Delete the fMRIPrep working directory of every CamCAN subject that finished
# successfully. Only `camcan_successful` is iterated, so the working
# directories of failed subjects are left in place.
# WARNING: rmtree is irreversible. Make sure `camcan_successful` was computed
# from a fresh run of the cells above before executing this cell.
wf_dir = "/scratch/nbc/tsalo006/Salo_PowerReplication/dset-camcan/fmriprep-20.2.1/fmriprep_wf"
for sub in camcan_successful:
    # Strip only the leading "sub" entity key; maxsplit=1 keeps the rest of the
    # label intact even if it were to contain another hyphen.
    sub_name = sub.split("-", 1)[1]
    sub_dir = f"single_subject_{sub_name}_wf"
    sub_wf_dir = op.join(wf_dir, sub_dir)
    if op.isdir(sub_wf_dir):
        rmtree(sub_wf_dir)
    else:
        # Nothing to delete: no working directory exists under this name.
        print(f"Not removing {sub_name}")