In [1]:
import os.path as op
from glob import glob

import numpy as np
import pandas as pd

In [2]:
# Datasets to screen; each name is a sub-directory of ``base_dir``.
dsets = [
    "dset-cambridge",
    "dset-camcan",
    "dset-cohen",
    "dset-dalenberg",
    "dset-dupre",
]

# (column name, tail) pairs: "upper" flags values far above the mean,
# "lower" flags values far below it.
metrics = [
    ("gsr_x", "upper"),
    ("gsr_y", "upper"),
    ("fber", "lower"),
]

# Project root and, relative to each dataset folder, the directory that is
# searched for the per-file metric tables.
base_dir = "/home/data/nbc/misc-projects/Salo_PowerReplication/"
sub_dir = "derivatives/mriqc/"

In [3]:
def _find_outlier_subjects(df, metrics, n_std=3):
    """Return participant IDs whose metric values are outliers in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``participant_id`` column and one column per metric
        named in ``metrics``.
    metrics : iterable of (str, str)
        ``(column, side)`` pairs. ``side == "upper"`` flags rows whose value
        is greater than ``mean + n_std * std``; ``side == "lower"`` flags rows
        whose value is less than ``mean - n_std * std``.
    n_std : float, optional
        Number of standard deviations that defines the cutoff. Default is 3.

    Returns
    -------
    list
        Flagged participant IDs, each listed once, in order of first
        detection (metric order, then row order).

    Raises
    ------
    ValueError
        If a ``side`` is neither ``"upper"`` nor ``"lower"``.
    """
    flagged = []
    for metric, side in metrics:
        values = df[metric].values
        center = values.mean()
        # ndarray.std() is the population standard deviation (ddof=0).
        threshold = values.std() * n_std
        if side == "upper":
            is_bad = df[metric] > center + threshold
        elif side == "lower":
            is_bad = df[metric] < center - threshold
        else:
            # Fail loudly instead of silently reusing the previous metric's
            # selection (or hitting a NameError on the first metric).
            raise ValueError(
                f"Unknown side {side!r} for metric {metric!r}; "
                "expected 'upper' or 'lower'."
            )
        flagged += df.loc[is_bad, "participant_id"].tolist()

    # A subject can fail several metrics; report it only once.
    return list(dict.fromkeys(flagged))


for dset in dsets:
    print(dset)
    dset_bad_subs = []
    target_files = sorted(glob(op.join(base_dir, dset, sub_dir, "*_bold.tsv")))
    for i_file, target_file in enumerate(target_files):
        name = op.basename(target_file).split(".")[0]
        df = pd.read_table(target_file)

        if i_file == 0:
            # Subjects-by-files table, indexed by the first file's
            # participants, with one column per non-"_bold" TSV in the folder.
            # It is filled in below but not read again within this cell.
            other_files = sorted(glob(op.join(base_dir, dset, sub_dir, "*.tsv")))
            other_files = sorted(set(other_files) - set(target_files))
            other_names = [op.basename(of).split(".")[0] for of in other_files]
            dset_df = pd.DataFrame(index=df["participant_id"], columns=other_names)

        dset_df[name] = np.nan

        all_bad_subs = _find_outlier_subjects(df, metrics)

        # Boolean mask rather than a label list: a flagged subject that is
        # absent from the first file's index must not raise a KeyError.
        dset_df.loc[dset_df.index.isin(all_bad_subs), name] = 1
        dset_bad_subs += all_bad_subs
        print(f"\t{name}: {', '.join(all_bad_subs)}")

    if dset == "dset-camcan":
        # Add one subject who fails fMRIPrep
        dset_bad_subs.append("sub-CC221585")
    dset_bad_subs = sorted(set(dset_bad_subs))

    print(f"\n{dset}: {', '.join(dset_bad_subs)}\n")

    # Rewrite participants.tsv in place with a fresh 0/1 "exclude" column.
    # Everything is read as plain strings with NA parsing disabled so the
    # other columns round-trip verbatim: pandas' default parsing would turn
    # cells such as "n/a" into NaN (written back as empty fields) and would
    # re-render integer columns that contain missing values as floats.
    participants_file = op.join(base_dir, dset, "participants.tsv")
    participants_df = pd.read_table(participants_file, dtype=str, keep_default_na=False)
    participants_df["exclude"] = (
        participants_df["participant_id"].isin(dset_bad_subs).astype(int)
    )
    participants_df.to_csv(participants_file, sep="\t", index=False)

dset-cambridge
	task-rest_echo-1_bold: 
	task-rest_echo-2_bold: sub-20494
	task-rest_echo-3_bold: sub-20494, sub-20859
	task-rest_echo-4_bold: sub-20494, sub-20859, sub-20863

dset-cambridge: sub-20494, sub-20859, sub-20863

dset-camcan
	task-movie_echo-1_bold: sub-CC221935, sub-CC610658, sub-CC221336, sub-CC510043
	task-movie_echo-2_bold: sub-CC221595, sub-CC223286
	task-movie_echo-3_bold: sub-CC221040, sub-CC221595, sub-CC223286, sub-CC321107, sub-CC610061
	task-movie_echo-4_bold: sub-CC221040, sub-CC221595, sub-CC223286, sub-CC321107, sub-CC320336, sub-CC610061
	task-movie_echo-5_bold: sub-CC321107, sub-CC320336, sub-CC610061

dset-camcan: sub-CC221040, sub-CC221336, sub-CC221585, sub-CC221595, sub-CC221935, sub-CC223286, sub-CC320336, sub-CC321107, sub-CC510043, sub-CC610061, sub-CC610658

dset-cohen
	task-bilateralfingertapping_echo-1_bold: 
	task-bilateralfingertapping_echo-2_bold: 
	task-bilateralfingertapping_echo-3_bold: 
	task-bilateralfingertapping_echo-4_bold: 

dset-cohen: