Gather necessary pre-requisites:

In [2]:
# import everything you need
import logging
import h5py  # for interacting with HDF5 binary data format
import nipype.pipeline.engine as pe  # pipeline engine
import nipype.interfaces.utility as util  # utility
from nipype.interfaces.base import Bunch  # Bunch objects fro model specification etc.
import nipype.algorithms.modelgen as model  # model generation
import nipype.algorithms.rapidart as ra  # artifact detection
import nipype.interfaces.fsl as fsl
from niflow.nipype1.workflows.fmri.fsl import create_susan_smooth
import nipype.interfaces.spm as spm
from nipype.utils.filemanip import loadpkl  # to load pklz files (gzipped python storage archives)
import nilearn.image as ni_img
import nilearn.plotting as ni_plt
import nilearn.glm as ni_glm
from nilearn import datasets
import nltools.mask as nl_mask  # to work with my ROI material
import nibabel as nb
import numpy as np
import scipy as sp
import pandas as pd
# activate inline magics
%matplotlib inline 
import matplotlib.pyplot as plt
import matplotlib.image as mpl_img
import os, operator, re, json, random
from functools import reduce
from itertools import zip_longest, tee



Write useful functions

In [3]:
# write update function for file list to data dictionary
def collect_files():

    '''Collect all current files in data-associated directories and return them in lists by level'''
    # collect image files
    files = sorted([os.path.join(subdir, content)
                   for subdir in FED_dirs
                   for content in os.listdir(subdir)
                   if re.match(r'.*.(nii|nii.gz|json)$', content)])
    # for tissue probability and WM boundaries too (These are the originals! Processed files will be saved in FED_dir^^)
    tpms = sorted([os.path.join(subdir, tpm)
                  for subdir in TPM_dirs
                  for tpm in os.listdir(subdir)
                  if re.match(r'^(c1|c2).*.nii$', tpm)])
    # collect relevant fMRI parameter files group analysis files
    par_files = sorted([os.path.join(subdir, content)
                       for subdir in FED_dirs
                       for content in os.listdir(subdir)
                       if re.match(r'.*(par|matrix)$', content)])

    # append to files
    files.extend(tpms)
    files.extend(par_files)

    # collect relevant fMRI parameter files group analysis files
    tpm_group = sorted([os.path.join(subdir, content)
                       for subdir in VBM_GROUP_dirs
                       for content in os.listdir(subdir)
                       if re.match(r'.*_6.nii$', content)])

    fmri_group = sorted([os.path.join(subdir, content)
                        for subdir in fMRI_GROUP_dirs
                        for content in os.listdir(subdir)
                        if re.match(r'.*(.nii).*', content)])


    # collect general/group level files to data
    content_group = tpm_group + fmri_group


    # return the updated file lists for group and collective subjects
    return files, content_group

def update_files(files):
    '''Update files in data using the collect_files() function or any other touple of the form: [allfilespersubject, groupfiles]'''
    collsubs = files[0]
    group = files[1]
    # update group files
    data["general"] = group
    # update subject files
    for sub in FEDs:
        # extend content
        content_sub = [file for file in collsubs if re.match(fr'.*{sub}.*', file)]
        # update files in data
        data[sub]["files"] = content_sub
    # remove all unnecessary variables
    del collsubs, group, files

In [4]:
def build_data():
    '''Build the entire structure for data, general (for all) and per subject in FED'''

    # collect image files
    files = sorted([os.path.join(subdir, content)
                   for subdir in FED_dirs
                   for content in os.listdir(subdir)
                   if re.match(r'.*.(nii|nii.gz|json)$', content)])
    # for tissue probability and WM boundaries too (These are the originals! Processed files will be saved in FED_dir^^)
    tpms = sorted([os.path.join(subdir, tpm)
                   for subdir in TPM_dirs
                   for tpm in os.listdir(subdir)
                   if re.match(r'^(c1|c2).*.nii$', tpm)])
    # collect relevant fMRI parameter files group analysis files
    par_files = sorted([os.path.join(subdir, content)
                       for subdir in FED_dirs
                       for content in os.listdir(subdir)
                       if re.match(r'.*(par|matrix)$', content)])

    # extend files by the following
    files.extend(tpms)
    files.extend(par_files)

    # collect relevant general/group level files
    tpm_group = sorted([os.path.join(subdir, content)
                       for subdir in VBM_GROUP_dirs
                       for content in os.listdir(subdir)
                       if re.match(r'.*_6.nii$', content)])

    fmri_group = sorted([os.path.join(subdir, content)
                        for subdir in fMRI_GROUP_dirs
                        for content in os.listdir(subdir)
                        if re.match(r'.*(.nii).*', content)])


    # create a dictionary storing all files/subject and files concerning the entire analysis
    data = {}

    # collect general/group level files to data
    data["general"] = fmri_group + tpm_group

    # assign data from files via subject IDs
    for fed in FED_dirs:
        ID = fed.rsplit('/', 1)[1]
        # define partition for each subject
        subject = {"files": [], "parameters": {}}
        content = []
        for file in files:
            if re.match(fr'(.*{ID}.*)', file):
                content.append(file)
        subject["files"].extend(content)
        data[ID] = subject


    # create shortcut to FEDs
    FEDs = sorted([key for key in data.keys() if key != "general"])


    # finally, return the resulting structure and FED list
    return data, FEDs

# Build the initial data structure that holds all relevant information ("I" - prefix to all steps related)

In [5]:
# define base diectory for analysis toolboxes
fsldir = "/usr/share/fsl/5.0/"
spmdir = "/opt/spm12-r7219/spm12_mcr/spm12/"
fsldatadir = f"{fsldir}data/"  # data directory in bash's $FSLDIR
fsletcdir = f"{fsldir}etc/"  # this FSL directory contains other useful stuff. like alternative schedules etc.
tpmdir = f"{spmdir}tpm/"  # data directory for SPM's TPM material

# for relevant data files
scriptdatadir = "/home/martin/FED/"
credentials = f"{scriptdatadir}FED_Subject_Covariates.xls"
modelinfo = f"{scriptdatadir}FED_Day_2_modelparams.xls"

# for the basic data directories
basedir = "/fMRI/"
vbmdir = "/VBM/"

# define fMRI group-level directories
fMRI_GROUP_dirs = sorted([os.path.join(basedir, GROUP)
                          for GROUP in os.listdir(basedir)
                          if os.path.isdir(os.path.join(basedir, GROUP))
                          and re.match(r'.*fMRI/(?!(FED|Nipype2FSL)).*', os.path.join(basedir, GROUP))])
# define VBM group-level directories
VBM_GROUP_dirs = sorted([os.path.join(vbmdir, GROUP)
                          for GROUP in os.listdir(vbmdir)
                          if os.path.isdir(os.path.join(vbmdir, GROUP))
                          and re.match(r'.*VBM/.*template.*', os.path.join(vbmdir, GROUP))])
# define functional subject directories
FED_dirs = sorted([os.path.join(basedir, FED)
                  for FED in os.listdir(basedir)
                  if os.path.isdir(os.path.join(basedir, FED))
                  and re.match(r'.*fMRI/(?!(MNI|Nipype2FSL|Grouplevel)).*', os.path.join(basedir, FED))])
# define VBM subject directories
TPM_dirs = sorted([os.path.join(vbmdir, FED)
                  for FED in os.listdir(vbmdir)
                  if os.path.isdir(os.path.join(vbmdir, FED))
                  and re.match(r'.*VBM/(?!(DARTEL|VBM).*)', os.path.join(vbmdir, FED))])
TPM_dirs = sorted([os.path.join(subdir, content)
                   for subdir in TPM_dirs
                   for content in os.listdir(subdir)
                   if os.path.isdir(os.path.join(subdir, content))
                   and re.match(r'.*newsegment.*', content)])

In [6]:
# build the initial data structure
data = build_data()[0]

# create a shortcut list for all FED subjects
FEDs = build_data()[1]

# safe an image of the current namespace to clean up at the end of this script
data_varspace_init = dir()

I. Get relevant file parameters

In [7]:
# define exclusion set for FEDs that do not meet the requirements
#GRE_excluded = []
EPI_excluded = []
T1_excluded = []

# FED control
for subject in FEDs:
    # collect relevant files/FED for parameter extraction
    #GREs = [file for file in data[subject]["files"]
            #if re.match(r'(.*(_e1|_e2(?!_ph)).*.json)', file)]
    EPIs = [file for file in data[subject]["files"]
            if re.match(r'(.*(FMRI).*.json)', file)]
    T1s = [file for file in data[subject]["files"]
           if re.match(r'(.*(T1_MPRAGE).*.json)', file)]

    # control number of files
    if len(EPIs) != 1:
        print(subject, "\n", f"Not exactly one FMRI sequence to read out ({len(EPIs)}).... investigate", "\n")
        EPI_excluded.append(subject)

    if len(T1s) != 1:
        print(subject, "\n", f"Not exactly one T1 sequence to read out ({len(T1s)}).... investigate", "\n")
        T1_excluded.append(subject)

    # There seem to be several omissions in the GRER_FIELD data -> investigate
#    elif len(GREs) < 2:
#        print(subject, "\n", f"Not enough GRE_FIELD sequences to read out ({len(GREs)}).... exclude/investigate", "\n")
#        GRE_excluded.append(subject)

    # if there is more than one pair of magnitude images (e1+e2),
    # take the first one as they are likely to be closer to the fMRI acquisition
#    elif len(GREs) > 2:
#        print(subject, "\n", "More than 2 GRE_FIELD sequences",
#              "("+str(len(GREs))+")", ".... select only first two for TE extraction", "\n")


# control presence of other relevant file content (presence of condition onset times, covariates of interest etc.)
# read relevant content
ID_cre = pd.read_excel(credentials, sheet_name="analysis", usecols=['Sub Num FED_XXX'])
ID_mod = pd.read_excel(modelinfo, sheet_name="analysis", usecols=['Sub Num FED_XXX'])
# bring content to list
ID_cre = ID_cre['Sub Num FED_XXX'].tolist()
ID_mod = ID_mod['Sub Num FED_XXX'].tolist()
# model values are not unique (onset timings -> multiple entries^^) -> FIX for subject ID control 
ID_mod = list(set(ID_mod))
# now for the control
for subject, cre, mod in zip_longest(FEDs, ID_cre, ID_mod):
    if subject[-1] != cre != mod:
        print("subject(s) missing!")
        print("First line of list-alignment:  ",subject,f"FED00{cre}",f"FED00{mod}")
        print("Control relevant files!")
        # control, but assume that the smaller value has to be eliminated,
        # because, apparently, there are no values for it in (at least) one data file
        FEDexcl = set(sorted([int(subject[-1]), cre, mod])[:-1])
        FEDexcl = [f"FED00{i}" for i in FEDexcl]
        # exclude respective subjects' functionals and structurals
        EPI_excluded.extend(FEDexcl)
        T1_excluded.extend(FEDexcl)
        break

# exclude FEDs based on file criteria
print("\n\n", "The following subjects where excluded from further analysis due to false file numbers or missing data: ",\
      "\n", "EPI: ", sorted(EPI_excluded), "T1: ", sorted(T1_excluded))

# update data and FEDs based on prior exclusion
[data.pop(sub) for sub in FEDs if sub in T1_excluded and sub in EPI_excluded]
FEDs = [sub for sub in FEDs if sub not in T1_excluded and sub not in EPI_excluded]
# re-define FED_dirs accordingly
FED_dirs = sorted([os.path.join(basedir, f"{fed}/") for fed in FEDs])

subject(s) missing!
First line of list-alignment:   FED006 FED006 FED007
Control relevant files!


 The following subjects where excluded from further analysis due to false file numbers or missing data:  
 EPI:  ['FED006'] T1:  ['FED006']


In [8]:
# control new subject list
print("The new subject list after initial data and image file checkup:\n")
print(FEDs)
print("\n")
print("That leaves a total of ", len(FEDs), "remaining subjects.")
print("\n")

The new subject list after initial data and image file checkup:

['FED007', 'FED008', 'FED009', 'FED010', 'FED011', 'FED012', 'FED013', 'FED014', 'FED015', 'FED016', 'FED017', 'FED018', 'FED019', 'FED020', 'FED021', 'FED022', 'FED023', 'FED024', 'FED025', 'FED026', 'FED027', 'FED028', 'FED029', 'FED030', 'FED031', 'FED032', 'FED033', 'FED034', 'FED035', 'FED036', 'FED037', 'FED038', 'FED039', 'FED040', 'FED041', 'FED042', 'FED043', 'FED044', 'FED045', 'FED046', 'FED047', 'FED048', 'FED049', 'FED050', 'FED051', 'FED052', 'FED053', 'FED054', 'FED055', 'FED056', 'FED057', 'FED058', 'FED059', 'FED060', 'FED061', 'FED062', 'FED063', 'FED064', 'FED065', 'FED066', 'FED067', 'FED068']


That leaves a total of  62 remaining subjects.




In [9]:
# Read json info data for all relevant parameters
# define info of interest
#GREspecs = ["EchoTime", "PhaseEncodingDirection"]
EPIspecs = ["EchoTime", "RepetitionTime", "EchoTrainLength",
            "PhaseEncodingSteps", "PhaseEncodingDirection",
            "DwellTime", "TotalReadoutTime", "EffectiveEchoSpacing", "PixelBandwidth"]
T1specs = ["EchoTime", "RepetitionTime",
           "PhaseEncodingSteps", "InversionTime", "PixelBandwidth"]
# create list to record missing parameters
missing_params = []
# create list to record multiple parameters
multiple_params = []

# Now for the parameter extraction
for subject in FEDs:
    # collect relevant files/FED for parameter extraction
    #GREs=[file for file in data[subject]["files"]
           #if re.match(r'(.*(_e1|_e2(?!_ph)).*.json)', file)]
    T1s = [file for file in data[subject]["files"]
           if re.match(r'(.*(T1_MPRAGE).*.json)', file)]
    EPIs = [file for file in data[subject]["files"]
            if re.match(r'(.*(FMRI).*.json)', file)]

    # if there is more than one pair of magnitude images (e1+e2),
    # take the first one as they are likely to be closer to the fMRI acquisition
#    if len(GREs) > 2:
#        print(subject, "\n", "More than 2 GRE_FIELD sequences",
#              "("+str(len(GREs))+")", ".... selecting first two for TE extraction", "\n")
#        GREs = GREs[0:2]

    # T1 PARAMETERS
    # collect parameters from the respective scan's json file
    for file in T1s:
        with open(file) as json_file:
            info = json.load(json_file)
            for param in T1specs:
                # put params into data; use capital letters of parameter name as indicator in key
                key = f"T1_{''.join([char for char in param if char.isupper()])}"
                # if key does not exist -> create list with parameter
                if key not in data[subject]["parameters"].keys():
                    try:
                        data[subject]["parameters"][key] = info[param]
                    # if parameter does not exist in json file -> missing_params
                    # append to GRE_excluded
                    except KeyError:
                        #print(f"{subject}'s json file does not specify {param}", "\n",
                             #"noting issue ... ")
                        missing_params.append(f"{subject}_T1-{param}")
                        T1_excluded.append(subject)
                        pass
                # if key does exist -> append parameter to the list
                if key in data[subject]["parameters"].keys():
                    # if second value and first value for the parameter don't match,
                    # there are either multiple .json files or multiple entries of the same parameter
                    if info[param] != data[subject]["parameters"][key]:
                        # print an info message
                        #print(f"{subject}'s json file does specify multiple entries under {key}:","\n",
                              #f'{data[subject]["parameters"][key]} and {info[param]}',"\n",
                             #"inspect files!!!")                            
                        multiple_params.append(f"{subject}_T1-{param}")
                    elif info[param] == data[subject]["parameters"][key]:
                        pass

    # EPI PARAMETERS
    # collect parameters from the respective scan's json file
    for file in EPIs:
        with open(file) as json_file:
            info = json.load(json_file)
            for param in EPIspecs:
                # put params into data; use capital letters of parameter name as indicator in key
                key = f"EPI_{''.join([char for char in param if char.isupper()])}"
                # if key does not exist -> create list with parameter
                if key not in data[subject]["parameters"].keys():
                    try:
                        data[subject]["parameters"][key] = info[param]
                    # if parameter does not exist in json file -> missing_params
                    # append to EPI_excluded
                    except KeyError:
                        #print(f"{subject}'s json file does not specify {param}", "\n",
                             #"noting issue ... ")
                        missing_params.append(f"{subject}_EPI-{param}")
                        EPI_excluded.append(subject)
                        pass
                # if key does exist -> append parameter to the list
                if key in data[subject]["parameters"].keys():
                    # if second value and first value for the parameter don't match,
                    # there are either multiple .json files or multiple entries of the same parameter
                    if info[param] != data[subject]["parameters"][key]:
                        # print an info message
                        #print(f"{subject}'s json file does specify multiple entries under {key}:","\n",
                              #f'{data[subject]["parameters"][key]} and {info[param]}',"\n",
                             #"inspect files!!!")                              
                        multiple_params.append(f"{subject}_EPI-{param}")
                    elif info[param] == data[subject]["parameters"][key]:
                        pass

# show missing parameters from json files:
print("The following parameters where not available from subjects' json files:",
      "\n",sorted(missing_params))
# show duplicate parameters from json files:
print("The following parameters where available multiple times from subjects' json files:",
      "\n",sorted(multiple_params))

# exclude FEDs based on file criteria
#print("\n\n", "The following subjects where excluded from further analysis due to false file numbers or missing parameters: ",\
#      "\n", "EPI: ", sorted(EPI_excluded), "\n\n",
#     len(EPI_excluded), "  subjects in total")

# update data and FEDs based on prior exclusion
#[data.pop(sub) for sub in FEDs if sub in T1_excluded and sub in EPI_excluded]
#FEDs=[sub for sub in FEDs if sub not in T1_excluded and sub not in EPI_excluded]

The following parameters where not available from subjects' json files: 
 ['FED063_EPI-DwellTime', 'FED064_EPI-DwellTime', 'FED065_EPI-DwellTime', 'FED066_EPI-DwellTime']
The following parameters where available multiple times from subjects' json files: 
 []


In [10]:
# Edit the read-in parameters into analysis format
for sub in FEDs:
    # calculate necessary parameters that are not in header information
    # deltaTE for GRE_FIELD echo-based comparison
    #data[subject]["parameters"]["DeltaTE"]= reduce(operator.sub, data[subject]["parameters"]["GRE_ET"])*-1*1000

    # transfer phase encoding directions from field axes to voxel axes
    # control field axes values
    #print(sub)
    #print(data[sub]["parameters"]["EPI_PED"])
    epi_phasecodedir = data[sub]["parameters"]["EPI_PED"]
    for char in epi_phasecodedir:
        if char == "i":
            data[sub]["parameters"]["EPI_PED"] = f"x{epi_phasecodedir[1:]}"
        elif char == "j":
            data[sub]["parameters"]["EPI_PED"] = f"y{epi_phasecodedir[1:]}"
        elif char == "k":
            data[sub]["parameters"]["EPI_PED"] = f"z{epi_phasecodedir[1:]}"

    # same for T1s
#    T1_phasecodedir = data[sub]["parameters"]["T1_PED"]
#    for char in T1_phasecodedir:
#        if char == "i":
#            data[sub]["parameters"]["T1_PED"] = f"x{T1_phasecodedir[1:]}"
#        elif char == "j":
#            data[sub]["parameters"]["T1_PED"] = f"y{T1_phasecodedir[1:]}"
#        elif char == "k":
#            data[sub]["parameters"]["T1_PED"] = f"z{T1_phasecodedir[1:]}"


# complete "DwellTime" entries for missing subjects
# extract FEDs and params from missing_params
mis_FED = [par.split('_', 1)[0] for par in missing_params]
mis_Par = [par.split('-', 1)[1] for par in missing_params]

# test the other FEDs' Dwelltimes and infer the missing value
for par in set(mis_Par):
    # get key to search
    key = f"EPI_{''.join([char for char in par if char.isupper()])}"
    # create list for parameter collection from all subjects' data
    collection = []
    for sub in FEDs:
        # collect all FED IDs and parameters in one list
        if key in data[sub]["parameters"]:
            collection.append([sub, data[sub]["parameters"][key]])
    # extract values for testing
    par_vals = [x[1] for x in collection]
    # if there is exactly one unique value, all values are identical
    if len(set(par_vals)) == 1:
        #print(f"The values for {par} seem to be identical ... Substituting missing parameters accordingly")
        # set that value for all empty FEDs
        rep_val = [val for val in set(par_vals)][0]
        for empty in mis_FED:
            data[empty]["parameters"][key] = rep_val
    # if there is more than one unique value, check stuff
    elif len(set(par_vals)) > 1:
        #print(f"The values for {par} are not identical ({len(set(par_vals))} different values).",
              #"\nfMRI acquisition may have been changed significantly over the course of data acquistion.")
        # assume that the omission in the json files happened during sequence alterations -> assign missing FEDs to latest group and latest value
        rep_val = [val for val in set(par_vals)][-1]
        for empty in mis_FED:
            data[empty]["parameters"][key] = rep_val

# inform about the parameter completion
print("\n", f"edited missing parameters {sorted(missing_params)} and inferred them from the other subjects.",
      "\n", "Parameter list is now full-rank.", "\n")

# transfer DwellTime to ms
for sub in FEDs:
    epi_dwelltime = data[sub]["parameters"]["EPI_DT"] * 1000
    #dwelltime_decimal = f"{epi_dwelltime:.8f}"
    data[sub]["parameters"]["EPI_DT"] = epi_dwelltime

# transfer Echotime to ms
for sub in FEDs:
    epi_echotime = data[sub]["parameters"]["EPI_ET"] * 1000
    #echotime_decimal = f"{epi_echotime:.8f}"
    data[sub]["parameters"]["EPI_ET"] = epi_echotime


 edited missing parameters ['FED063_EPI-DwellTime', 'FED064_EPI-DwellTime', 'FED065_EPI-DwellTime', 'FED066_EPI-DwellTime'] and inferred them from the other subjects. 
 Parameter list is now full-rank. 



I. Get / create relevant covariates

In [11]:
# create covariates (FSL -> "EVs")
# get sex, age, depression, time since clinical episode, severity of clinical episodes, number of clinical episodes, - data from list in .xlsx file(s)

# files are already defined ^^
# read relevant content
content_cre = pd.read_excel(credentials, sheet_name="analysis",
                            usecols = ['Sub Num FED_XXX', 'Gender', 'Age', 'BDI 22 Score'])
content_mod = pd.read_excel(modelinfo, sheet_name="analysis",
                            usecols = ['Sub Num FED_XXX', 'Condition', 'RT', 'COTcorrect'])

# sort panda dataframe according to file sequence in FEDs (account for excluded)
# define sort-by list (get FED_ID and format to fit entries in dataframe)
model_FED_ID = [i[-3:].lstrip("0") for i in FEDs]
# transform values to integers to get values in model_FED_ID
model_FED_ID = [int(i) for i in model_FED_ID]
# define a categorical variable to sort a column and corresponding lines after
content_cre['model_FED_ID'] = pd.Categorical(content_cre['Sub Num FED_XXX'],
                                             categories = model_FED_ID, ordered=True)
content_mod['model_FED_ID'] = pd.Categorical(content_mod['Sub Num FED_XXX'],
                                             categories = model_FED_ID, ordered=True)
# sort dataframes by indicator variables
content_cre.sort_values(['model_FED_ID'], inplace=True)
content_mod.sort_values(['model_FED_ID', 'Condition', 'COTcorrect'], inplace=True)

# split experimental data by subject and create separate dataframes for each
for sub, datasub in zip(FEDs, model_FED_ID):
    # add the dataframe as additional item into subject's parameter dict
    data[sub]["parameters"]["covariates"] = content_cre[content_cre['model_FED_ID'] == datasub]
    data[sub]["parameters"]["modelparams"] = content_mod[content_mod['model_FED_ID'] == datasub]

In [12]:
# create Bunch objects out of modelparams
for sub in FEDs:
    modelinfo = data[sub]["parameters"]["modelparams"]
    # break conditions down to single occurences
    conditions = modelinfo["Condition"].unique().astype(str).tolist()
    # make one list for each condition and rescale to secs
    onsets = [modelinfo[modelinfo["Condition"] == int(i)]["COTcorrect"].div(1000).tolist() for i in conditions]
    # fill with duration to length of onsets and its elements
    durations = [[1.5 for timing in range(0, len(trial))] for trial in onsets]  # what timeframe do you want to look at?
    # build Bunch
    trial_info = [Bunch(conditions = conditions,
                        onsets = onsets,
                        durations = durations)]
    # add modelparameters in data with the newly created model_info Bunch
    data[sub]["parameters"]["trialinfo"] = trial_info

# Build the changes to this data structure evoked by various stages within the pre-processing pipeline ("II" - prefix to all steps related)

II. Post-ART: Detect and remove subject outliers based on motion artefacts

In [13]:
# print length of outlier file for each subject to determine number of timepoints deemed outliers
# by the current rapidart routine
# define list of outlier numbers
art_detect = {}
outlier_nums = []

for sub, subdir in zip(FEDs, FED_dirs):
    artdetect_dir = subdir + "artdetect/"
    if os.path.exists(artdetect_dir):
        for file in os.listdir(artdetect_dir):
            if re.match(r'.*outliers.txt', file):
                outlier_file = os.path.join(artdetect_dir, file)
                with open(outlier_file) as outliers:
                    # amount of outliers is equal to the length of the outlier file
                    amount = len(outliers.readlines())
                    # for later comparison
                    art_detect[sub] = amount
                    # for present analysis
                    outlier_nums.append(amount)
                    #print(f"For {subdir}, {amount} ",
                          #"outlier timepoints have been identified at the current rapidart-settings.")
    else:
        print(f"There is no directory named {artdetect_dir} yet",
              "Check whether motion correction and outlier detection via rapidart have already been performed")

In [14]:
# write loop that executes only if the required files exist for each subject
if len(art_detect.keys()) == len(FEDs) and len(outlier_nums) == len(FEDs):
    # create z-scores from outlier numbers and determine outlying subjects to be removed from further analysis
    # transform list into np.array
    outlier_zs = np.abs(sp.stats.zscore(outlier_nums))

    # print values exceeding a custom threshold
    out_threshold = 2  # standard definition of z-based outliers
    excess_motion = outlier_zs[outlier_zs > out_threshold]

    # form a combined list (nums & zs have the same orientation and size) and identify the outlier(s)
    outlier_combined = [[a, b] for a, b in zip(outlier_nums, outlier_zs)]
    to_be_removed = [x[0] for x in outlier_combined if x[1] in excess_motion]

    # compare art_detect with to_be_removed and identify the FED subject ID
    for sub in FEDs:
        if art_detect[sub] in to_be_removed:
            print("\n")
            print(f"{sub}, with {art_detect[sub]} motion-related outliers, ",
                  f"exceeds an outlier threshold of {out_threshold}z in a normal distribution of the outlier numbers.",
                  f"\n{sub} will therefore be excluded from further analysis")
            # remove subject from data
            data.pop(sub)

# update FEDs
FEDs = [sub for sub in FEDs if sub in data.keys()]
# re-define FED_dirs accordingly
FED_dirs = sorted([os.path.join(basedir, f"{fed}/") for fed in FEDs])



FED048, with 44 motion-related outliers,  exceeds an outlier threshold of 2z in a normal distribution of the outlier numbers. 
FED048 will therefore be excluded from further analysis


In [15]:
# control new subject list after motion artefact correction
print("\n")
print("The new subject list after motion correction and additional parameter calculations:\n")
print(FEDs)
print("\n")
print("That leaves a total of ", len(FEDs), "remaining subjects.")
print("\n")



The new subject list after motion correction and additional parameter calculations:

['FED007', 'FED008', 'FED009', 'FED010', 'FED011', 'FED012', 'FED013', 'FED014', 'FED015', 'FED016', 'FED017', 'FED018', 'FED019', 'FED020', 'FED021', 'FED022', 'FED023', 'FED024', 'FED025', 'FED026', 'FED027', 'FED028', 'FED029', 'FED030', 'FED031', 'FED032', 'FED033', 'FED034', 'FED035', 'FED036', 'FED037', 'FED038', 'FED039', 'FED040', 'FED041', 'FED042', 'FED043', 'FED044', 'FED045', 'FED046', 'FED047', 'FED049', 'FED050', 'FED051', 'FED052', 'FED053', 'FED054', 'FED055', 'FED056', 'FED057', 'FED058', 'FED059', 'FED060', 'FED061', 'FED062', 'FED063', 'FED064', 'FED065', 'FED066', 'FED067', 'FED068']


That leaves a total of  61 remaining subjects.


