In [1]:
import os
import pandas as pd
import numpy as np
import pcntoolkit as ptk 
from pcntoolkit.util.utils import create_design_matrix

# Shared filesystem locations used by the rest of the notebook.
root_dir = '/project_cephfs/3022017.06/ENIGMA_ANX/Z_stat/'

# Brain mask handed to the NIfTI loader when reading the response images.
mask_nii = '/opt/fmriprep/templateflow/tpl-MNI152NLin2009cAsym/tpl-MNI152NLin2009cAsym_res-02_desc-brain_mask.nii.gz'

# Input files (metadata CSVs, 4D NIfTI images) are read from <root_dir>/data/.
data_dir = os.path.join(root_dir, 'data/')

# Generated files (covariate and response matrices) are written directly
# into the project root.
proc_dir = root_dir

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#############################################
# read the training and test metadata tables
#############################################
# Both CSVs sit in data_dir; df_tr / df_te are the frames the covariate
# configuration cell selects its columns and site labels from.
df_tr, df_te = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('metadata_tr.csv', 'metadata_te.csv')
)

# Make sure that all the columns are numeric, not strings
#columns_to_convert = ['Healthy_or_patient', 'Age', 'Sex', 'Trauma_exposed', 'MRI', 'Instructions', 
#                      'Precond_number_trials', 'Multiple_CSplus', 'Multiple_CSminus', 
#                      'CS_type_neutral_faces', 'CS_type_neutral_pictures', 'CS_type_neutral_male_avatar', 
#                      'CS_type_snakes_spiders', 'CS_type_gabor_patch', 'CS_type_animal_tool', 
#                      'CS_type_affective_faces_pictures', 'CS_type_humanoic_characters', 
#                      'Number_CSplus_cond', 'Number_CSminus_cond', 'Reinforcement_Rate',
#                      'US_type_electric_shock', 'US_type_auditory', 'US_type_visceral', 
#                      'US_type_thermal', 'Average_ITI', 'Average_ISI'] #'Reinforcing_rate'

# Count NaN values before conversion
#nan_before = df_tr[columns_to_convert].isna().sum()
#nan_before = df_te[columns_to_convert].isna().sum()

# Convert the specified columns to numeric, coercing errors to NaN
#df_tr[columns_to_convert] = df_tr[columns_to_convert].apply(pd.to_numeric, errors='coerce')
#df_te[columns_to_convert] = df_te[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Count NaN values after conversion
#nan_after = df_tr[columns_to_convert].isna().sum()
#nan_after = df_te[columns_to_convert].isna().sum()

# Calculate the number of NaN values introduced
#nan_introduced = nan_after - nan_before
#nan_introduced = nan_after - nan_before

# Display the results
#print("NaN values introduced in each column:")
#print(nan_introduced)

#print(df_te)

In [3]:
######################
# Configure covariates
######################
# Builds the covariate design matrices for the training and test sets and
# writes them to proc_dir. Requires df_tr and df_te (the metadata tables
# loaded from metadata_tr.csv / metadata_te.csv) to exist already, so the
# metadata-loading cell must be run before this one.

# design matrix parameters
# Age boundaries passed to create_design_matrix together with the 'bspline'
# basis. The original note records the "REAL" boundaries as 8 and 66; the
# values used here extend them by 5 on each side.
xmin = 3   # REAL: 8
xmax = 71  # REAL: 66

# Metadata columns used as covariates, in the order they enter the matrix.
cols_cov = [
    "Age",
    "Sex",
    "MRI",
    "Instructions",
    "Precond_number_trials",
    "Multiple_CSplus",
    "Multiple_CSminus",
    "CS_type_neutral_faces",
    "CS_type_neutral_pictures",
    "CS_type_neutral_male_avatar",
    "CS_type_snakes_spiders",
    "CS_type_gabor_patch",
    "CS_type_animal_tool",
    "CS_type_affective_faces_pictures",
    "CS_type_humanoic_characters",
    "Number_CSplus_cond",
    "Number_CSminus_cond",
    "Reinforcing_rate",
    "US_type_electric_shock",
    "US_type_auditory",
    "US_type_visceral",
    "US_type_thermal",
    "Average_ITI",
    "Average_ISI",
    "Potential_US_confound",
]

# Sorted list of the distinct site labels found in the TRAINING metadata.
# It is handed to the test-set call as `all_sites` below.
site_ids = sorted(set(df_tr['Group_Dataset'].to_list()))

print('configuring covariates ...')
X_tr = create_design_matrix(df_tr[cols_cov], site_ids = df_tr['Group_Dataset'],
                            basis = 'bspline', xmin = xmin, xmax = xmax)
print(X_tr)
X_te = create_design_matrix(df_te[cols_cov], site_ids = df_te['Group_Dataset'], all_sites=site_ids,
                            basis = 'bspline', xmin = xmin, xmax = xmax)

# A model fitted on X_tr can only be applied to X_te if both matrices have
# the same number of columns. Fail loudly here rather than saving files
# that do not match (e.g. if the test metadata contains a site label that
# never occurs in the training metadata).
if X_tr.shape[1] != X_te.shape[1]:
    raise ValueError(
        'train/test design matrices have different numbers of columns: '
        + str(X_tr.shape[1]) + ' vs ' + str(X_te.shape[1])
        + ' (check that every test site in Group_Dataset also occurs in the training data)'
    )

# Save both design matrices as text files in proc_dir.
cov_file_tr = os.path.join(proc_dir, 'cov_bspline_tr.txt')
cov_file_te = os.path.join(proc_dir, 'cov_bspline_te.txt')
ptk.dataio.fileio.save(X_tr, cov_file_tr)
ptk.dataio.fileio.save(X_te, cov_file_te)

configuring covariates ...
[[1.00000000e+00 1.80000000e+01 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 2.20000000e+01 1.00000000e+00 ... 2.71388832e-04
  0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 2.20000000e+01 0.00000000e+00 ... 2.71388832e-04
  0.00000000e+00 0.00000000e+00]
 ...
 [1.00000000e+00 5.20000000e+01 1.00000000e+00 ... 5.96393921e-01
  1.71738245e-01 0.00000000e+00]
 [1.00000000e+00 2.90000000e+01 1.00000000e+00 ... 2.47303073e-02
  0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 4.40000000e+01 1.00000000e+00 ... 4.16598819e-01
  1.74536943e-02 0.00000000e+00]]


In [5]:
#########################
# configure response data
#########################
## load the response data as nifti - Train
# The training responses are split over several 4D NIfTI files in data_dir;
# they are loaded one by one and stacked along the first axis.
data_nii_tr = [
    os.path.join(data_dir, nii_name)
    for nii_name in ('ENIGMA_FC_tr_1.nii.gz', 'ENIGMA_FC_tr_2.nii.gz')
]

# load the response data as nifti
print('loading wholebrain response data ...')
x_tr = None
for i, f in enumerate(data_nii_tr):
    print('loading study', i, '[', f, '] ...')
    # Load each file exactly once: the masked load of a whole-brain 4D image
    # is expensive, so the same array is used both for the shape report and
    # for the concatenation. The transpose puts the masked voxels on the
    # second axis (see the printed shapes).
    # Unmasked alternative kept from the original notebook:
    #   ptk.dataio.fileio.load_nifti(f, mask=None, vol=False).T
    x_study = ptk.dataio.fileio.load(f, mask=mask_nii, vol=False).T
    print(x_study.shape)
    if x_tr is None:
        x_tr = x_study
    else:
        x_tr = np.concatenate((x_tr, x_study))
        print(x_tr.shape)

# and write out as pkl
resp_file_tr = os.path.join(proc_dir,'resp_tr.pkl')
ptk.dataio.fileio.save(x_tr, resp_file_tr)

loading wholebrain response data ...
loading study 0 [ /project_cephfs/3022017.06/ENIGMA_ANX/Z_stat/data/ENIGMA_FC_tr_1.nii.gz ] ...
(447, 235840)
loading study 1 [ /project_cephfs/3022017.06/ENIGMA_ANX/Z_stat/data/ENIGMA_FC_tr_2.nii.gz ] ...
(447, 235840)
(894, 235840)


In [6]:
## load the response data as nifti - Test
# The test responses are split over several 4D NIfTI files in data_dir;
# they are loaded one by one and stacked along the first axis.
data_nii_te = [
    os.path.join(data_dir, nii_name)
    for nii_name in ('ENIGMA_FC_te_1.nii.gz', 'ENIGMA_FC_te_2.nii.gz')
]

# load the response data as nifti
print('loading wholebrain response data ...')
x_te = None
for i, f in enumerate(data_nii_te):
    print('loading study', i, '[', f, '] ...')
    # Load each file exactly once: the masked load of a whole-brain 4D image
    # is expensive, so the same array is used both for the shape report and
    # for the concatenation. The transpose puts the masked voxels on the
    # second axis (see the printed shapes).
    # Unmasked alternative kept from the original notebook:
    #   ptk.dataio.fileio.load_nifti(f, mask=None, vol=False).T
    x_study = ptk.dataio.fileio.load(f, mask=mask_nii, vol=False).T
    print(x_study.shape)
    if x_te is None:
        x_te = x_study
    else:
        x_te = np.concatenate((x_te, x_study))
        print(x_te.shape)

# and write out as pkl
resp_file_te = os.path.join(proc_dir,'resp_te.pkl')
ptk.dataio.fileio.save(x_te, resp_file_te)

loading wholebrain response data ...
loading study 0 [ /project_cephfs/3022017.06/ENIGMA_ANX/Z_stat/data/ENIGMA_FC_te_1.nii.gz ] ...
(323, 235840)
loading study 1 [ /project_cephfs/3022017.06/ENIGMA_ANX/Z_stat/data/ENIGMA_FC_te_2.nii.gz ] ...
(323, 235840)
(646, 235840)
