In [111]:
import numpy as np
import pandas as pd

from pheno_clean_funcs import AddFileNames
import os
import re
from PIL import Image
import nibabel as nib
import numpy as np

In [112]:
parent_dir = os.path.dirname(os.getcwd()) # get path to parent directory


## Phenotypic Data CSV cleaning
Phenotypic data is stored in a csv file, that requires pre-processing, for ease of reference I shall add filenames of extracted
data, into columns, and deal with any missing data.

### Raw phenotypic data

Make object containing the raw dataframe, this object instance can be worked on to produce final output, new features can be added based on extracted data.

In [113]:
raw = AddFileNames(parent_dir, 'phenotype_files', 'Phenotypic_V1_0b_preprocessed.csv')

In [114]:
raw_df = raw.df

In [115]:
raw_df.head() # at this stage only the subject identification features and  diagnosis classes are needed.

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,...,qc_rater_1,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3
0,1,50002,1,50002,PITT,no_filename,1,1,16.77,1,...,fail,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24
1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,1,...,OK,,OK,,OK,,OK,,OK,
2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,1,...,OK,,OK,,OK,,OK,,OK,
3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,2,...,OK,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,
4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,1,...,OK,,OK,,maybe,ic-parietal slight,OK,,OK,


In [116]:
st1_df = raw_df[['SITE_ID','X','SUB_ID','FILE_ID','AGE_AT_SCAN','SEX','DSM_IV_TR','DX_GROUP']] # extract necessary data

In [117]:
raw.update_df(st1_df) # update object instant dataframe with new df

In [118]:
stage_1 = raw.df # store updated df in new variable

## Adding FMRI filenames and updating File_ID field
File_ID data is neccesary  for data analysis since its an identifier for any data on a subject, and this field contains missing values. these missing values can be imputed using the fMRi data filenames.

In [119]:
fMRI_dir = os.path.join(parent_dir, 'func_ABIDE') # subdirectory containing fMRI data

In [120]:
fMRI_files= next(os.walk(fMRI_dir))[2] # acquire all fMRI files
fMRI_files = [fMRI_files[i] for i in range(len(fMRI_files)) if 'nii.gz' in fMRI_files[i]] # limit to nifti files

In [121]:
print(f' Number of fMRI files : {len(fMRI_files)} \n Number of rows in phenotypic file: {len(stage_1)}')

 Number of fMRI files : 1102 
 Number of rows in phenotypic file: 1112


There are more subjects in the raw phenotypic data than there are fMRI files, these missing data pertains to fMRI scans that did not pass Quality control checks following preproccing in the CPAC pipeline. the subject with fMRI data will be removed.

In [122]:
present = []  # indexes of rows with a corresponding fMRI file
for i in range(len(fMRI_files)):
    here = re.search('5\d+\d', fMRI_files[
        i]).group()  # use regex, every subject ID start with 5 following by unique digit combination
    present.append(int(here))

absent = []  # missing rows
for i in range(len(stage_1)):
    if stage_1.loc[i, 'SUB_ID'] not in present:
        absent.append(i)
len(stage_1) - len(absent) == len(fMRI_files)

True

In [123]:
stage_1.iloc[absent]  # rows that did not pass quality control

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP
909,UCLA_1,910,51232,no_filename,15.79,1,1,1
910,UCLA_1,911,51233,no_filename,12.75,1,1,1
919,UCLA_1,920,51242,no_filename,14.14,2,1,1
920,UCLA_1,921,51243,no_filename,11.69,1,1,1
921,UCLA_1,922,51244,no_filename,13.09,1,1,1
922,UCLA_1,923,51245,no_filename,14.98,1,1,1
923,UCLA_1,924,51246,no_filename,13.1,1,1,1
924,UCLA_1,925,51247,no_filename,11.97,1,1,1
947,UCLA_1,948,51270,no_filename,11.08,1,0,2
979,UCLA_2,980,51310,no_filename,11.7,1,0,2


In [124]:
stage_1.drop(absent,inplace = True) # drop all rows with missing data, inplace
stage_1.reset_index(drop= True,inplace = True) # reset index

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stage_1.drop(absent,inplace = True) # drop all rows with missing data, inplace


In [125]:
raw.update_df(stage_1) # update object instance

In [126]:
len(raw.df) # it worked

1102

In [127]:
raw.df

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP
0,PITT,1,50002,no_filename,16.77,1,1,1
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1
...,...,...,...,...,...,...,...,...
1097,SBL,1108,51583,SBL_0051583,35.00,1,2,1
1098,SBL,1109,51584,SBL_0051584,49.00,1,2,1
1099,SBL,1110,51585,SBL_0051585,27.00,1,1,1
1100,MAX_MUN,1111,51606,MaxMun_a_0051606,29.00,2,2,1


In [128]:
raw.add_feature('FMRI_FILES', 'func_ABIDE', '.nii.gz') #use method `add_feature` to update class and add all filenames

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[feature_name] = feature_file_indexed


Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES
0,PITT,1,50002,no_filename,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz
...,...,...,...,...,...,...,...,...,...
1097,SBL,1108,51583,SBL_0051583,35.00,1,2,1,SBL_0051583_func_preproc.nii.gz
1098,SBL,1109,51584,SBL_0051584,49.00,1,2,1,SBL_0051584_func_preproc.nii.gz
1099,SBL,1110,51585,SBL_0051585,27.00,1,1,1,SBL_0051585_func_preproc.nii.gz
1100,MAX_MUN,1111,51606,MaxMun_a_0051606,29.00,2,2,1,MaxMun_a_0051606_func_preproc.nii.gz


In [129]:
stage_2 = raw.df
stage_2.head()

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES
0,PITT,1,50002,no_filename,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz


## Impute FILE_ID data

In [130]:
file_ids = stage_2['FILE_ID']

In [131]:
missing_index = []
for i,file in enumerate(file_ids):
    pattern = re.search('5\d+\d',file) # use regex to search if value contains a string matching subject id
    if pattern == None:
        missing_index.append(i)

In [132]:
stage_2.loc[missing_index,'FILE_ID'].unique() # all missing values designated as no_filename

array(['no_filename'], dtype=object)

In [133]:
len(missing_index)

67

In [134]:
stage_2.loc[missing_index,['FILE_ID', 'FMRI_FILES']] # all missing file_ids contain fmri filenames containg desired id

Unnamed: 0,FILE_ID,FMRI_FILES
0,no_filename,Pitt_0050002_func_preproc.nii.gz
63,no_filename,Olin_0050108_func_preproc.nii.gz
92,no_filename,Olin_0050137_func_preproc.nii.gz
104,no_filename,OHSU_0050155_func_preproc.nii.gz
114,no_filename,OHSU_0050165_func_preproc.nii.gz
...,...,...
1025,no_filename,MaxMun_a_0051367_func_preproc.nii.gz
1026,no_filename,MaxMun_a_0051368_func_preproc.nii.gz
1029,no_filename,MaxMun_a_0051371_func_preproc.nii.gz
1030,no_filename,MaxMun_a_0051372_func_preproc.nii.gz


In [135]:
for i in missing_index:
    correct_id = stage_2.loc[i,'FMRI_FILES'].replace('_func_preproc.nii.gz', '') # take subject id potion of the filenames
    stage_2.loc[i, 'FILE_ID'] = correct_id # change the file ids at indexes with missing values to correct id

In [136]:
stage_2.loc[missing_index] # it worked

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES
0,PITT,1,50002,Pitt_0050002,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz
63,OLIN,64,50108,Olin_0050108,21.00,1,0,2,Olin_0050108_func_preproc.nii.gz
92,OLIN,93,50137,Olin_0050137,20.00,1,0,1,Olin_0050137_func_preproc.nii.gz
104,OHSU,105,50155,OHSU_0050155,14.42,1,-9999,1,OHSU_0050155_func_preproc.nii.gz
114,OHSU,115,50165,OHSU_0050165,9.69,1,-9999,2,OHSU_0050165_func_preproc.nii.gz
...,...,...,...,...,...,...,...,...,...
1025,MAX_MUN,1036,51367,MaxMun_a_0051367,21.00,1,0,2,MaxMun_a_0051367_func_preproc.nii.gz
1026,MAX_MUN,1037,51368,MaxMun_a_0051368,28.00,2,0,2,MaxMun_a_0051368_func_preproc.nii.gz
1029,MAX_MUN,1040,51371,MaxMun_a_0051371,48.00,1,0,2,MaxMun_a_0051371_func_preproc.nii.gz
1030,MAX_MUN,1041,51372,MaxMun_a_0051372,46.00,2,0,2,MaxMun_a_0051372_func_preproc.nii.gz


In [137]:
raw.update_df(stage_2)

In [138]:
raw.df

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES
0,PITT,1,50002,Pitt_0050002,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz
...,...,...,...,...,...,...,...,...,...
1097,SBL,1108,51583,SBL_0051583,35.00,1,2,1,SBL_0051583_func_preproc.nii.gz
1098,SBL,1109,51584,SBL_0051584,49.00,1,2,1,SBL_0051584_func_preproc.nii.gz
1099,SBL,1110,51585,SBL_0051585,27.00,1,1,1,SBL_0051585_func_preproc.nii.gz
1100,MAX_MUN,1111,51606,MaxMun_a_0051606,29.00,2,2,1,MaxMun_a_0051606_func_preproc.nii.gz


## Adding filenames of CC200 atlas time series data
- subdirectory containing data: 'rois_cc200'
- feature name to be added to dataframe: 'CC200'
- file extension: '.1D'

In [139]:
raw.add_feature('CC200','rois_cc200', '.1D')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[feature_name] = feature_file_indexed


Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES,CC200
0,PITT,1,50002,Pitt_0050002,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz,Pitt_0050002_rois_cc200.1D
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz,Pitt_0050003_rois_cc200.1D
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz,Pitt_0050004_rois_cc200.1D
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz,Pitt_0050005_rois_cc200.1D
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz,Pitt_0050006_rois_cc200.1D
...,...,...,...,...,...,...,...,...,...,...
1097,SBL,1108,51583,SBL_0051583,35.00,1,2,1,SBL_0051583_func_preproc.nii.gz,SBL_0051583_rois_cc200.1D
1098,SBL,1109,51584,SBL_0051584,49.00,1,2,1,SBL_0051584_func_preproc.nii.gz,SBL_0051584_rois_cc200.1D
1099,SBL,1110,51585,SBL_0051585,27.00,1,1,1,SBL_0051585_func_preproc.nii.gz,SBL_0051585_rois_cc200.1D
1100,MAX_MUN,1111,51606,MaxMun_a_0051606,29.00,2,2,1,MaxMun_a_0051606_func_preproc.nii.gz,MaxMun_a_0051606_rois_cc200.1D


In [140]:
stage_3 = raw.df

In [141]:
stage_3

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES,CC200
0,PITT,1,50002,Pitt_0050002,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz,Pitt_0050002_rois_cc200.1D
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz,Pitt_0050003_rois_cc200.1D
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz,Pitt_0050004_rois_cc200.1D
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz,Pitt_0050005_rois_cc200.1D
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz,Pitt_0050006_rois_cc200.1D
...,...,...,...,...,...,...,...,...,...,...
1097,SBL,1108,51583,SBL_0051583,35.00,1,2,1,SBL_0051583_func_preproc.nii.gz,SBL_0051583_rois_cc200.1D
1098,SBL,1109,51584,SBL_0051584,49.00,1,2,1,SBL_0051584_func_preproc.nii.gz,SBL_0051584_rois_cc200.1D
1099,SBL,1110,51585,SBL_0051585,27.00,1,1,1,SBL_0051585_func_preproc.nii.gz,SBL_0051585_rois_cc200.1D
1100,MAX_MUN,1111,51606,MaxMun_a_0051606,29.00,2,2,1,MaxMun_a_0051606_func_preproc.nii.gz,MaxMun_a_0051606_rois_cc200.1D


In [142]:
raw.pheno_dir

'/Users/admin/Documents/Project/phenotype_files'

In [143]:
save_path = os.path.join(raw.pheno_dir, 'pheno_clean.csv') # path for saving this dataframe

In [144]:
stage_3.to_csv(save_path, index= False)

## Removing data with low temporal resolution
Referring to Notebook `EDA` there are fMRI files that contain less than 100 time points, this is lower than the required temporal resolution for calculating an accurate measure of dynamic correlation, so these shall be removed.


In [145]:
fMRI_dir = os.path.join(parent_dir, 'func_ABIDE') # subdirectory containing fMRI data

In [146]:
time_stage = AddFileNames(parent_dir, 'phenotype_files' ,'pheno_clean.csv') # create new instance of AddFileNames module

In [147]:
t_stage_df = time_stage.df

In [148]:
t_stage_df.head()

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES,CC200
0,PITT,1,50002,Pitt_0050002,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz,Pitt_0050002_rois_cc200.1D
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz,Pitt_0050003_rois_cc200.1D
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz,Pitt_0050004_rois_cc200.1D
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz,Pitt_0050005_rois_cc200.1D
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz,Pitt_0050006_rois_cc200.1D


In [149]:
fmri_files = t_stage_df['FMRI_FILES'] #list of fmri file names

In [150]:
times = []
for i in range(len(fmri_files)):
    x = os.path.join(fMRI_dir, fmri_files[i])# make path to fmri file
    y = nib.load(x) #load nifit file using nibabel
    dim = y.shape # get dimensions of fmri
    times.append(dim[-1])# get size of last dimension time

low_indexes = []
low_times = []
for i, time in enumerate(times):
    if times[i] < 116:
        low_indexes.append(i)
        low_times.append(time)

In [151]:
len(fmri_files),len(low_indexes)

(1102, 28)

In [152]:
np.average(low_times)

78.0

In [153]:
t_stage_df.loc[low_indexes] # The data with low temporal resolution

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES,CC200
93,OHSU,94,50142,OHSU_0050142,13.99,1,-9999,1,OHSU_0050142_func_preproc.nii.gz,OHSU_0050142_rois_cc200.1D
94,OHSU,95,50143,OHSU_0050143,13.79,1,-9999,1,OHSU_0050143_func_preproc.nii.gz,OHSU_0050143_rois_cc200.1D
95,OHSU,96,50144,OHSU_0050144,10.22,1,-9999,1,OHSU_0050144_func_preproc.nii.gz,OHSU_0050144_rois_cc200.1D
96,OHSU,97,50145,OHSU_0050145,10.75,1,-9999,1,OHSU_0050145_func_preproc.nii.gz,OHSU_0050145_rois_cc200.1D
97,OHSU,98,50146,OHSU_0050146,8.0,1,-9999,1,OHSU_0050146_func_preproc.nii.gz,OHSU_0050146_rois_cc200.1D
98,OHSU,99,50147,OHSU_0050147,11.35,1,-9999,1,OHSU_0050147_func_preproc.nii.gz,OHSU_0050147_rois_cc200.1D
99,OHSU,100,50148,OHSU_0050148,12.65,1,-9999,1,OHSU_0050148_func_preproc.nii.gz,OHSU_0050148_rois_cc200.1D
100,OHSU,101,50149,OHSU_0050149,12.32,1,-9999,1,OHSU_0050149_func_preproc.nii.gz,OHSU_0050149_rois_cc200.1D
101,OHSU,102,50150,OHSU_0050150,9.42,1,-9999,1,OHSU_0050150_func_preproc.nii.gz,OHSU_0050150_rois_cc200.1D
102,OHSU,103,50152,OHSU_0050152,9.73,1,-9999,1,OHSU_0050152_func_preproc.nii.gz,OHSU_0050152_rois_cc200.1D


In [154]:
t_stage_df.drop(low_indexes, inplace=True) # Drop the rows with low temporal resolution

In [155]:
t_stage_df.reset_index(inplace= True, drop=True)# reset index

In [156]:
save_path = os.path.join(time_stage.pheno_dir, 'pheno_clean.csv') # path for saving df as csv

In [157]:
t_stage_df.to_csv(save_path,index=False) # save altered dataframe as csv

## Adding pickle filenames containing DFC data

In [158]:
dfc_add = AddFileNames(parent_dir, 'phenotype_files' ,'pheno_clean.csv') # create new instance of AddFileNames module

In [159]:
dfc_add.df

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES,CC200
0,PITT,1,50002,Pitt_0050002,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz,Pitt_0050002_rois_cc200.1D
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz,Pitt_0050003_rois_cc200.1D
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz,Pitt_0050004_rois_cc200.1D
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz,Pitt_0050005_rois_cc200.1D
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz,Pitt_0050006_rois_cc200.1D
...,...,...,...,...,...,...,...,...,...,...
1069,SBL,1108,51583,SBL_0051583,35.00,1,2,1,SBL_0051583_func_preproc.nii.gz,SBL_0051583_rois_cc200.1D
1070,SBL,1109,51584,SBL_0051584,49.00,1,2,1,SBL_0051584_func_preproc.nii.gz,SBL_0051584_rois_cc200.1D
1071,SBL,1110,51585,SBL_0051585,27.00,1,1,1,SBL_0051585_func_preproc.nii.gz,SBL_0051585_rois_cc200.1D
1072,MAX_MUN,1111,51606,MaxMun_a_0051606,29.00,2,2,1,MaxMun_a_0051606_func_preproc.nii.gz,MaxMun_a_0051606_rois_cc200.1D


In [160]:
dfc_add.add_feature('DFC_DATA_STORE', 'dfc_cc200','.pkl')# add pickle files containing DFC data dictionaries

Unnamed: 0,SITE_ID,X,SUB_ID,FILE_ID,AGE_AT_SCAN,SEX,DSM_IV_TR,DX_GROUP,FMRI_FILES,CC200,DFC_DATA_STORE
0,PITT,1,50002,Pitt_0050002,16.77,1,1,1,Pitt_0050002_func_preproc.nii.gz,Pitt_0050002_rois_cc200.1D,Pitt_0050002_dfc.pkl
1,PITT,2,50003,Pitt_0050003,24.45,1,1,1,Pitt_0050003_func_preproc.nii.gz,Pitt_0050003_rois_cc200.1D,Pitt_0050003_dfc.pkl
2,PITT,3,50004,Pitt_0050004,19.09,1,1,1,Pitt_0050004_func_preproc.nii.gz,Pitt_0050004_rois_cc200.1D,Pitt_0050004_dfc.pkl
3,PITT,4,50005,Pitt_0050005,13.73,2,1,1,Pitt_0050005_func_preproc.nii.gz,Pitt_0050005_rois_cc200.1D,Pitt_0050005_dfc.pkl
4,PITT,5,50006,Pitt_0050006,13.37,1,1,1,Pitt_0050006_func_preproc.nii.gz,Pitt_0050006_rois_cc200.1D,Pitt_0050006_dfc.pkl
...,...,...,...,...,...,...,...,...,...,...,...
1069,SBL,1108,51583,SBL_0051583,35.00,1,2,1,SBL_0051583_func_preproc.nii.gz,SBL_0051583_rois_cc200.1D,SBL_0051583_dfc.pkl
1070,SBL,1109,51584,SBL_0051584,49.00,1,2,1,SBL_0051584_func_preproc.nii.gz,SBL_0051584_rois_cc200.1D,SBL_0051584_dfc.pkl
1071,SBL,1110,51585,SBL_0051585,27.00,1,1,1,SBL_0051585_func_preproc.nii.gz,SBL_0051585_rois_cc200.1D,SBL_0051585_dfc.pkl
1072,MAX_MUN,1111,51606,MaxMun_a_0051606,29.00,2,2,1,MaxMun_a_0051606_func_preproc.nii.gz,MaxMun_a_0051606_rois_cc200.1D,MaxMun_a_0051606_dfc.pkl


In [161]:
pkl_df = dfc_add.df

In [162]:
pkl_df = pkl_df.iloc[:,2:]

In [163]:
pkl_df.index

RangeIndex(start=0, stop=1074, step=1)

In [164]:
save_path = os.path.join(dfc_add.pheno_dir, 'pheno_clean.csv')  # path for saving df as csv
pkl_df.to_csv(save_path, index= False)  # save altered dataframe as csv