JN by Minne Schepers

This notebook imports the behavioral data files from the HCP data and combines selected columns from these files into a new dataframe.

For further information on which columns contain which values, see:
LS2.0_Crosswalk_Behavioral_Data_Dictionary available at https://wiki.humanconnectome.org/display/PublicData/HCP-YA+Data+Dictionary-+Updated+for+the+1200+Subject+Release#HCPYADataDictionaryUpdatedforthe1200SubjectRelease-Instrument:Self-regulation/Impulsivity(DelayDiscounting)

In [689]:
import pandas as pd
import numpy as np

In [690]:
# Import all different dataframes from the HCP data.
# Every table sits in the same download folder and is read the same way, so the
# folder is named once and a small helper does the reading.
DATA_DIR = "/Users/minneschepers/Downloads/Package_1195305"


def _read_hcp_table(table_name):
    """Read ``<DATA_DIR>/<table_name>.txt`` as a tab-separated DataFrame.

    The first line of the file supplies the column names (``header=0``) and the
    first row after it is dropped by the ``[1:]`` slice.
    NOTE(review): presumably that dropped row holds column descriptions rather
    than subject data -- confirm against the raw files.
    """
    return pd.read_csv(f"{DATA_DIR}/{table_name}.txt", sep='\t', header=0)[1:]


moca = _read_hcp_table("moca01")
cogcomp = _read_hcp_table("cogcomp01")
dccs = _read_hcp_table("dccs01")
deldisk = _read_hcp_table("deldisk01")
flanker = _read_hcp_table("flanker01")
lswmt = _read_hcp_table("lswmt01")
orrt = _read_hcp_table("orrt01")
pcps = _read_hcp_table("pcps01")
psm = _read_hcp_table("psm01")
tpvt = _read_hcp_table("tpvt01")

In [691]:
# From all dataframes include the subject ID, age and sex of every available subject
# by stacking those three columns from every table and then dropping the duplicates

all_dfs = [moca, cogcomp, dccs, deldisk, flanker, lswmt, orrt, pcps, psm, tpvt]

to_include = ['src_subject_id', 'sex', 'interview_age']

# DataFrame.append was removed in pandas 2.0. A single pd.concat yields the same
# stacked frame (same row order, fresh 0..n-1 index) and avoids re-copying the
# accumulated rows on every iteration.
df = pd.concat([table[to_include] for table in all_dfs], ignore_index=True)

# A row counts as a duplicate only when ID, sex and age are all identical.
# NOTE(review): a subject recorded with different ages in two tables would keep
# two rows here -- confirm that does not occur in this dataset.
df = df.drop_duplicates()
print(df.shape)

(725, 3)


In [692]:
# Convert age to age in years and include in df

# Value used in the age column to flag a missing age; it is copied through
# unchanged so it is still recognisable as the missing code afterwards.
MISSING_AGE_CODE = 999

df['interview_age'] = df['interview_age'].astype(int)
age_months = df['interview_age']

# Whole years = months / 12 truncated towards zero (the same rounding as
# int(months / 12)), computed for the whole column at once instead of in a
# Python loop. Rows holding the missing code keep that code.
df['age_years'] = np.where(age_months != MISSING_AGE_CODE,
                           (age_months / 12).astype(int),
                           age_months)

df = df.rename(columns={'interview_age': 'age_months',
                        'src_subject_id': 'subject'})

In [693]:
# Empty the column-selection list again; no later visible cell reads it.
to_include = []

In [694]:
# Order the rows by subject ID, then renumber the index from 0 so that it
# follows the sorted order (the old index is discarded, not kept as a column)
df = df.sort_values("subject").reset_index(drop=True)

In [695]:
# Define function which takes column from other df and adds it to new df
# For all non-available values a NaN value is inserted

def add_to_df(source_df, column_name):
    """Copy one column of ``source_df`` into the global ``df``, matched by subject.

    Every value of ``df['subject']`` is looked up in
    ``source_df['src_subject_id']``. Subjects that do not occur in
    ``source_df`` receive NaN. When a subject has several rows in
    ``source_df``, the row that comes first after sorting ``source_df`` by
    ``column_name`` is the one used.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Table holding at least ``src_subject_id`` and ``column_name``.
        It is not modified.
    column_name : str
        Column to copy; it is created (or overwritten) in ``df`` under the
        same name.

    Returns
    -------
    None
        The global ``df`` is updated in place.
    """
    # One row per subject: sort first so that "first row" means the same thing
    # as before, then keep only that first row for every subject ID.
    first_row_per_subject = (
        source_df.sort_values(column_name)
        .drop_duplicates(subset="src_subject_id", keep="first")
        # drop=False keeps src_subject_id selectable as a column as well
        .set_index("src_subject_id", drop=False)[column_name]
    )

    # A single keyed lookup replaces scanning the whole source table once per
    # subject; subjects without a match come back as NaN.
    df[column_name] = df["subject"].map(first_row_per_subject)

In [696]:
# Names of the columns to copy into the combined DataFrame, one list per
# source table (list name = table name + "_columns")

moca_columns = [
    'moca_total',
    'moca_edu',
]

cogcomp_columns = [
    'nih_fluidcogcomp_unadjusted',
    'nih_fluidcogcomp_ageadjusted',
    'nih_fluidcogcomp_np',
    'nih_crycogcomp_unadjusted',
    'nih_crycogcomp_ageadjusted',
    'nih_crystalcogcomp_np',
    'nih_eccogcomp_unadjusted',
    'nih_eccogcomp_ageadjusted',
    'nih_earlychildcogcomp_np',
    'nih_totalcogcomp_unadjusted',
    'nih_totalcogcomp_ageadjusted',
    'nih_totalcogcomp_np',
]

deldisk_columns = [
    'auc_200',
    'auc_40000',
]

dccs_columns = [
    'nih_dccs_unadjusted',
    'nih_dccs_ageadjusted',
    'nih_dccs_natperc',
]

flanker_columns = [
    'nih_flanker_unadjusted',
    'nih_flanker_ageadjusted',
    'nih_flanker_natperc',
]

lswmt_columns = [
    'uss',
    'age_corrected_standard_score',
    'natl_percentile__age_adjusted_',
]

orrt_columns = [
    'tbx_reading_score',
    'read_acss',
    'read_perc',
]

pcps_columns = [
    'nih_patterncomp_unadjusted',
    'nih_patterncomp_ageadjusted',
    'nih_patterncomp_natperc',
]

psm_columns = [
    'nih_picseq_unadjusted',
    'nih_picseq_ageadjusted',
    'nih_picseq_natperc',
]

tpvt_columns = [
    'tpvt_acss',
    'tpvt_uss',
    'tpvt_perc',
]

In [697]:
# Append all columns above to our new df using function defined above

# Each source table paired with the columns to copy from it. The order of this
# list determines the order in which the columns are added to df.
columns_by_table = [
    (moca, moca_columns),
    (cogcomp, cogcomp_columns),
    (dccs, dccs_columns),
    (deldisk, deldisk_columns),
    (flanker, flanker_columns),
    (lswmt, lswmt_columns),
    (orrt, orrt_columns),
    (pcps, pcps_columns),
    (psm, psm_columns),
    (tpvt, tpvt_columns),
]

# add_to_df is called purely for its effect on df and returns None, so plain
# loops are used instead of list comprehensions that would only collect Nones.
for source_table, column_names in columns_by_table:
    for column in column_names:
        add_to_df(source_df=source_table, column_name=column)

[None, None, None]

In [699]:
# Rename columns in a structural way

df = df.rename(columns={
    'moca_total':'moca_total_score',
    'moca_edu':'years_of_education',
    'nih_fluidcogcomp_unadjusted':'fluidcog_unadjusted',
    'nih_fluidcogcomp_ageadjusted':'fluidcog_age',
    'nih_fluidcogcomp_np': 'fluidcog_np',
    'nih_crycogcomp_unadjusted':'crystalcog_unadjusted',
    'nih_crycogcomp_ageadjusted': 'crystalcog_age',
    'nih_crystalcogcomp_np': 'crystalcog_np',
    'nih_eccogcomp_unadjusted':'earlychildcog_unadjusted',
    'nih_eccogcomp_ageadjusted':'earlychildcog_age',
    'nih_earlychildcogcomp_np':'earlychildcog_np',
    'nih_totalcogcomp_unadjusted':'totalcog_unadjusted',
    'nih_totalcogcomp_ageadjusted':'totalcog_age',
    'nih_totalcogcomp_np':'totalcog_np',
    'nih_dccs_unadjusted':'dccs_unadjusted',
    'nih_dccs_ageadjusted':'dccs_age',
    'nih_dccs_natperc':'dccs_np',
    'auc_200':'deldisc_auc200',
    'auc_40000':'deldisc_auc40000',
    'nih_flanker_unadjusted':'flanker_unadjusted',
    'nih_flanker_ageadjusted':'flanker_age',
    'nih_flanker_natperc':'flanker_np',
    'uss':'lswmt_unadjusted',
    'age_corrected_standard_score':'lswmt_age',
    'natl_percentile__age_adjusted_':'lswmt_np',
    'tbx_reading_score':'orrt_unadjusted',
    'read_acss':'orrt_age',
    'read_perc':'orrt_np',
    'nih_patterncomp_unadjusted':'pcps_unadjusted',
    'nih_patterncomp_ageadjusted':'pcps_age',
    'nih_patterncomp_natperc':'pcps_np',
    'nih_picseq_unadjusted':'picseq_unadjusted',
    'nih_picseq_ageadjusted':'picseq_age',
    'nih_picseq_natperc':'picseq_np',
    'tpvt_acss':'pvt_age',
    'tpvt_uss':'pvt_unadjusted',
    'tpvt_perc':'pvt_np'})

In [700]:
# Check shape: display (number of rows, number of columns) of the combined df

df.shape

(725, 41)

In [701]:
# Check all columns: display the column names of df as they are after renaming

df.columns

Index(['subject', 'sex', 'age_months', 'age_years', 'moca_total_score',
       'years_of_education', 'fluidcog_unadjusted', 'fluidcog_age',
       'fluidcog_np', 'crystalcog_unadjusted', 'crystalcog_age',
       'crystalcog_np', 'earlychildcog_unadjusted', 'earlychildcog_age',
       'earlychildcog_np', 'totalcog_unadjusted', 'totalcog_age',
       'totalcog_np', 'dccs_unadjusted', 'dccs_age', 'dccs_np',
       'deldisc_auc200', 'deldisc_auc40000', 'flanker_unadjusted',
       'flanker_age', 'flanker_np', 'lswmt_unadjusted', 'lswmt_age',
       'lswmt_np', 'orrt_unadjusted', 'orrt_age', 'orrt_np', 'pcps_unadjusted',
       'pcps_age', 'pcps_np', 'picseq_unadjusted', 'picseq_age', 'picseq_np',
       'pvt_age', 'pvt_unadjusted', 'pvt_np'],
      dtype='object')

In [702]:
# Replace all 999 values which indicate missing with NaN values.
# The code is handled both as the text '999' and as the number 999.
# NOTE(review): any genuine value equal to 999 is blanked as well -- confirm
# that none of the columns can legitimately contain it.
for missing_code in ('999', 999):
    df = df.replace(missing_code, np.nan)

# Show the number of available subjects per column
df.count()

subject                     725
sex                         725
age_months                  724
age_years                   724
moca_total_score            725
years_of_education          724
fluidcog_unadjusted         616
fluidcog_age                609
fluidcog_np                 580
crystalcog_unadjusted       616
crystalcog_age              607
crystalcog_np               578
earlychildcog_unadjusted    615
earlychildcog_age           608
earlychildcog_np            579
totalcog_unadjusted         615
totalcog_age                607
totalcog_np                 578
dccs_unadjusted             619
dccs_age                    611
dccs_np                     299
deldisc_auc200              719
deldisc_auc40000            719
flanker_unadjusted          619
flanker_age                 611
flanker_np                  299
lswmt_unadjusted            620
lswmt_age                   612
lswmt_np                    299
orrt_unadjusted             620
orrt_age                    611
orrt_np 

In [703]:
# Write the combined dataframe to a csv file; index=False leaves the row index
# out of the file

export_path = '/Users/minneschepers/Downloads/HCP_Aging_cog.csv'
df.to_csv(export_path, index=False)

In [704]:
# Read the exported csv file back in and preview its first rows as a check
# that the export can be loaded again

exported_csv = '/Users/minneschepers/Downloads/HCP_Aging_cog.csv'
df_test = pd.read_csv(exported_csv)
df_test.head()

Unnamed: 0,subject,sex,age_months,age_years,moca_total_score,years_of_education,fluidcog_unadjusted,fluidcog_age,fluidcog_np,crystalcog_unadjusted,...,orrt_np,pcps_unadjusted,pcps_age,pcps_np,picseq_unadjusted,picseq_age,picseq_np,pvt_age,pvt_unadjusted,pvt_np
0,HCA6002236,F,558.0,46.0,25,18.0,111.0,111.0,77.0,111.0,...,63.0,109.0,111.0,76.0,120.0,122.0,93.0,114.0,113.0,83.0
1,HCA6010538,M,779.0,64.0,25,19.0,90.0,92.0,29.0,123.0,...,,97.0,103.0,,94.0,96.0,,126.0,126.0,
2,HCA6018857,F,436.0,36.0,27,12.0,108.0,104.0,,108.0,...,,111.0,105.0,,103.0,100.0,,112.0,110.0,
3,HCA6030645,F,544.0,45.0,25,18.0,91.0,83.0,12.0,103.0,...,,103.0,104.0,,91.0,93.0,,94.0,102.0,
4,HCA6047359,M,640.0,53.0,29,21.0,111.0,115.0,84.0,123.0,...,95.0,111.0,115.0,84.0,123.0,132.0,98.0,128.0,127.0,97.0
