In [10]:
# Code used to generate the subject measures "vars" matrix for the HCP_1200 data
# NOTE: the output file (written as 'vars_478.txt') WILL include all 478 SMs (leaving blanks when necessary)

# Files needed:
# 1. the unrestricted behavioral data
# 2. the 'restricted' dataset
# 3. list of subjects in the HCP_1200 release (1003 people)
# 4. list of the SMs to be used

import numpy as np
import pandas as pd
from pandas import DataFrame
from numpy import genfromtxt
import os
import sys
from pprint import pprint

def _read_lines(path):
    """Return the lines of the text file at `path` without their trailing newline."""
    # The context manager closes the handle as soon as the list is built;
    # a bare open() inside a comprehension leaves the file open.
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


cwd = os.getcwd()

# The input and output folders sit next to the working directory:
# <cwd>/../inputs and <cwd>/../outputs.  This is the same location that the
# string "__file__/../../inputs" normalises to, written out explicitly.
inputs = os.path.abspath(os.path.join(cwd, os.pardir, "inputs"))
outputs = os.path.abspath(os.path.join(cwd, os.pardir, "outputs")) # NOTE CHANGE THIS TO YOUR DESIRED OUTPUT PATH!

subject_measures_fp = os.path.join(inputs, '478_SMs.txt')
subject_ids_fp = os.path.join(inputs, 'subjectIDs.txt')
behavioral_data_fp = os.path.join(inputs, 'unrestricted_1200_release.csv')
restricted_data_fp = os.path.join(inputs, 'restricted_1200_release.csv')
# rfMRI_data_fp = os.path.join(inputs, 'rfMRI_motion.txt')
# varsQconf_fp = os.path.join(inputs, 'varsQconf.txt')


# One entry per line: the subject-measure names and the subject IDs (both kept
# as strings).  The *_fp paths are already absolute, so they are used directly.
subject_measures = _read_lines(subject_measures_fp)
subjects = _read_lines(subject_ids_fp)

In [11]:
# Import "behavioral" and "restricted" datasets into Pandas dataframes
behavioral_data = pd.read_csv(behavioral_data_fp)
restricted_data = pd.read_csv(restricted_data_fp)

# Sanity check: print the shapes before and after filtering so it is easy to
# see that only the requested subjects were kept.
print('behavior shape before', behavioral_data.shape)
print('shape of restricted before', restricted_data.shape)

# Keep only the rows whose 'Subject' value appears in the subject-ID list.
# The IDs in `subjects` are strings read from a text file, whereas read_csv may
# parse the 'Subject' column as integers.  pandas' isin() treats 1 and '1' as
# different values, so the column is compared on its string form to make the
# match independent of the parsed dtype.
behavioral_data = behavioral_data[behavioral_data['Subject'].astype(str).isin(subjects)]
restricted_data = restricted_data[restricted_data['Subject'].astype(str).isin(subjects)]

print('behavior shape after', behavioral_data.shape)
print('shape of restricted after', restricted_data.shape)

behavior shape before (1206, 582)
shape of restricted before (1206, 201)
behavior shape after (1003, 582)
shape of restricted after (1003, 201)


In [12]:
# Normalise every name to lowercase so the requested measures can be matched
# against the CSV headers regardless of capitalisation.
subject_measures = [name.lower() for name in subject_measures]

behavioral_data.columns = behavioral_data.columns.str.lower()
restricted_data.columns = restricted_data.columns.str.lower()

# Plain-list copies of the (now lowercase) column headers of each dataset
behav_headers = behavioral_data.columns.tolist()
restrict_headers = restricted_data.columns.tolist()

In [13]:
# Requested measures that exist as columns in the unrestricted (behavioral)
# and in the restricted dataframe; these are the columns needed to compose vars.
overlap_in_behav, overlap_in_restrict = (
    np.intersect1d(subject_measures, headers)
    for headers in (behav_headers, restrict_headers)
)

In [14]:
# Keep every row, but only the columns that are requested subject measures
behavioral_data_filtered = behavioral_data.loc[:, overlap_in_behav]
restricted_data_filtered = restricted_data.loc[:, overlap_in_restrict]

In [15]:
# (rows, columns) of the behavioral subset: kept subjects x matched measures
behavioral_data_filtered.shape

(1003, 285)

In [16]:
# (rows, columns) of the restricted subset: kept subjects x matched measures
restricted_data_filtered.shape

(1003, 177)

In [19]:
# Total number of columns selected from the two datasets, computed from the
# frames themselves so the figure cannot go stale when the inputs change.
behavioral_data_filtered.shape[1] + restricted_data_filtered.shape[1]

462

In [20]:
# Fewer columns were selected than measures requested, so find the gaps:
# first the measures absent from each dataset taken on its own ...
missing_in_behav, missing_in_restrict = (
    np.setdiff1d(subject_measures, headers)
    for headers in (behav_headers, restrict_headers)
)
# ... then the measures that appear in neither dataset.
missing_in_behav_and_restrict = np.setdiff1d(missing_in_behav, restrict_headers)

In [21]:
# Measures found in neither CSV.  Their source is unknown, so they will just
# have to be blank columns in the final 'vars' matrix.
# NOTE(review): 'rfmri_motion' may correspond to the commented-out
# rfMRI_motion.txt input path — TODO confirm.
missing_in_behav_and_restrict

array(['asr_aggr_pct', 'asr_attn_pct', 'asr_intr_pct', 'asr_rule_pct',
       'asr_soma_pct', 'asr_thot_pct', 'asr_witd_pct', 'dsm_adh_pct',
       'dsm_antis_pct', 'dsm_anxi_pct', 'dsm_avoid_pct', 'dsm_depr_pct',
       'dsm_somp_pct', 'fs_intercranial_vol', 'quarter/release',
       'rfmri_motion', 'sex'], dtype='<U34')

In [22]:
def _index_by_subject(frame, subject_ids):
    """Return a copy of `frame` labelled by subject ID, in `subject_ids` order.

    When the frame carries a 'subject' column, rows are matched to the IDs by
    that column's value, so the labels stay correct even if the CSV rows are
    ordered differently from the subject-ID list.  Without such a column the
    IDs are assigned positionally.

    Raises:
        ValueError: if the rows and the listed IDs do not correspond one to one.
    """
    relabelled = frame.copy()
    if 'subject' not in relabelled.columns:
        # Nothing to align on; pandas raises ValueError on a length mismatch.
        relabelled.index = subject_ids
        return relabelled

    # IDs in `subject_ids` are strings, so label the rows with the string form.
    relabelled.index = relabelled['subject'].astype(str).tolist()
    missing = [sid for sid in subject_ids if sid not in relabelled.index]
    if missing or len(relabelled) != len(subject_ids):
        raise ValueError(
            "subject list and data rows do not match: %d IDs listed, %d rows, "
            "%d listed IDs without a row" % (len(subject_ids), len(relabelled), len(missing))
        )
    # Every ID is present, so this only reorders rows (no blank rows are added).
    return relabelled.reindex(subject_ids)


# concat the dataframes
# first relabel both of them by subject ID, in the order given by `subjects`
behavioral_data_filtered = _index_by_subject(behavioral_data_filtered, subjects)
restricted_data_filtered = _index_by_subject(restricted_data_filtered, subjects)

# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because the following cells refer to it.
vars = pd.concat([behavioral_data_filtered, restricted_data_filtered], axis = 1)

In [27]:
# The subject IDs already label the rows, so the 'subject' column(s) are
# redundant.  (Presumably the column is present in both source frames, which
# would leave duplicate column labels in the way of the reindex that follows —
# TODO confirm.)
# errors='ignore' keeps this cell re-runnable once the column is gone.
vars = vars.drop(columns='subject', errors='ignore')

In [28]:
# Order the columns exactly like the requested measure list; measures that
# were not found in either dataset become blank columns.
vars = vars.reindex(subject_measures, axis='columns')

In [29]:
# Check the final matrix: one row per subject, one column per requested measure
vars.shape

(1003, 478)

In [30]:
# Display the assembled matrix (rows are labelled by subject ID).
# NOTE(review): this renders per-subject values in the saved notebook; if any
# of them come from the restricted dataset, clear the output before sharing —
# TODO confirm against the data-use terms.
vars

Unnamed: 0,subject,quarter/release,sex,age,handedness,race,rfmri_motion,ssaga_employ,ssaga_income,ssaga_educ,...,neofac_e,noise_comp,odor_unadj,odor_ageadj,paininterf_tscore,taste_unadj,taste_ageadj,mars_log_score,mars_errs,mars_final
100206,,,,26-30,65,White,,2.0,4.0,16.0,...,32.0,6.0,108.79,97.19,49.7,72.63,72.03,1.84,0.0,1.84
100307,,,,26-30,95,White,,2.0,7.0,16.0,...,37.0,3.6,101.12,86.45,38.6,71.69,71.76,1.76,0.0,1.76
100408,,,,31-35,55,White,,2.0,7.0,16.0,...,33.0,2.0,108.79,98.04,52.6,114.01,113.59,1.76,2.0,1.68
100610,,,,26-30,85,White,,2.0,6.0,16.0,...,15.0,2.0,122.25,110.45,38.6,84.84,85.31,1.92,1.0,1.88
101006,,,,31-35,90,Black or African Am.,,2.0,3.0,12.0,...,29.0,6.0,122.25,111.41,38.6,123.80,123.31,1.80,0.0,1.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992673,,,,31-35,70,White,,0.0,3.0,14.0,...,25.0,3.6,122.25,111.41,38.6,101.63,99.26,1.80,0.0,1.80
992774,,,,31-35,100,White,,0.0,3.0,12.0,...,32.0,8.4,122.25,111.41,50.1,107.17,103.55,1.76,0.0,1.76
993675,,,,26-30,85,White,,2.0,3.0,16.0,...,24.0,0.4,122.25,110.45,38.6,84.07,84.25,1.80,1.0,1.76
994273,,,,26-30,60,White,,0.0,4.0,16.0,...,27.0,6.0,122.25,111.41,63.8,110.65,109.73,1.80,1.0,1.76


In [31]:
# Write the matrix to 'vars_478.txt' in the 'outputs' folder.  The folder is
# created first so that a freshly chosen output path does not make to_csv fail.
os.makedirs(outputs, exist_ok=True)
vars.to_csv(os.path.join(outputs, "vars_478.txt"))