#### **Creating a demographic table for the 2-year and 4-year follow-up in the ABCD study stratified by sex.**

This section builds the classic demographic table for each follow-up in the ABCD study. It pools data from
the 2-year and 4-year follow-ups and computes the mean, standard deviation (std), percentage, or count, as
appropriate. The output is a roughly formatted Excel table. Final formatting should be done by hand in
Excel/Word, but the raw values should remain unchanged.

**The following variables (stratified by sex) will be incorporated in the final table:**

1. N (%) 
1. Age in months, mean (std)
1. Race/Ethnicity, count (%)
1. Highest parental education, count (%)
1. Family income (USD), count (%)
1. Cognitive and Behavioral scores, mean (std)

In [2]:
# Imports
import os

import numpy as np
import pandas as pd

from neurostatx.io.utils import load_df_in_any_format

In [4]:
# Setting up relevant paths. Every line tagged "CHANGE THIS" is machine-specific
# and has to be edited before the notebook is run somewhere else.
repository_path = "/Users/anthonygagnon/code/Article-s-Code/" # CHANGE THIS
abcd_base_path = "/Volumes/T7/CCPM/ABCD/Release_5.1/abcd-data-release-5.1/" # CHANGE THIS
geste_base_dir = "/Volumes/T7/CCPM/GESTE/" # CHANGE THIS
banda_dir = '/Volumes/T7/CCPM/BANDA/BANDARelease1.1/' # CHANGE THIS
output_folder = "/Volumes/T7/CCPM/RESULTS_JUNE_24/" # CHANGE THIS

# Both working directories are derived from output_folder.
data_dir = f"{output_folder}/LongitudinalProfiles/datapreprocessing"
output_dir = f"{output_folder}/LongitudinalProfiles/demographics"

# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory is already there, so no separate existence
# check (which could race with another process) is needed.
os.makedirs(output_dir, exist_ok=True)

In [14]:
# Read the preprocessed 2-year follow-up ABCD spreadsheet from data_dir.
abcd_2y = load_df_in_any_format(
    f'{data_dir}/abcd_2y_preprocessed.xlsx'
)


In [15]:
# Split the 2-year follow-up sample by sex code (1 -> male frame, 2 -> female frame).
abcd_2y_male, abcd_2y_female = (
    abcd_2y.loc[abcd_2y["Sex"] == sex_code] for sex_code in (1, 2)
)

# Per-sex frequency of each ethnicity code.
abcd_2y_ethn_m, abcd_2y_ethn_f = (
    subset["Ethnicity"].value_counts() for subset in (abcd_2y_male, abcd_2y_female)
)

# Per-sex frequency of each parental education code.
abcd_2y_male_edu, abcd_2y_female_edu = (
    subset["ParentalEducation"].value_counts() for subset in (abcd_2y_male, abcd_2y_female)
)

# Per-sex frequency of each income group code.
abcd_2y_male_inc, abcd_2y_female_inc = (
    subset["IncomeGroups"].value_counts() for subset in (abcd_2y_male, abcd_2y_female)
)

In [16]:
# Build the descriptive statistics of the 2-year follow-up, one list per sex.
# Every row is a two-element list: [count, %] for N and the categorical rows,
# [mean, std] for age and the cognitive/behavioral scores.
def _describe_stratum(stratum, ethnicity_counts, education_counts, income_counts, total_n):
    """Return the ordered demographic rows for one sex stratum.

    Parameters
    ----------
    stratum : pd.DataFrame
        Rows of the follow-up sample belonging to a single sex. Must contain
        the AgeMonths, Internalizing, Externalizing, Stress, VA, EFPS and MEM
        columns.
    ethnicity_counts, education_counts, income_counts : pd.Series
        ``value_counts()`` of the matching categorical column of ``stratum``,
        indexed by category code.
    total_n : int
        Number of rows in the whole follow-up sample; denominator of every
        percentage.

    Returns
    -------
    list of list
        21 ``[value, value]`` pairs in this order: N, age, ethnicity codes 1-5,
        parental education codes 1-5, income group codes 1-3, then the six
        cognitive/behavioral scores.
    """
    # NOTE(review): percentages are relative to the whole follow-up sample
    # (both sexes), not to the size of the stratum -- confirm this is the
    # intended denominator for the published table.
    def _pct(count):
        if total_n == 0:
            return float("nan")
        return float(np.round(count * 100 / total_n, 2))

    def _count_row(counts, code):
        # A category with no participant in this stratum is missing from
        # value_counts(); report it as 0 instead of raising a KeyError.
        n = int(counts.loc[code]) if code in counts.index else 0
        return [n, _pct(n)]

    def _mean_std_row(column):
        values = stratum[column]
        # Plain Python floats keep the textual form of the [mean, std] pair
        # independent of how the installed numpy prints its scalars.
        return [float(np.round(values.mean(), 2)), float(np.round(values.std(), 2))]

    # Number of participants = number of rows. Counting the non-null values of
    # the first column would under-report N if that column had missing entries.
    n_stratum = len(stratum)

    rows = [[n_stratum, _pct(n_stratum)], _mean_std_row("AgeMonths")]
    rows += [_count_row(ethnicity_counts, code) for code in range(1, 6)]
    rows += [_count_row(education_counts, code) for code in range(1, 6)]
    rows += [_count_row(income_counts, code) for code in range(1, 4)]
    rows += [
        _mean_std_row(score)
        for score in ("Internalizing", "Externalizing", "Stress", "VA", "EFPS", "MEM")
    ]
    return rows


male_2y_desc = _describe_stratum(
    abcd_2y_male, abcd_2y_ethn_m, abcd_2y_male_edu, abcd_2y_male_inc, abcd_2y.shape[0]
)

female_2y_desc = _describe_stratum(
    abcd_2y_female, abcd_2y_ethn_f, abcd_2y_female_edu, abcd_2y_female_inc, abcd_2y.shape[0]
)

In [17]:
# Read the preprocessed 4-year follow-up ABCD spreadsheet from data_dir.
abcd_4y = load_df_in_any_format(
    f'{data_dir}/abcd_4y_preprocessed.xlsx'
)

In [18]:
# Split the 4-year follow-up sample by sex code (1 -> male frame, 2 -> female frame).
abcd_4y_male, abcd_4y_female = (
    abcd_4y.loc[abcd_4y["Sex"] == sex_code] for sex_code in (1, 2)
)

# Per-sex frequency of each ethnicity code.
abcd_4y_ethn_m, abcd_4y_ethn_f = (
    subset["Ethnicity"].value_counts() for subset in (abcd_4y_male, abcd_4y_female)
)

# Per-sex frequency of each parental education code.
abcd_4y_male_edu, abcd_4y_female_edu = (
    subset["ParentalEducation"].value_counts() for subset in (abcd_4y_male, abcd_4y_female)
)

# Per-sex frequency of each income group code.
abcd_4y_male_inc, abcd_4y_female_inc = (
    subset["IncomeGroups"].value_counts() for subset in (abcd_4y_male, abcd_4y_female)
)

In [19]:
# Build the descriptive statistics of the 4-year follow-up, one list per sex.
# Every row is a two-element list: [count, %] for N and the categorical rows,
# [mean, std] for age and the cognitive/behavioral scores.
def _describe_stratum(stratum, ethnicity_counts, education_counts, income_counts, total_n):
    """Return the ordered demographic rows for one sex stratum.

    Parameters
    ----------
    stratum : pd.DataFrame
        Rows of the follow-up sample belonging to a single sex. Must contain
        the AgeMonths, Internalizing, Externalizing, Stress, VA, EFPS and MEM
        columns.
    ethnicity_counts, education_counts, income_counts : pd.Series
        ``value_counts()`` of the matching categorical column of ``stratum``,
        indexed by category code.
    total_n : int
        Number of rows in the whole follow-up sample; denominator of every
        percentage.

    Returns
    -------
    list of list
        21 ``[value, value]`` pairs in this order: N, age, ethnicity codes 1-5,
        parental education codes 1-5, income group codes 1-3, then the six
        cognitive/behavioral scores.
    """
    # NOTE(review): percentages are relative to the whole follow-up sample
    # (both sexes), not to the size of the stratum -- confirm this is the
    # intended denominator for the published table.
    def _pct(count):
        if total_n == 0:
            return float("nan")
        return float(np.round(count * 100 / total_n, 2))

    def _count_row(counts, code):
        # A category with no participant in this stratum is missing from
        # value_counts(); report it as 0 instead of raising a KeyError.
        n = int(counts.loc[code]) if code in counts.index else 0
        return [n, _pct(n)]

    def _mean_std_row(column):
        values = stratum[column]
        # Plain Python floats keep the textual form of the [mean, std] pair
        # independent of how the installed numpy prints its scalars.
        return [float(np.round(values.mean(), 2)), float(np.round(values.std(), 2))]

    # Number of participants = number of rows. Counting the non-null values of
    # the first column would under-report N if that column had missing entries.
    n_stratum = len(stratum)

    rows = [[n_stratum, _pct(n_stratum)], _mean_std_row("AgeMonths")]
    rows += [_count_row(ethnicity_counts, code) for code in range(1, 6)]
    rows += [_count_row(education_counts, code) for code in range(1, 6)]
    rows += [_count_row(income_counts, code) for code in range(1, 4)]
    rows += [
        _mean_std_row(score)
        for score in ("Internalizing", "Externalizing", "Stress", "VA", "EFPS", "MEM")
    ]
    return rows


male_4y_desc = _describe_stratum(
    abcd_4y_male, abcd_4y_ethn_m, abcd_4y_male_edu, abcd_4y_male_inc, abcd_4y.shape[0]
)

female_4y_desc = _describe_stratum(
    abcd_4y_female, abcd_4y_ethn_f, abcd_4y_female_edu, abcd_4y_female_inc, abcd_4y.shape[0]
)

In [21]:
# Assemble the four per-sex, per-follow-up summaries into a single table
# (one column per summary, one labelled row per statistic) and export it.
demo_table = pd.DataFrame(
    dict(
        abcd_2y_male=male_2y_desc,
        abcd_2y_female=female_2y_desc,
        abcd_4y_male=male_4y_desc,
        abcd_4y_female=female_4y_desc,
    ),
    index=[
        # Sample size and age.
        "N",
        "Age (months)",
        # Ethnicity codes 1-5.
        "White",
        "Black or African American",
        "Hispanic or Latino",
        "Asian",
        "Other",
        # Parental education codes 1-5.
        "No Highschool",
        "Highschool, GED, or equivalent",
        "Some college",
        "Bachelor Degree",
        "Postgraduate Degree",
        # Income group codes 1-3.
        "Income < 50 000$USD",
        "Income 50 000-100 000$USD",
        "Income > 100 000$USD",
        # Cognitive and behavioral scores.
        "Internalization (mean)",
        "Externalization (mean)",
        "Stress (mean)",
        "VA (mean)",
        "EFPS (mean)",
        "MEM (mean)",
    ],
)

# Write the table, keeping both the row labels and the column names.
demo_table.to_excel(f'{output_dir}/demo_table_followups.xlsx', index=True, header=True)