#### **Creating a demographic table including each study stratified by sex.**

Creating the classic demographic table (Table 1 in Gagnon et al., 2025). This section summarizes each
study (ABCD, BANDA and GESTE) separately and computes the mean, std, % or count where appropriate. The output is a
roughly formatted Excel table. Final formatting should be done by hand in Excel/Word, but the raw values
should stay the same.

**The following variables (stratified by sex) will be incorporated in the final table:**

1. N (%) 
1. Age in months, mean (std)
1. Race/Ethnicity, count (%)
1. Highest parental education, count (%)
1. Familial Income (USD$), count (%)
1. Psychopathology, count (%)
1. Cognitive and Behavioral scores, mean (std)

In [26]:
# Imports
import os

import numpy as np
import pandas as pd

from neurostatx.io.utils import load_df_in_any_format

In [3]:
# Setting up relevant paths. Every line flagged "CHANGE THIS" is machine-specific
# and must be edited before running the notebook.
repository_path = "/Users/anthonygagnon/code/Article-s-Code/" # CHANGE THIS
abcd_base_path = "/Volumes/T7/CCPM/ABCD/Release_5.1/abcd-data-release-5.1/" # CHANGE THIS
geste_base_dir = "/Volumes/T7/CCPM/GESTE/" # CHANGE THIS
banda_dir = '/Volumes/T7/CCPM/BANDA/BANDARelease1.1/' # CHANGE THIS
output_folder = "/Volumes/T7/CCPM/RESULTS_JUNE_24/" # CHANGE THIS

# Derived locations: preprocessed inputs are read from data_dir, and the
# demographic table is written to output_dir.
data_dir = f"{output_folder}/preprocessing/"
output_dir = f"{output_folder}/demographicstable/"

# Create the output directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs(output_dir, exist_ok=True)

In [30]:
# Read the preprocessed ABCD dataframe from the preprocessing folder.
abcd_xlsx = f'{data_dir}/abcd_data_preprocessed.xlsx'
abcd_data = load_df_in_any_format(abcd_xlsx)

# The preview below stays disabled: the data is protected by a data use agreement.
#abcd_data.head()

In [8]:
# Split the ABCD cohort by sex (Sex == 1 feeds the male column, Sex == 2 the female column).
abcd_male = abcd_data.loc[abcd_data["Sex"] == 1]
abcd_female = abcd_data.loc[abcd_data["Sex"] == 2]

# Per-category counts among ABCD males: ethnicity, parental education, income.
abcd_ethn_m = abcd_male["Ethnicity"].value_counts()
abcd_male_edu = abcd_male["edu_groups"].value_counts()
abcd_male_inc = abcd_male["income_groups"].value_counts()

# Same per-category counts among ABCD females.
abcd_ethn_f = abcd_female["Ethnicity"].value_counts()
abcd_female_edu = abcd_female["edu_groups"].value_counts()
abcd_female_inc = abcd_female["income_groups"].value_counts()

In [10]:
# Build the ABCD columns of the demographic table (27 rows per sex).
def _abcd_sex_column(group, ethnicity, education, income, total):
    """Return the demographic-table rows for one sex of the ABCD cohort.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of ``abcd_data`` belonging to a single sex.
    ethnicity, education, income : pd.Series
        ``value_counts()`` of the matching categorical column within ``group``.
    total : int
        Number of rows in the full cohort. It is the denominator of every
        percentage, so percentages are relative to both sexes combined.

    Returns
    -------
    list
        ``[count, percent]`` pairs for categorical rows and ``[mean, std]``
        pairs for continuous rows, in the row order of the final table.

    Raises
    ------
    KeyError
        If an expected category label is absent from one of the counts.
    IndexError
        If a diagnosis column holds fewer than two distinct values.
    """
    def count_pct(count):
        # Count and its share of the whole cohort, rounded to 2 decimals.
        return [count, np.round(count * 100 / total, 2)]

    def mean_std(values):
        return [np.round(values.mean(), 2), np.round(values.std(), 2)]

    # N is taken as the number of non-missing entries in the first column.
    rows = [count_pct(group.count().iloc[0]), mean_std(group["AgeMonths"])]
    rows += [count_pct(ethnicity.loc[label]) for label in (1, 2, 3, 4, 5)]
    rows += [count_pct(education.loc[label]) for label in (1, 2, 3, 4, 5)]
    rows += [count_pct(income.loc[label]) for label in (1, 2, 3)]
    # NOTE(review): value_counts() sorts by descending frequency, so .iloc[1]
    # is the count of the second most frequent value. That is the number of
    # diagnosed participants only if the column is binary and the diagnosis is
    # the minority class -- confirm, or select the positive label explicitly.
    rows += [
        count_pct(group[diagnosis].value_counts().iloc[1])
        for diagnosis in ("AD", "ADHD", "CD", "DD", "OCD", "ODD")
    ]
    rows += [
        mean_std(group[score])
        for score in ("Internalization", "Externalization", "Stress", "VA", "EFPS", "MEM")
    ]
    return rows


male_desc = _abcd_sex_column(
    abcd_male, abcd_ethn_m, abcd_male_edu, abcd_male_inc, abcd_data.shape[0]
)

female_desc = _abcd_sex_column(
    abcd_female, abcd_ethn_f, abcd_female_edu, abcd_female_inc, abcd_data.shape[0]
)

In [29]:
# Read the preprocessed BANDA dataframe from the preprocessing folder.
banda_xlsx = f'{data_dir}/banda_data_preprocessed.xlsx'
banda_data = load_df_in_any_format(banda_xlsx)

# The preview below stays disabled: the data is protected by a data use agreement.
#banda_data.head()

In [20]:
# Split BANDA into male (Sex == 1) and female (Sex == 2), then compute statistics.
banda_male = banda_data[banda_data.Sex == 1]
banda_female = banda_data[banda_data.Sex == 2]


def _banda_sex_column(group, total):
    """Return the demographic-table rows for one sex of the BANDA cohort.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of ``banda_data`` belonging to a single sex.
    total : int
        Number of rows in the full cohort. It is the denominator of every
        percentage, so percentages are relative to both sexes combined.

    Returns
    -------
    list
        27 rows in the order of the final table: ``[count, percent]`` or
        ``[mean, std]`` pairs, with ``'-'`` for rows not reported for BANDA
        (first two education rows, income, CD).

    Raises
    ------
    KeyError
        If an expected category label is absent from the counts.
    IndexError
        If a diagnosis column holds fewer than two distinct values.
    """
    def count_pct(count):
        # Count and its share of the whole cohort, rounded to 2 decimals.
        return [count, np.round(count * 100 / total, 2)]

    def mean_std(values):
        return [np.round(values.mean(), 2), np.round(values.std(), 2)]

    def diagnosed(name):
        # NOTE(review): value_counts() sorts by descending frequency, so
        # .iloc[1] is the count of the second most frequent value. It equals
        # the number of diagnosed participants only if the column is binary
        # and the diagnosis is the minority class -- confirm for BANDA, or
        # select the positive label explicitly.
        return count_pct(group[name].value_counts().iloc[1])

    # Compute each categorical count once instead of once per table cell.
    ethnicity = group["Ethnicity"].value_counts()
    education = group["edu_groups"].value_counts()

    # N is taken as the number of non-missing entries in the first column.
    rows = [count_pct(group.count().iloc[0]), mean_std(group["AgeMonths"])]
    rows += [count_pct(ethnicity.loc[label]) for label in (1, 2, 3, 4, 5)]
    # Education: the first two table rows are left blank; labels 4, 5 and 6
    # fill the "Some college", "Bachelor" and "Postgraduate" rows.
    rows += ['-', '-']
    rows += [count_pct(education.loc[label]) for label in (4, 5, 6)]
    # Income rows are left blank for BANDA.
    rows += ['-', '-', '-']
    # Diagnoses: the CD row is left blank for BANDA.
    rows += [diagnosed("AD"), diagnosed("ADHD"), '-']
    rows += [diagnosed("DD"), diagnosed("OCD"), diagnosed("ODD")]
    rows += [
        mean_std(group[score])
        for score in ("Internalization", "Externalization", "Stress", "VA", "EFPS", "MEM")
    ]
    return rows


# Compute various stats.
banda_male_desc = _banda_sex_column(banda_male, banda_data.shape[0])
banda_female_desc = _banda_sex_column(banda_female, banda_data.shape[0])

In [28]:
# Read the preprocessed GESTE dataframe from the preprocessing folder.
geste_xlsx = f'{data_dir}/geste_data_preprocessed.xlsx'
geste_data = load_df_in_any_format(geste_xlsx)

# The preview below stays disabled: the data is protected by a data use agreement.
#geste_data.head()

In [25]:
# Split GESTE into male (Sex == 1) and female (Sex == 2), then compute statistics.
geste_male = geste_data[geste_data.Sex == 1]
geste_female = geste_data[geste_data.Sex == 2]


def _geste_sex_column(group, ethnicity_labels, total):
    """Return the demographic-table rows for one sex of the GESTE cohort.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of ``geste_data`` belonging to a single sex.
    ethnicity_labels : sequence
        One entry per ethnicity row of the table: the ``Ethnicity`` label to
        report, or ``None`` to leave that row as ``'-'``.
    total : int
        Number of rows in the full cohort. It is the denominator of every
        percentage, so percentages are relative to both sexes combined.

    Returns
    -------
    list
        27 rows in the order of the final table: ``[count, percent]`` or
        ``[mean, std]`` pairs, with ``'-'`` for rows not reported for GESTE.

    Raises
    ------
    KeyError
        If a requested category label is absent from the counts.
    IndexError
        If the ADHD column holds fewer than two distinct values.
    """
    def count_pct(count):
        # Count and its share of the whole cohort, rounded to 2 decimals.
        return [count, np.round(count * 100 / total, 2)]

    def mean_std(values):
        return [np.round(values.mean(), 2), np.round(values.std(), 2)]

    # Compute each categorical count once instead of once per table cell.
    ethnicity = group["Ethnicity"].value_counts()
    education = group["edu_groups"].value_counts()
    income = group["Income_groups"].value_counts()

    # N is taken as the number of non-missing entries in the first column.
    rows = [count_pct(group.count().iloc[0]), mean_std(group["AgeMonths"])]
    rows += [
        '-' if label is None else count_pct(ethnicity.loc[label])
        for label in ethnicity_labels
    ]
    # Education: the first row is left blank; labels 3 and 5 are summed into
    # the "Some college" row.
    rows += [
        '-',
        count_pct(education.loc[2]),
        count_pct(education.loc[3] + education.loc[5]),
        count_pct(education.loc[4]),
        count_pct(education.loc[6]),
    ]
    rows += [count_pct(income.loc[label]) for label in (1, 2, 3)]
    # Diagnoses: only ADHD is reported for GESTE.
    # NOTE(review): value_counts() sorts by descending frequency, so .iloc[1]
    # is the count of the second most frequent value. It equals the number of
    # diagnosed participants only if the column is binary and the diagnosis is
    # the minority class -- confirm, or select the positive label explicitly.
    rows += ['-', count_pct(group["ADHD"].value_counts().iloc[1]), '-', '-', '-', '-']
    rows += [
        mean_std(group[score])
        for score in ("Internalization", "Externalization", "Stress", "VA", "EFPS", "MEM")
    ]
    return rows


# The blank ethnicity rows differ by sex. Presumably those categories have no
# participants in that group (.loc would raise KeyError) -- TODO confirm.
geste_male_desc = _geste_sex_column(geste_male, (1, 2, None, None, 5), geste_data.shape[0])

geste_female_desc = _geste_sex_column(geste_female, (1, None, 3, None, 5), geste_data.shape[0])

In [27]:
# Row labels of the final table, in the same order as the rows of every *_desc list.
demo_rows = [
    # Sample size and age.
    "N", "Age (months)",
    # Race/ethnicity.
    "White", "Black or African American", "Hispanic or Latino", "Asian", "Other",
    # Highest parental education.
    "No Highschool", "Highschool, GED, or equivalent", "Some college", "Bachelor Degree", "Postgraduate Degree",
    # Familial income.
    "Income < 50 000$USD", "Income 50 000-100 000$USD", "Income > 100 000$USD",
    # Psychopathology.
    "AD (%)", "ADHD (%)", "CD (%)", "DD (%)", "OCD (%)", "ODD (%)",
    # Cognitive and behavioral scores.
    "Internalization (mean)", "Externalization (mean)", "Stress (mean)", "VA (mean)", "EFPS (mean)", "MEM (mean)",
]

# One column per study and sex.
demo_columns = {
    "abcd_male": male_desc,
    "abcd_female": female_desc,
    "banda_male": banda_male_desc,
    "banda_female": banda_female_desc,
    "geste_male": geste_male_desc,
    "geste_female": geste_female_desc,
}

# Assemble the single demographics table and export it to Excel.
demo_table = pd.DataFrame(demo_columns)
demo_table.index = demo_rows
demo_table.to_excel(f'{output_dir}/demo_table.xlsx', index=True, header=True)