In [23]:
import os
import pandas as pd
from scipy import stats
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests 
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu

pd.set_option('display.max_rows', None)  

# Set base directories
base_dir = '/u/home/n/npeterse/BPM/abcd-data-release-5.1/'
oc_work_dir = '/u/home/n/npeterse/oc_work/'

# Load the included participants list
included_participants = pd.read_csv(os.path.join(oc_work_dir, 'included_participants.csv'))
included_subjects = included_participants['src_subject_id']

# Load datasets
ABCD_ct = pd.read_csv(os.path.join(base_dir, 'core/imaging/mri_y_smr_thk_dsk.csv'))
ABCD_pds = pd.read_csv(os.path.join(base_dir, 'core/physical-health/ph_y_pds.csv'))  
ABCD_adi = pd.read_csv(os.path.join(base_dir, 'led_l_adi.csv'))
ABCD_site = pd.read_csv(os.path.join(base_dir, 'core/abcd-general/abcd_y_lt.csv'))
ABCD_hormones = pd.read_csv(os.path.join(base_dir, 'cleaned_hormone_data.csv'))
ABCD_demo = pd.read_csv(os.path.join(base_dir, 'abcd_p_demo.csv'))  # Load the demographic data

# Filter to included participants and event name for year 4 data
ABCD_ct = ABCD_ct[(ABCD_ct['src_subject_id'].isin(included_subjects)) & (ABCD_ct['eventname'] == '4_year_follow_up_y_arm_1')]
ABCD_pds = ABCD_pds[(ABCD_pds['src_subject_id'].isin(included_subjects)) & (ABCD_pds['eventname'] == '4_year_follow_up_y_arm_1')]  
ABCD_adi = ABCD_adi[ABCD_adi['src_subject_id'].isin(included_subjects)]
ABCD_site = ABCD_site[(ABCD_site['src_subject_id'].isin(included_subjects)) & (ABCD_site['eventname'] == 'baseline_year_1_arm_1')]
ABCD_hormones = ABCD_hormones[ABCD_hormones['src_subject_id'].isin(included_subjects)]
ABCD_demo = ABCD_demo[ABCD_demo['src_subject_id'].isin(included_subjects)]  # Filter demographics

# Drop duplicates based on 'src_subject_id'
ABCD_ct.drop_duplicates(subset='src_subject_id', inplace=True)
ABCD_pds.drop_duplicates(subset='src_subject_id', inplace=True)
ABCD_adi.drop_duplicates(subset='src_subject_id', inplace=True)
ABCD_site.drop_duplicates(subset='src_subject_id', inplace=True)
ABCD_demo.drop_duplicates(subset='src_subject_id', inplace=True)

# Merge datasets after duplicates are removed
merged_df = pd.merge(ABCD_ct, ABCD_pds[['src_subject_id', 'pds_y_ss_female_category_2', 'menstrualcycle4_y']], on='src_subject_id', how='left')  # Adding youth puberty stage and menstrualcycle4_y
merged_df = pd.merge(merged_df, ABCD_site[['src_subject_id', 'interview_age', 'site_id_l']], on='src_subject_id', how='left')
merged_df = pd.merge(merged_df, ABCD_adi[['src_subject_id', 'reshist_addr1_adi_perc']], on='src_subject_id', how='left')
merged_df = pd.merge(merged_df, ABCD_hormones[['src_subject_id', 'filtered_hormone_scr_dhea', 'filtered_hormone_scr_ert', 'filtered_hormone_scr_hse']], on='src_subject_id', how='left')
merged_df = pd.merge(merged_df, ABCD_demo[['src_subject_id', 'demo_comb_income_v2', 'demo_prnt_ed_v2_2yr_l']], on='src_subject_id', how='left')  # Merge demographic data

# Uncomment to inspect the updated dataframe
#print(merged_df.head())


In [24]:
# I used this code block to determine which puberty variable to use. I'm not sure why puberty categories
# are listed twice in the code book (category, category_2), but only one contained data, so I used that one. 
# you can also see from this analysis that the parent and child reports have a low-moderate correlation.

# Load datasets
ph_p_pds = pd.read_csv(os.path.join(base_dir, 'core/physical-health/ph_p_pds.csv'))
ph_y_pds = pd.read_csv(os.path.join(base_dir, 'core/physical-health/ph_y_pds.csv'))

# filter to only use relevant year 4 data from participants included in the MRI dataset
included_participants = pd.read_csv('/u/home/n/npeterse/oc_work/included_participants.csv')
included_subjects = included_participants['src_subject_id']
ph_p_pds_filtered = ph_p_pds[(ph_p_pds['src_subject_id'].isin(included_subjects)) & 
                             (ph_p_pds['eventname'] == '4_year_follow_up_y_arm_1')]
ph_y_pds_filtered = ph_y_pds[(ph_y_pds['src_subject_id'].isin(included_subjects)) & 
                             (ph_y_pds['eventname'] == '4_year_follow_up_y_arm_1')]
ph_p_pds_filtered = ph_p_pds_filtered[['src_subject_id', 'pds_p_ss_female_category', 'pds_p_ss_female_category_2']]
ph_y_pds_filtered = ph_y_pds_filtered[['src_subject_id', 'pds_y_ss_female_category', 'pds_y_ss_female_category_2']]

# Merge the two datasets on src_subject_id so we can correlate parent vs youth report
puberty_df = pd.merge(ph_p_pds_filtered, ph_y_pds_filtered, on='src_subject_id', how='inner')

# drop 'src_subject_id' so we don't accidentally try to correlate that with something
numeric_columns = puberty_df.drop(columns=['src_subject_id'])

# Descriptive statistics: counts, means, and standard deviations
desc_stats = numeric_columns.describe()

# Correlations between the puberty stage variables
correlations = numeric_columns.corr()

# Display results
#print("Descriptive Statistics (Counts, Means, SDs):")
#print(desc_stats)

# Correlations between the puberty stage variables (parent and youth reports)
correlations = numeric_columns.corr()

# Filter out NaN correlations for cleaner output
correlations_cleaned = correlations.dropna(how='all', axis=0).dropna(how='all', axis=1)

# Display results with a clear explanation
print("\nCorrelations between parent (pds_p) and youth (pds_y) puberty stage variables:")
for column in correlations_cleaned.columns:
    for index in correlations_cleaned.index:
        if pd.notna(correlations_cleaned.loc[index, column]):
            print(f"Correlation between {index} and {column}: {correlations_cleaned.loc[index, column]:.3f}")




Correlations between parent (pds_p) and youth (pds_y) puberty stage variables:
Correlation between pds_p_ss_female_category_2 and pds_p_ss_female_category_2: 1.000
Correlation between pds_y_ss_female_category_2 and pds_p_ss_female_category_2: 0.285
Correlation between pds_p_ss_female_category_2 and pds_y_ss_female_category_2: 0.285
Correlation between pds_y_ss_female_category_2 and pds_y_ss_female_category_2: 1.000


In [25]:
# selecting the brain regions where a significant group difference in thickness emerged 
ct_columns = [col for col in merged_df.columns if col.startswith('smri_thick_cdk_')]

def run_lm(df, ct_column):
    model_formula = f'{ct_column} ~ menstrualcycle4_y'
    try:
        lm_model = smf.ols(model_formula, data=df).fit()
        return lm_model
    except Exception as e:
        print(f"Error fitting model for {ct_column}: {e}")
        return None

# Loop through each cortical thickness column and run the model
significant_results = {}

for column in ct_columns:
    model_result = run_lm(merged_df, column)
    if model_result:
        p_value = model_result.pvalues.get('menstrualcycle4_y', None)  # Get the p-value for menstrualcycle4_y
        if p_value is not None and p_value < 0.05:  # Check if p-value is less than 0.05
            significant_results[column] = {
                'p_value': p_value,
                'coef': model_result.params['menstrualcycle4_y']  # Get the coefficient for menstrualcycle4_y
            }

# Print the cortical thickness measurements with p < 0.05
if significant_results:
    print("Cortical thickness measurements with p < 0.05 for menstrualcycle4_y:")
    for column, stats in significant_results.items():
        print(f"{column}: p = {stats['p_value']:.5f}")
else:
    print("No significant results with p < 0.05 for menstrualcycle4_y.")


Cortical thickness measurements with p < 0.05 for menstrualcycle4_y:
smri_thick_cdk_mobfrlh: p = 0.01738
smri_thick_cdk_paracnlh: p = 0.00031
smri_thick_cdk_ptcatelh: p = 0.01548
smri_thick_cdk_precnlh: p = 0.01417
smri_thick_cdk_sufrlh: p = 0.01897
smri_thick_cdk_supllh: p = 0.04899
smri_thick_cdk_mobfrrh: p = 0.04870
smri_thick_cdk_paracnrh: p = 0.00016
smri_thick_cdk_pcrh: p = 0.02971
smri_thick_cdk_suplrh: p = 0.01244
smri_thick_cdk_meanrh: p = 0.03567


In [26]:
# you can use this block to see how adding each covariate affects the overall model fit for the regions
# where a group difference was observed, given by AIC, BIC, however, in the end, I didn't think this was a 
# useful way to make the decision, since each value is different for each region. we can come back to this 
# and generate some kind of summary score if this method is preferable to what I've done in the supplement

covariates = ['pds_y_ss_female_category_2', 'interview_age', 'reshist_addr1_adi_perc']

# Define the function to run the linear model with covariates
def run_lm_with_covariates(df, ct_column, covariates):
    results = {}
    base_formula = f'{ct_column} ~ menstrualcycle4_y'
    base_model = smf.ols(base_formula, data=df).fit()
    results['base'] = base_model

    # Add each covariate one by one
    for covariate in covariates:
        formula_with_covariate = f'{base_formula} + {covariate}'
        model_with_covariate = smf.ols(formula_with_covariate, data=df).fit()
        results[covariate] = model_with_covariate

    return results

# Loop through significant cortical thickness columns and add covariates
covariate_results = {}

for column in significant_results.keys():
    covariate_results[column] = run_lm_with_covariates(merged_df, column, covariates)
def compare_models(results):
    comparison = []
    for model_name, model in results.items():
        comparison.append({
            'Model': model_name,
            'AIC': model.aic,
            'BIC': model.bic,
            'Adj_R2': model.rsquared_adj
        })
    return pd.DataFrame(comparison)

for column, results in covariate_results.items():
    print(f"\nModel comparisons for {column}:")
    comparison_df = compare_models(results)
    print(comparison_df)



Model comparisons for smri_thick_cdk_mobfrlh:
                        Model          AIC          BIC    Adj_R2
0                        base -1757.635470 -1747.399438  0.003775
1  pds_y_ss_female_category_2 -1724.084256 -1708.786651  0.004026
2               interview_age -1764.488079 -1749.134030  0.010093
3      reshist_addr1_adi_perc -1692.987584 -1677.757623  0.002785

Model comparisons for smri_thick_cdk_paracnlh:
                        Model          AIC          BIC    Adj_R2
0                        base -1490.711813 -1480.475780  0.009706
1  pds_y_ss_female_category_2 -1455.653185 -1440.355580  0.010631
2               interview_age -1496.302953 -1480.948904  0.014980
3      reshist_addr1_adi_perc -1441.864340 -1426.634379  0.024018

Model comparisons for smri_thick_cdk_ptcatelh:
                        Model          AIC          BIC    Adj_R2
0                        base -1712.421451 -1702.185418  0.003940
1  pds_y_ss_female_category_2 -1694.069381 -1678.771776  0.002774

In [27]:
# instead, I just looked at a whole lot of correlations. this block correlates ADI with every cortical thickness
# measurement, then generates some summary statistics (nonparametric test comparing ADI between groups).
# based on feedback from Megan, I'm now also adding summary statistics comparing household income 
# (table: abcd_p_demo, variable: demo_comb_income_v2) between groups

adi_column = 'reshist_addr1_adi_perc'
ct_columns = [col for col in merged_df.columns if col.startswith('smri_thick_cdk_')]

# Store correlation and p-value results
correlation_results = []

# Correlate ADI with each cortical thickness region
for column in ct_columns:
    if not merged_df[column].isnull().all() and not merged_df[adi_column].isnull().all():
        corr, p_value = spearmanr(merged_df[adi_column], merged_df[column], nan_policy='omit')  # Spearman's rho correlation
        correlation_results.append({'Region': column, 'Correlation': corr, 'p_value': p_value})

# Convert results to DataFrame for easier manipulation
correlation_df = pd.DataFrame(correlation_results)

# Print each region with its correlation coefficient (r) and p-value
if not correlation_df.empty:
#    print(f"Correlation between ADI and cortical thickness for each region:")
    for index, row in correlation_df.iterrows():
#        print(f"Region: {row['Region']}, Correlation (r): {row['Correlation']:.5f}, p-value: {row['p_value']:.5e}")

    # Calculate range and average of correlation coefficients and p-values
        min_corr = correlation_df['Correlation'].min()
        max_corr = correlation_df['Correlation'].max()
        avg_corr = correlation_df['Correlation'].mean()
    
        min_p_value = correlation_df['p_value'].min()
        max_p_value = correlation_df['p_value'].max()
        avg_p_value = correlation_df['p_value'].mean()

    # Print summary statistics
    print("\nSummary Statistics:")
    print(f"Range of correlation coefficients: {min_corr:.5f} to {max_corr:.5f}")
    print(f"Average correlation coefficient: {avg_corr:.5f}")
    print(f"Range of p-values: {min_p_value:.5e} to {max_p_value:.5e}")
    print(f"Average p-value: {avg_p_value:.5e}")
else:
    print("No valid correlations were computed.")

adi_column = 'reshist_addr1_adi_perc'
group_column = 'menstrualcycle4_y'

# Split ADI values into two groups based on menstrualcycle4_y (0 and 1)
group_0 = merged_df[merged_df[group_column] == 0][adi_column].dropna()
group_1 = merged_df[merged_df[group_column] == 1][adi_column].dropna()

# Perform the Mann-Whitney U test
u_stat, p_value = mannwhitneyu(group_0, group_1, alternative='two-sided')

# Print the results
print(f"ADI Mann-Whitney U test results:")
print(f"U statistic: {u_stat:.5f}")
print(f"p-value: {p_value:.5e}")

# Optionally, print descriptive statistics for each group
print("\nDescriptive statistics for ADI (reshist_addr1_adi_perc):")
print(f"Group 0 (menstrualcycle4_y = 0): Mean = {group_0.mean():.5f}, Median = {group_0.median():.5f}, N = {len(group_0)}")
print(f"Group 1 (menstrualcycle4_y = 1): Mean = {group_1.mean():.5f}, Median = {group_1.median():.5f}, N = {len(group_1)}")

# Mann-Whitney U Test for Combined Family Income
group_0_income = merged_df[merged_df[group_column] == 0][income_column].dropna()
group_1_income = merged_df[merged_df[group_column] == 1][income_column].dropna()

u_stat_income, p_value_income = mannwhitneyu(group_0_income, group_1_income, alternative='two-sided')

# Print results for Combined Family Income
print(f"\nMann-Whitney U test results for Combined Family Income (demo_comb_income_v2):")
print(f"U statistic: {u_stat_income:.5f}")
print(f"p-value: {p_value_income:.5e}")

print("\nDescriptive statistics for Combined Family Income (demo_comb_income_v2):")
print(f"Group 0 (menstrualcycle4_y = 0): Mean = {group_0_income.mean():.5f}, Median = {group_0_income.median():.5f}, N = {len(group_0_income)}")
print(f"Group 1 (menstrualcycle4_y = 1): Mean = {group_1_income.mean():.5f}, Median = {group_1_income.median():.5f}, N = {len(group_1_income)}")


Summary Statistics:
Range of correlation coefficients: -0.15456 to 0.07163
Average correlation coefficient: -0.06935
Range of p-values: 9.05139e-08 to 9.86963e-01
Average p-value: 1.17233e-01
ADI Mann-Whitney U test results:
U statistic: 33000.50000
p-value: 3.81572e-01

Descriptive statistics for ADI (reshist_addr1_adi_perc):
Group 0 (menstrualcycle4_y = 0): Mean = 39.56111, Median = 34.00000, N = 1121
Group 1 (menstrualcycle4_y = 1): Mean = 42.79365, Median = 41.00000, N = 63

Mann-Whitney U test results for Combined Family Income (demo_comb_income_v2):
U statistic: 34506.00000
p-value: 2.04970e-01

Descriptive statistics for Combined Family Income (demo_comb_income_v2):
Group 0 (menstrualcycle4_y = 0): Mean = 61.65184, Median = 8.00000, N = 1169
Group 1 (menstrualcycle4_y = 1): Mean = 49.95385, Median = 9.00000, N = 65


In [28]:
# and now we do the same thing for age

age_column = 'interview_age'
ct_columns = [col for col in merged_df.columns if col.startswith('smri_thick_cdk_')]

# Add age^2 to the dataframe
merged_df['age_squared'] = merged_df[age_column] ** 2

# Store correlation and p-value results for both age and age^2
correlation_results_age = []
correlation_results_age_squared = []

# Calculate the average age
average_age = merged_df[age_column].mean()

# Correlate age and age^2 with each cortical thickness region
for column in ct_columns:
    if not merged_df[column].isnull().all() and not merged_df[age_column].isnull().all():
        # Correlate age with cortical thickness
        corr_age, p_value_age = spearmanr(merged_df[age_column], merged_df[column], nan_policy='omit')
        correlation_results_age.append({'Region': column, 'Correlation': corr_age, 'p_value': p_value_age})

        # Correlate age^2 with cortical thickness
        corr_age_squared, p_value_age_squared = spearmanr(merged_df['age_squared'], merged_df[column], nan_policy='omit')
        correlation_results_age_squared.append({'Region': column, 'Correlation': corr_age_squared, 'p_value': p_value_age_squared})

# Convert results to DataFrame for easier manipulation
correlation_df_age = pd.DataFrame(correlation_results_age)
correlation_df_age_squared = pd.DataFrame(correlation_results_age_squared)

# Print the average age of participants
print(f"Average age of participants: {average_age:.2f} months")

# Print each region with its correlation coefficient (r) and p-value for age
if not correlation_df_age.empty:
#    print(f"\nCorrelation between age and cortical thickness for each region:")
    for index, row in correlation_df_age.iterrows():
#        print(f"Region: {row['Region']}, Correlation (r): {row['Correlation']:.5f}, p-value: {row['p_value']:.5e}")

    # Calculate range and average of correlation coefficients and p-values for age
        min_corr_age = correlation_df_age['Correlation'].min()
        max_corr_age = correlation_df_age['Correlation'].max()
        avg_corr_age = correlation_df_age['Correlation'].mean()
    
        min_p_value_age = correlation_df_age['p_value'].min()
        max_p_value_age = correlation_df_age['p_value'].max()
        avg_p_value_age = correlation_df_age['p_value'].mean()

    print("\nSummary for age:")
    print(f"Range of correlation coefficients: {min_corr_age:.5f} to {max_corr_age:.5f}")
    print(f"Average correlation coefficient: {avg_corr_age:.5f}")
    print(f"Range of p-values: {min_p_value_age:.5e} to {max_p_value_age:.5e}")
    print(f"Average p-value: {avg_p_value_age:.5e}")


Average age of participants: 120.04 months

Summary for age:
Range of correlation coefficients: -0.14520 to 0.05601
Average correlation coefficient: -0.05903
Range of p-values: 3.01386e-07 to 9.65336e-01
Average p-value: 1.62586e-01


In [29]:
# I was just assuming that age and puberty stage correlated, but here it is shown empirically. this block also
# performs a nonparametric test to see if puberty stage differs significantly between study groups (they don't)

age_column = 'interview_age'
puberty_column = 'pds_y_ss_female_category_2'
group_column = 'menstrualcycle4_y'

# Mann-Whitney U Test for Age Differences
group_0_age = merged_df[merged_df[group_column] == 0][age_column].dropna()
group_1_age = merged_df[merged_df[group_column] == 1][age_column].dropna()

u_stat_age, p_value_age = mannwhitneyu(group_0_age, group_1_age, alternative='two-sided')

# Print Mann-Whitney U test results for Age
print(f"Mann-Whitney U test for age differences between groups:")
print(f"U statistic: {u_stat_age:.5f}")
print(f"p-value: {p_value_age:.5e}")

# Print the average age for each group
print(f"\nAverage age for group 0 (menstrualcycle4_y = 0): {group_0_age.mean():.2f}")
print(f"Average age for group 1 (menstrualcycle4_y = 1): {group_1_age.mean():.2f}")

# Mann-Whitney U Test for Puberty Stage Differences
group_0_puberty = merged_df[merged_df[group_column] == 0][puberty_column].dropna()
group_1_puberty = merged_df[merged_df[group_column] == 1][puberty_column].dropna()

u_stat_puberty, p_value_puberty = mannwhitneyu(group_0_puberty, group_1_puberty, alternative='two-sided')

# Print Mann-Whitney U test results for Puberty Stage
print(f"\nMann-Whitney U test for puberty stage (pds_y_ss_female_category_2) differences between groups:")
print(f"U statistic: {u_stat_puberty:.5f}")
print(f"p-value: {p_value_puberty:.5e}")

# Print the average puberty stage for each group
print(f"\nAverage puberty stage for group 0 (menstrualcycle4_y = 0): {group_0_puberty.mean():.2f}")
print(f"Average puberty stage for group 1 (menstrualcycle4_y = 1): {group_1_puberty.mean():.2f}")

age_column = 'interview_age'
puberty_column = 'pds_y_ss_female_category_2'

# Correlate age with puberty stage
corr_age_puberty, p_value_age_puberty = spearmanr(merged_df[age_column], merged_df[puberty_column], nan_policy='omit')

# Print the correlation coefficient and p-value
print(f"Spearman correlation between age and puberty stage (pds_y_ss_female_category_2):")
print(f"Correlation (r): {corr_age_puberty:.5f}")
print(f"p-value: {p_value_age_puberty:.5e}")


Mann-Whitney U test for age differences between groups:
U statistic: 27513.50000
p-value: 1.76317e-04

Average age for group 0 (menstrualcycle4_y = 0): 119.86
Average age for group 1 (menstrualcycle4_y = 1): 123.43

Mann-Whitney U test for puberty stage (pds_y_ss_female_category_2) differences between groups:
U statistic: 32271.50000
p-value: 1.49755e-02

Average puberty stage for group 0 (menstrualcycle4_y = 0): 4.18
Average puberty stage for group 1 (menstrualcycle4_y = 1): 4.30
Spearman correlation between age and puberty stage (pds_y_ss_female_category_2):
Correlation (r): 0.16680
p-value: 5.23542e-09


In [31]:
# and finally, not used in the supplement, relationship between puberty stage and cortical thickness

puberty_column = 'pds_y_ss_female_category_2'
ct_columns = [col for col in merged_df.columns if col.startswith('smri_thick_cdk_')]

# set up a dictionary
correlation_results = []

# Correlate puberty stage with each cortical thickness region
for column in ct_columns:
    if not merged_df[column].isnull().all() and not merged_df[puberty_column].isnull().all():
        corr, p_value = spearmanr(merged_df[puberty_column], merged_df[column], nan_policy='omit')  # Spearman's rho correlation
        correlation_results.append({'Region': column, 'Correlation': corr, 'p_value': p_value})

# Convert results to DataFrame for easier manipulation
correlation_df = pd.DataFrame(correlation_results)

# Print each region with its correlation coefficient (r) and p-value
if not correlation_df.empty:
#    print(f"Correlation between puberty stage (pds_y_ss_female_category_2) and cortical thickness for each region:")
    for index, row in correlation_df.iterrows():
#        print(f"Region: {row['Region']}, Correlation (r): {row['Correlation']:.5f}, p-value: {row['p_value']:.5e}")

    # Calculate range and average of correlation coefficients and p-values
        min_corr = correlation_df['Correlation'].min()
        max_corr = correlation_df['Correlation'].max()
        avg_corr = correlation_df['Correlation'].mean()
    
        min_p_value = correlation_df['p_value'].min()
        max_p_value = correlation_df['p_value'].max()
        avg_p_value = correlation_df['p_value'].mean()

    # Print summary statistics
    print("\nSummary Statistics:")
    print(f"Range of correlation coefficients: {min_corr:.5f} to {max_corr:.5f}")
    print(f"Average correlation coefficient: {avg_corr:.5f}")
    print(f"Range of p-values: {min_p_value:.5e} to {max_p_value:.5e}")
    print(f"Average p-value: {avg_p_value:.5e}")
else:
    print("No valid correlations were computed.")



Summary Statistics:
Range of correlation coefficients: -0.07824 to 0.07814
Average correlation coefficient: 0.00652
Range of p-values: 6.45070e-03 to 9.84734e-01
Average p-value: 4.43201e-01
