In [None]:
import os
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
import colorsys
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import graphviz
from statistics import mean, median, mode, stdev
import scipy.stats as stats

from utils import rubin_combine

# Turn off matplotlib's constrained-layout engine for every figure in this notebook.
plt.rcParams['figure.constrained_layout.use'] = False

# Shared integer seed constant.
SEED = 7355608

# Column names used as model inputs, in the order they are passed to the estimators.
predictors = [
    'base_phq9',
    'gad7_sum',
    'age',
    'sds_sum',
    'alc_sum',
    'gender',
    'education',
    'working',
    'marital_status',
    'race_is_latino',
    'race_is_black',
    'race_is_asian',
    'race_is_multiracial_or_other',
    'income_satisfaction',
]

In [None]:
# MAIN ANALYSIS
# Directory that the CSV-loading cells below read from. The sensitivity-analysis cells
# that follow override it when their `data_dir = ...` line is uncommented.
data_dir = "results/main"

In [None]:
# SENSITIVITY ANALYSIS: 12-week PHQ-9 instead of 4-week PHQ-9
# UNCOMMENT BELOW TO RUN
# data_dir = "results/sens_12wk"

In [None]:
# SENSITIVITY ANALYSIS: exclude participants with minimal baseline data
# UNCOMMENT BELOW TO RUN
# data_dir = "results/sens_exclude_min"

In [None]:
# Load the imputed dataset from `data_dir`; later cells split it on its `_Imputation_` column.
pt_data = pd.read_csv(os.path.join(data_dir, "miceRanger_imputed_formatted_Brighten-v1_all.csv"))

In [None]:
'''Generate decision trees for interpretation, using two strategies: 
    1. Fit a decision tree to the entire dataset, concatenating all imputed versions
    2. Fit a decision tree separately to each imputed version of the dataset (focusing on a variable of interest), and examine the distribution
'''

def man_cmap(cmap, value=1.):
    """Build a copy of a colormap with every colour's lightness scaled by `value`.

    Parameters:
    cmap: colormap object that is callable on an index array and exposes `.N`.
    value (float): multiplier applied to the lightness (L) channel in HLS space.

    Returns:
    A `LinearSegmentedColormap` created from the adjusted RGB colours.
    """
    # Sample every entry of the colormap; only the RGB columns are used below.
    sampled = cmap(np.arange(cmap.N))

    hls_rows = []
    for red, green, blue in sampled[:, :3]:
        hls_rows.append(colorsys.rgb_to_hls(red, green, blue))
    hls_table = np.array(hls_rows)

    # Column 1 of an HLS triple is lightness.
    hls_table[:, 1] = hls_table[:, 1] * value

    rgb_table = np.array([colorsys.hls_to_rgb(hue, light, sat) for hue, light, sat in hls_table])
    # Scaling lightness can push channels outside [0, 1]; clamp before building the map.
    rgb_table = np.clip(rgb_table, 0, 1)
    return mcolors.LinearSegmentedColormap.from_list("", rgb_table)

for outcome in ["mdd_improve"]:

    print("OUTCOME: " + outcome)

    # --- Strategy 1: one depth-2 tree fitted to all imputed versions stacked together ---
    x = pt_data[predictors].copy()

    # random_state pins the tie-breaking between equally good splits, so a
    # Restart & Run All reproduces the same tree.
    clf_all = tree.DecisionTreeClassifier(max_depth=2, random_state=SEED)
    clf_all = clf_all.fit(x, pt_data[outcome])
    clf_data = tree.export_graphviz(clf_all, out_file=None, feature_names=predictors, class_names=["no-" + outcome, outcome])
    graph = graphviz.Source(clf_data)
    # Writes the rendered tree to disk under this base name.
    graph.render("decision_tree_" + outcome)

    #http://www.futurile.net/2016/02/27/matplotlib-beautiful-plots-with-style/
    plt.style.use('ggplot')
    txt_col = 'k'
    plt.rcParams['text.color'] = txt_col
    plt.rcParams['axes.labelcolor'] = txt_col
    plt.rcParams['xtick.color'] = txt_col
    plt.rcParams['ytick.color'] = txt_col
    plt.rcParams['axes.labelsize'] = 16
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['xtick.labelsize'] = 16
    plt.rcParams['ytick.labelsize'] = 16
    plt.rcParams['legend.fontsize'] = 16
    plt.rcParams['figure.titlesize'] = 24
    # NOTE(review): no figure is drawn in this cell, so this acts on an implicitly
    # created empty figure -- confirm whether it is still needed.
    plt.tight_layout()

    # --- Strategy 2: a depth-1 tree (single split) per imputed dataset, on one feature ---
    feature_of_interest = "gad7_sum"
    verbose=False
    thresholds = []
    for imp in pt_data["_Imputation_"].unique():
        pt_data_imp = pt_data[pt_data["_Imputation_"] == imp]

        x = pt_data_imp[[feature_of_interest]].copy()

        clf_imp = tree.DecisionTreeClassifier(max_depth=1, random_state=SEED)
        clf_imp = clf_imp.fit(x, pt_data_imp[outcome])
        fitted = clf_imp.tree_

        # A root with no children means no split was made (e.g. only one class present);
        # there is no threshold to record in that case.
        if fitted.node_count < 3:
            print(f"Warning: no split found for imputation {imp}; skipping.")
            continue

        # Threshold value used at the root node
        threshold = fitted.threshold[0]
        thresholds.append(threshold)

        if verbose:
            # Class distributions of the two children, looked up through the tree's own
            # child pointers instead of assuming they are stored at indices 1 and 2.
            left_dist = fitted.value[fitted.children_left[0]]    # feature <= threshold
            right_dist = fitted.value[fitted.children_right[0]]  # feature > threshold
            left_prob = left_dist[0][1] / left_dist[0].sum()     # P(outcome=True | <= threshold)
            right_prob = right_dist[0][1] / right_dist[0].sum()  # P(outcome=True | > threshold)

            direction = "more" if right_prob > left_prob else "less"
            print(f"\nThreshold: {feature_of_interest} = {threshold:.2f}")
            print(f"Values above threshold are {direction} likely to have {outcome}=True")

    # Computed outside the f-string: reusing the same quote type inside an f-string
    # expression is a SyntaxError on Python versions before 3.12.
    n_imputations = pt_data["_Imputation_"].nunique()
    print(f"** Statistics for threshold values ({feature_of_interest}) among {n_imputations} imputations: **")

    if not thresholds:
        print("No thresholds were found; nothing to summarize.")
        continue

    median_threshold = median(thresholds)
    # statistics.stdev needs at least two data points.
    threshold_sd = stdev(thresholds) if len(thresholds) > 1 else float("nan")

    print(f"Mean threshold: {mean(thresholds)}")
    print(f"Median threshold: {median_threshold}")
    print(f"Mode threshold: {mode(thresholds)}")
    print(f"Standard deviation of thresholds: {threshold_sd}")
    print(f"Minimum threshold: {min(thresholds)}")
    print(f"Maximum threshold: {max(thresholds)}")
    print(f"Unique thresholds: {np.unique(thresholds)}")
    # The value shown after "median=" is the median that the comparison actually uses.
    print(f"Proportion of thresholds equal to median={median_threshold}: {np.mean(np.array(thresholds) == median_threshold)}")


In [None]:
'''Generate odds ratios for MDD improvement (using a single-variable threshold of interest), pooling across imputed versions of the dataset using Rubin's rules. 
'''

n_impute=100  # NOTE(review): not referenced in this cell; the imputation count comes from the data.

for arm in ["all", "HealthTips", "EVO", "iPST"]:
    for group in ["all"]:
        print("STUDY ARM:", arm)

        # Use a cell-local name so the notebook-level `pt_data` (needed, with all its
        # columns, by the decision-tree cell) is not overwritten by this subset.
        arm_data = pd.read_csv(os.path.join(data_dir, f"miceRanger_imputed_formatted_Brighten-v1_{group}.csv"))

        if arm != "all":
            arm_data = arm_data[arm_data[f"study_arm_{arm}"] == 1]

        # .copy() gives an independent frame, so the column assignments below do not
        # trigger pandas' SettingWithCopyWarning.
        arm_data = arm_data[["_Imputation_", "gad7_sum", "mdd_improve"]].copy()

        # Complementary indicators around the GAD-7 cut-off of 11.
        arm_data["gad7_below_11"] = arm_data["gad7_sum"] < 11
        arm_data["gad7_11_or_higher"] = arm_data["gad7_sum"] >= 11

        for cond in ["gad7_below_11", "gad7_11_or_higher"]:
            print("Condition: " + cond)
            for outcome in ["mdd_improve"]:
                print("----OUTCOME: " + outcome + "----")

                or_vals = []
                or_vars = []
                for imp in arm_data["_Imputation_"].unique():
                    imp_data = arm_data[arm_data["_Imputation_"] == imp]

                    in_cond = imp_data[cond]
                    # Force boolean dtype so `~` is a logical NOT even if the CSV stores 0/1 integers.
                    has_outcome = imp_data[outcome].astype(bool)

                    # 2x2 table: rows = outcome present / absent, columns = condition met / not met.
                    cont_table = np.array([
                        [(in_cond & has_outcome).sum(), (~in_cond & has_outcome).sum()],
                        [(in_cond & ~has_outcome).sum(), (~in_cond & ~has_outcome).sum()]
                    ])

                    # If any cell of the contingency table is zero, add 0.5 to all cells.
                    if np.any(cont_table == 0):
                        cont_table = cont_table + 0.5

                    # Within-imputation value estimate (odds ratio)
                    or_vals.append((cont_table[0,0]/cont_table[0,1]) / (cont_table[1,0]/cont_table[1,1]))

                    # Within-imputation variance: sum of the reciprocals of the four cells
                    or_vars.append(sum([1/val for val in cont_table.reshape(-1)]))

                pooled_or, lower_ci, upper_ci, ors, pval = rubin_combine(or_vals, or_vars, log_normal=True)

                print("RUBIN OddsR:", round(pooled_or, 4), "[", round(lower_ci, 4), ",", round(upper_ci, 4), "].")
                print("avg OddsR:", round(sum(ors)/len(ors), 4))
                print("p-value:", pval)
                print("based on ", len(ors), " odds ratios")

            print("---------")
    print("==============================")

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import os
from utils import rubin_combine

def _fisher_z(r):
    """Return arctanh(r); the result is +/-inf for |r| == 1 and NaN for a NaN input."""
    # Silence NumPy's floating-point warnings here; callers check the result with np.isfinite.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.arctanh(r)


def _report_pooled_correlation(name, symbol, zs, variances):
    """Pool Fisher-Z values with `rubin_combine` and print the back-transformed results."""
    if not zs:
        print(f"Could not calculate {name} correlation due to insufficient data.")
        return

    # log_normal=False: the values are pooled on the Fisher-Z scale as given.
    pooled_z, lower_z, upper_z, _, p_val = rubin_combine(zs, variances, log_normal=False)

    print(f"--- Pooled {name} Correlation ---")
    # Transform the pooled Z-score and its CI back to the correlation scale
    print(f"Pooled Correlation ({symbol}): {np.tanh(pooled_z):.5f}")
    print(f"95% Confidence Interval: [{np.tanh(lower_z):.5f}, {np.tanh(upper_z):.5f}]")
    print(f"P-value: {p_val:.5f}")


def analyze_correlation_with_rubin_combine(df, var1, var2):
    """
    Calculates and pools Pearson and Spearman correlations using the existing rubin_combine function.

    This function performs the following steps:
    1. For each imputed dataset, calculates the Pearson and Spearman correlation coefficients.
    2. Applies the Fisher Z-transformation to each coefficient (arctanh).
    3. Uses 1/(n-3) as the within-imputation variance of each transformed coefficient.
    4. Uses the pre-existing `rubin_combine` function to pool the Z-transformed values.
    5. Transforms the pooled Z-score and its confidence interval back to the correlation scale (tanh).
    6. Prints the final pooled correlation, confidence interval, and p-value.

    Imputations with fewer than 4 rows, or whose coefficient has no finite Fisher Z
    (|r| == 1 or NaN), are reported with a warning and left out of the pooling.

    Parameters:
    df (pd.DataFrame): DataFrame containing all imputed datasets, with an '_Imputation_' column.
    var1 (str): The name of the first variable's column.
    var2 (str): The name of the second variable's column.

    Returns:
    None. Results are printed.

    Raises:
    AssertionError: if an imputed dataset contains missing values in `var1` or `var2`.
    """
    pearson_zs = []
    pearson_vars = []
    spearman_zs = []
    spearman_vars = []
    sample_sizes = []

    imputations = df['_Imputation_'].unique()
    m = len(imputations)

    for imp in imputations:
        df_imp = df[df['_Imputation_'] == imp]
        complete_cases = df_imp[[var1, var2]].dropna()
        n = len(complete_cases)
        assert len(complete_cases) == len(df_imp), "There shouldn't be any missing data in the imputed versions of the dataset"
        sample_sizes.append(n)

        # Variance for Fisher's Z-transform is 1/(n-3), so we need at least 4 samples.
        if n < 4:
            print("Warning: Not enough complete cases to calculate Z-transformed correlation.")
            continue

        z_variance = 1 / (n - 3)

        # --- Pearson Correlation ---
        pearson_r, _ = stats.pearsonr(complete_cases[var1], complete_cases[var2])
        pearson_z = _fisher_z(pearson_r)
        if np.isfinite(pearson_z):
            pearson_zs.append(pearson_z)
            pearson_vars.append(z_variance)
        else:
            # An infinite or NaN Z would make the pooled estimate meaningless.
            print(f"Warning: Pearson correlation is {pearson_r} for imputation {imp}; excluded from pooling.")

        # --- Spearman Correlation ---
        # NOTE(review): 1/(n-3) is the Pearson Fisher-Z variance; some references use
        # about 1.06/(n-3) for Spearman -- confirm which is intended.
        spearman_r, _ = stats.spearmanr(complete_cases[var1], complete_cases[var2])
        spearman_z = _fisher_z(spearman_r)
        if np.isfinite(spearman_z):
            spearman_zs.append(spearman_z)
            spearman_vars.append(z_variance)
        else:
            print(f"Warning: Spearman correlation is {spearman_r} for imputation {imp}; excluded from pooling.")

    print(f"Analyzing correlation between '{var1}' and '{var2}' across {m} imputations.")
    print(f"Average sample size per imputation (after dropping NA): {np.mean(sample_sizes):.1f}\n")

    # --- Pool Pearson Results ---
    _report_pooled_correlation("Pearson", "r", pearson_zs, pearson_vars)

    print("")

    # --- Pool Spearman Results ---
    _report_pooled_correlation("Spearman", "ρ", spearman_zs, spearman_vars)

    print("-----\nNote: Spearman correlation is more appropriate for ordinal data.")

# Reload the imputed dataset so this cell does not depend on whatever earlier cells left in `pt_data`.
pt_data = pd.read_csv(os.path.join(data_dir, "miceRanger_imputed_formatted_Brighten-v1_all.csv"))

# Print the pooled Pearson and Spearman correlations between the GAD-7 and SDS sum scores.
analyze_correlation_with_rubin_combine(pt_data, 'gad7_sum', 'sds_sum')

In [None]:
## SANITY CHECK: correlation of entire concatenated multiply-imputed dataset (should be similar to Rubin version)

import pandas as pd
import numpy as np
from scipy import stats

def analyze_gad7_sds_correlation(df):
    """
    Correlate the GAD-7 and SDS sum scores over all rows of `df`, ignoring rows
    where either score is missing, and print a short report.

    Parameters:
    df (pandas.DataFrame): DataFrame containing 'gad7_sum' and 'sds_sum' columns

    Returns:
    dict: row counts ('n_total', 'n_complete', 'n_missing_gad7', 'n_missing_sds') and
          the Pearson / Spearman coefficients with their p-values. The four statistics
          are NaN when fewer than 2 complete rows exist.
    """
    gad7_col, sds_col = 'gad7_sum', 'sds_sum'

    # Keep only rows where both scores are present.
    paired = df.dropna(subset=[gad7_col, sds_col])

    # Start with NaN statistics; they are filled in below when enough rows exist.
    results = {
        'n_total': len(df),
        'n_complete': len(paired),
        'n_missing_gad7': df[gad7_col].isna().sum(),
        'n_missing_sds': df[sds_col].isna().sum(),
        'pearson_r': np.nan,
        'pearson_p': np.nan,
        'spearman_r': np.nan,
        'spearman_p': np.nan,
    }

    # The correlations are only computed when at least 2 complete rows exist.
    if results['n_complete'] >= 2:
        gad7_scores = paired[gad7_col]
        sds_scores = paired[sds_col]
        results['pearson_r'], results['pearson_p'] = stats.pearsonr(gad7_scores, sds_scores)
        results['spearman_r'], results['spearman_p'] = stats.spearmanr(gad7_scores, sds_scores)

    # Print results
    report_lines = [
        f"Complete cases: {results['n_complete']} out of {results['n_total']}",
        f"Missing GAD-7: {results['n_missing_gad7']}",
        f"Missing SDS: {results['n_missing_sds']}",
        f"Pearson correlation: r = {results['pearson_r']:.3f}, p = {results['pearson_p']:.3f}",
        f"Spearman correlation: ρ = {results['spearman_r']:.3f}, p = {results['spearman_p']:.3f}",
    ]
    for line in report_lines:
        print(line)

    return results


# Reload the imputed dataset so this cell does not depend on whatever earlier cells left in `pt_data`.
pt_data = pd.read_csv(os.path.join(data_dir, "miceRanger_imputed_formatted_Brighten-v1_all.csv"))

# Run the check on every row of the file at once (no split by `_Imputation_`).
results = analyze_gad7_sds_correlation(pt_data)