In [92]:
import os
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import nibabel as nib
import numpy as np
import pandas as pd
import seaborn as sns

In [93]:
# ── Global visualisation configuration ──────────────────────────────────────
#
# Order matters: sns.set_theme() itself rewrites Matplotlib rcParams (font
# sizes via `context`, spines/grid/font list via `style`, colour cycle via
# `palette`).  The theme is therefore applied FIRST, exactly once, and the
# project-specific rcParams are layered on top so they are the values that
# actually take effect.
# `mpl` and `sns` come from the notebook's import cell.

# 1.  Seaborn theme (baseline)
sns.set_theme(
    context="talk",  # slightly larger fonts for presentations / papers
    style="whitegrid",  # white background with grid lines
    palette="Set2",  # same palette as the explicit colour cycle below
)

# 2.  Project-specific Matplotlib overrides (applied on top of the theme)
mpl.rcParams.update(
    {
        # ── Canvas size & resolution ───────────────────────────────────────────
        # Default figure size: 12×8 inches  →  4800×3200 px at 400 dpi
        "figure.figsize": (12, 8),
        "figure.dpi": 200,  # crisp in-notebook / retina preview
        "savefig.dpi": 400,  # print-quality PNG/PDF
        # ── Fonts ──────────────────────────────────────────────────────────────
        "font.family": "sans-serif",
        "font.sans-serif": ["Roboto", "DejaVu Sans", "Arial"],
        "axes.titlesize": 24,
        # "axes.titleweight": "bold",
        "axes.labelsize": 24,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
        "legend.fontsize": 20,
        # ── Axis & spine aesthetics ────────────────────────────────────────────
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.spines.left": True,
        "axes.spines.bottom": True,
        "axes.linewidth": 1,
        "axes.grid": True,
        "grid.color": "#E6E6E6",
        "grid.linewidth": 0.4,
        "grid.alpha": 0.8,
        # ── Colour cycle (seaborn "Set2" palette) ──────────────────────────────
        "axes.prop_cycle": mpl.cycler(color=sns.color_palette("Set2")),
        # ── Figure background ─────────────────────────────────────────────────
        "figure.facecolor": "white",
    }
)


# 3.  Helper function for consistent figure export
def savefig_nice(fig, filename, *, tight=True, dpi=300, **savefig_kwargs):
    """Save ``fig`` to ``filename`` with a tight bounding box and transparent background.

    Parameters
    ----------
    fig
        Object exposing ``tight_layout()`` and ``savefig()`` (e.g. a Matplotlib
        figure).
    filename
        Destination, passed straight through to ``fig.savefig``.
    tight : bool, default True
        Call ``fig.tight_layout()`` before saving.
    dpi : int, default 300
        Resolution forwarded to ``fig.savefig``.  It is always passed
        explicitly, so it takes precedence over the ``savefig.dpi`` rcParam.
    **savefig_kwargs
        Extra keyword arguments for ``fig.savefig``.  ``bbox_inches`` and
        ``transparent`` default to ``"tight"`` and ``True`` and can be
        overridden here.
    """
    if tight:
        fig.tight_layout()
    # Merge defaults with caller options so that passing ``transparent=`` or
    # ``bbox_inches=`` overrides the default instead of raising a
    # duplicate-keyword TypeError.
    options = {"bbox_inches": "tight", "transparent": True, **savefig_kwargs}
    fig.savefig(filename, dpi=dpi, **options)


# 4.  Project-wide colour constants (optional convenience)
# Unweighted sample (for example).
COL_RAW = "#1f77b4"
# Weighted sample.
COL_WEIGHTED = "#d62728"
# Census reference, drawn in a neutral grey.
COL_REF = "0.35"

In [94]:
# Atlas identifier; also the name of the atlas sub-directory on disk.
ATLAS = "schaefer2018tian2020_400_7"
# Name of the column used to match parcels between tables.
region_col = "index"

# Project root.  Set the ATHLETES_BRAIN_ROOT environment variable to run the
# notebook from another checkout or machine; the fallback is the original
# hard-coded location.
PROJECT_ROOT = Path(
    os.environ.get("ATHLETES_BRAIN_ROOT", "/home/galkepler/Projects/athletes_brain")
)
DATA_DIR = PROJECT_ROOT / "data"

# Output directory for figures (created on demand)
OUTPUT_DIR = PROJECT_ROOT / "figures" / "fig1"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Atlas files: parcel table plus the two NIfTI paths
ATLAS_DIR = DATA_DIR / "external" / "atlases" / ATLAS
parcels = pd.read_csv(ATLAS_DIR / "parcels.csv", index_col=0)
nifti = ATLAS_DIR / "atlas.nii.gz"
nifti_matlab = ATLAS_DIR / "atlas_matlab.nii"

In [95]:
# Metrics to analyse; each one is read from "<DATA_DIR>/processed/<metric>.csv".
metrics = ["gm_vol", "wm_vol", "csf_vol", "adc", "fa", "ad", "rd"]
# Column used as the value for every metric whose name does not contain "vol".
distribution_metric = "qfmean"

# Subjects excluded from every metric table.
bad_subjects = ["IN120120"]

# Load the data: one DataFrame per metric, keyed by metric name.
data = {}
for metric in metrics:
    # NOTE(review): the saved output of this cell echoes the read_csv line once
    # per metric, which is presumably pandas' DtypeWarning for mixed-type
    # columns -- TODO confirm.  low_memory=False makes pandas infer each
    # column's dtype from the whole file instead of chunk by chunk.
    metric_df = pd.read_csv(
        DATA_DIR / "processed" / f"{metric}.csv",
        index_col=0,
        low_memory=False,
    ).reset_index(drop=True)
    # Drop problematic subjects and encode sex numerically (M -> 0, F -> 1; any
    # other value becomes NaN).  .assign() returns a new frame, which avoids
    # assigning a column on a filtered slice.
    keep = ~metric_df["subject_code"].isin(bad_subjects)
    data[metric] = metric_df.loc[keep].assign(
        sex=lambda frame: frame["sex"].map({"M": 0, "F": 1})
    )



# data["age_squared"] = data["age_at_scan"] ** 2

  data[metric] = pd.read_csv(DATA_DIR / "processed" / f"{metric}.csv",index_col=0).reset_index(drop=True)
  data[metric] = pd.read_csv(DATA_DIR / "processed" / f"{metric}.csv",index_col=0).reset_index(drop=True)
  data[metric] = pd.read_csv(DATA_DIR / "processed" / f"{metric}.csv",index_col=0).reset_index(drop=True)
  data[metric] = pd.read_csv(DATA_DIR / "processed" / f"{metric}.csv",index_col=0).reset_index(drop=True)
  data[metric] = pd.read_csv(DATA_DIR / "processed" / f"{metric}.csv",index_col=0).reset_index(drop=True)
  data[metric] = pd.read_csv(DATA_DIR / "processed" / f"{metric}.csv",index_col=0).reset_index(drop=True)
  data[metric] = pd.read_csv(DATA_DIR / "processed" / f"{metric}.csv",index_col=0).reset_index(drop=True)


In [96]:
# Source column holding each metric's value: "volume" for metrics whose name
# contains "vol", the configured distribution summary for everything else.
metric_cols = dict.fromkeys(metrics, distribution_metric)
metric_cols.update((name, "volume") for name in metrics if "vol" in name)

# Give every table a common "value" column so the models below can use a single
# formula regardless of metric.
for m in data:
    df = data[m].rename(columns={metric_cols[m]: "value"})
    data[m] = df

In [97]:
# --- 1. Initial Setup and Imports ---

# Standard libraries
import os
import pandas as pd
import numpy as np

# Statistical analysis
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning (for potential future steps like classification)
# from sklearn.model_selection import train_test_split, cross_val_score # Not directly used in this section
# from sklearn.svm import SVC # Not directly used in this section
# from sklearn.metrics import classification_report, confusion_matrix # Not directly used in this section

# For multiple comparisons correction (will be used later)
from statsmodels.stats.multitest import multipletests

In [98]:
# --- 3. Univariate Group Comparisons (ANCOVA) - Adapted for Dictionary ---
#
# Athletes vs. controls.  For every metric table in `data` and every parcel in
# `parcels`, fit
#     value ~ C(target) + age_at_scan + sex (+ tiv for volumetric metrics)
# and record the type-II ANOVA F-test of the group term, the raw group means
# and the coefficient / standard error / t-statistic of the "True" level.
# P-values are FDR-corrected (Benjamini-Hochberg) across parcels, per metric.

print("\n--- Starting Univariate Group Comparisons (ANCOVA) ---")

# Group labels from the 'group' column.  They are not used by this comparison
# (it relies on the 'target' column); the values match the labels used by the
# Climbing-vs-Bjj comparison.
control_group_label = "Control"
climber_group_label = "Climbing"
bjj_group_label = "Bjj"

# Store results for all comparisons: metric name -> per-parcel statistics table
athletes_vs_naive_results = {}

# Patsy term for the group factor and the name of its single contrast
group_term = "C(target)"
group_effect = f"{group_term}[T.True]"
stat_columns = [
    "F_statistic",
    "p_value",
    "True_Mean",
    "False_Mean",
    "coefficient",
    "std_err",
    "t_statistic",
]

# Loop through each MRI metric type in the dictionary
for metric_type, df_metric in data.items():
    print(f"\n--- Analyzing Metric: {metric_type} ---")

    # Covariates for the current metric. TIV is only added for volumetric measures.
    current_covariates = ["age_at_scan", "sex"]
    if "vol" in metric_type and "tiv" in df_metric.columns:
        current_covariates.append("tiv")
    # The formula is identical for every parcel, so build it once per metric.
    formula = " + ".join([f"value ~ {group_term}"] + current_covariates)

    print(f"  Performing Athlete vs. Control comparison for {metric_type}...")
    temp_df_target = df_metric.copy()  # keep all columns, including the region column
    # Cast to str so the group levels are the strings "True" / "False"
    temp_df_target["target"] = temp_df_target["target"].astype(str)

    if temp_df_target.empty or len(temp_df_target["target"].unique()) < 2:
        print(f"    Skipping Athlete vs. Control for {metric_type}: Not enough data or groups.")
        continue

    # Parcel table plus one (initially NaN) column per statistic
    metric_stats = parcels.copy().assign(**{col: np.nan for col in stat_columns})
    for i, row in parcels.iterrows():
        cur_df = temp_df_target[temp_df_target[region_col] == row[region_col]]
        # Any fitting error propagates and aborts the cell.
        model = ols(formula, data=cur_df).fit()
        aov_table = sm.stats.anova_lm(model, typ=2)

        # Raw (unadjusted) group means for plotting
        group_means = cur_df.groupby("target")["value"].mean().to_dict()
        metric_stats.loc[i, "F_statistic"] = aov_table.loc[group_term, "F"]
        metric_stats.loc[i, "p_value"] = aov_table.loc[group_term, "PR(>F)"]
        metric_stats.loc[i, "True_Mean"] = group_means.get("True", np.nan)
        metric_stats.loc[i, "False_Mean"] = group_means.get("False", np.nan)
        metric_stats.loc[i, "coefficient"] = model.params.get(group_effect, np.nan)
        metric_stats.loc[i, "std_err"] = model.bse.get(group_effect, np.nan)
        metric_stats.loc[i, "t_statistic"] = model.tvalues.get(group_effect, np.nan)

    # FDR correction, run once per metric after all parcels are fitted.  Only
    # finite p-values are corrected: a single NaN p-value (e.g. from a
    # degenerate fit) would otherwise turn every adjusted p-value into NaN.
    p_values = metric_stats["p_value"].to_numpy(dtype=float)
    testable = np.isfinite(p_values)
    adjusted = np.full(p_values.shape, np.nan)
    if testable.any():
        adjusted[testable] = multipletests(p_values[testable], method="fdr_bh")[1]
    metric_stats["adjusted_p_value"] = adjusted
    athletes_vs_naive_results[metric_type] = metric_stats


--- Starting Univariate Group Comparisons (ANCOVA) ---

--- Analyzing Metric: gm_vol ---
  Performing Athlete vs. Control comparison for gm_vol...

--- Analyzing Metric: wm_vol ---
  Performing Athlete vs. Control comparison for wm_vol...

--- Analyzing Metric: csf_vol ---
  Performing Athlete vs. Control comparison for csf_vol...


  F /= J



--- Analyzing Metric: adc ---
  Performing Athlete vs. Control comparison for adc...

--- Analyzing Metric: fa ---
  Performing Athlete vs. Control comparison for fa...

--- Analyzing Metric: ad ---
  Performing Athlete vs. Control comparison for ad...

--- Analyzing Metric: rd ---
  Performing Athlete vs. Control comparison for rd...


In [99]:
# --- 3. Univariate Group Comparisons (ANCOVA) - Adapted for Dictionary ---
#
# Climbing vs. Bjj, among rows whose 'target' flag is True.  For every metric
# table in `data` and every parcel in `parcels`, fit
#     value ~ C(group, reference=climbers) + age_at_scan + sex (+ tiv for volumes)
# and record the type-II ANOVA F-test of the group term, the raw group means
# and the coefficient / standard error / t-statistic of the Bjj level.
# P-values are FDR-corrected (Benjamini-Hochberg) across parcels, per metric.

print("\n--- Starting Univariate Group Comparisons (ANCOVA) ---")

# Group labels as they appear in the 'group' column
control_group_label = "Control"
climber_group_label = "Climbing"
bjj_group_label = "Bjj"

# Store results for all comparisons: metric name -> per-parcel statistics table
climbers_vs_bjj_results = {}

# Patsy term for the group factor (climbers are the reference level) and the
# name of the resulting Bjj contrast.  Both are derived from the labels above
# so the formula, the ANOVA lookup and the coefficient lookup cannot drift apart.
group_term = f"C(group, Treatment(reference='{climber_group_label}'))"
group_effect = f"{group_term}[T.{bjj_group_label}]"
stat_columns = [
    "F_statistic",
    "p_value",
    "climber_Mean",
    "bjj_Mean",
    "coefficient",
    "std_err",
    "t_statistic",
]

# Loop through each MRI metric type in the dictionary
for metric_type, df_metric in data.items():
    print(f"\n--- Analyzing Metric: {metric_type} ---")

    # Covariates for the current metric. TIV is only added for volumetric measures.
    current_covariates = ["age_at_scan", "sex"]
    if "vol" in metric_type and "tiv" in df_metric.columns:
        current_covariates.append("tiv")
    # The formula is identical for every parcel, so build it once per metric.
    formula = " + ".join([f"value ~ {group_term}"] + current_covariates)

    # --- Comparison: climbers vs. Bjj using the 'group' column ---
    print(f"  Performing {climber_group_label} vs. {bjj_group_label} comparison for {metric_type}...")
    # Keep rows flagged by 'target' that belong to one of the two groups
    in_comparison = df_metric["target"] & df_metric["group"].isin(
        [climber_group_label, bjj_group_label]
    )
    temp_df_target = df_metric[in_comparison].copy()
    temp_df_target["group"] = temp_df_target["group"].astype(str)  # treat 'group' as categorical

    if temp_df_target.empty or len(temp_df_target["group"].unique()) < 2:
        print(
            f"    Skipping {climber_group_label} vs. {bjj_group_label} for {metric_type}: "
            "Not enough data or groups."
        )
        continue

    # Parcel table plus one (initially NaN) column per statistic
    metric_stats = parcels.copy().assign(**{col: np.nan for col in stat_columns})
    for i, row in parcels.iterrows():
        cur_df = temp_df_target[temp_df_target[region_col] == row[region_col]]
        # Any fitting error propagates and aborts the cell.
        model = ols(formula, data=cur_df).fit()
        aov_table = sm.stats.anova_lm(model, typ=2)

        # Raw (unadjusted) group means for plotting
        group_means = cur_df.groupby("group")["value"].mean().to_dict()
        metric_stats.loc[i, "F_statistic"] = aov_table.loc[group_term, "F"]
        metric_stats.loc[i, "p_value"] = aov_table.loc[group_term, "PR(>F)"]
        metric_stats.loc[i, "climber_Mean"] = group_means.get(climber_group_label, np.nan)
        metric_stats.loc[i, "bjj_Mean"] = group_means.get(bjj_group_label, np.nan)
        metric_stats.loc[i, "coefficient"] = model.params.get(group_effect, np.nan)
        metric_stats.loc[i, "std_err"] = model.bse.get(group_effect, np.nan)
        metric_stats.loc[i, "t_statistic"] = model.tvalues.get(group_effect, np.nan)

    # FDR correction, run once per metric after all parcels are fitted.  Only
    # finite p-values are corrected: a single NaN p-value (e.g. from a
    # degenerate fit) would otherwise turn every adjusted p-value into NaN.
    p_values = metric_stats["p_value"].to_numpy(dtype=float)
    testable = np.isfinite(p_values)
    adjusted = np.full(p_values.shape, np.nan)
    if testable.any():
        adjusted[testable] = multipletests(p_values[testable], method="fdr_bh")[1]
    metric_stats["adjusted_p_value"] = adjusted
    climbers_vs_bjj_results[metric_type] = metric_stats


--- Starting Univariate Group Comparisons (ANCOVA) ---

--- Analyzing Metric: gm_vol ---
  Performing Climbing vs. Bjj comparison for gm_vol...

--- Analyzing Metric: wm_vol ---
  Performing Climbing vs. Bjj comparison for wm_vol...

--- Analyzing Metric: csf_vol ---
  Performing Climbing vs. Bjj comparison for csf_vol...


  F /= J



--- Analyzing Metric: adc ---
  Performing Climbing vs. Bjj comparison for adc...

--- Analyzing Metric: fa ---
  Performing Climbing vs. Bjj comparison for fa...

--- Analyzing Metric: ad ---
  Performing Climbing vs. Bjj comparison for ad...

--- Analyzing Metric: rd ---
  Performing Climbing vs. Bjj comparison for rd...


In [103]:
# Athletes-vs-controls statistics for grey-matter volume, smallest raw p-value first.
athletes_vs_naive_results["gm_vol"].sort_values("p_value")

Unnamed: 0,index,name,base_name,Label Name,network,component,hemisphere,F_statistic,p_value,True_Mean,False_Mean,coefficient,std_err,t_statistic,adjusted_p_value
61,62,7Networks_LH_SomMot_31,7networks_lh_sommot,7Networks_LH_SomMot,somatomotor,somatomotor,L,13.927372,0.000197,376.550788,350.848214,18.350156,4.917056,3.731940,0.053622
268,269,7Networks_RH_SomMot_39,7networks_rh_sommot,7Networks_RH_SomMot,somatomotor,somatomotor,R,13.587396,0.000236,682.926689,628.759089,37.000331,10.037776,3.686108,0.053622
447,448,lAMY-lh,lAMY,"Amygdala, lateral nucleus",subcortex,Amygdala,L,11.970876,0.000556,1007.734574,954.052760,22.701342,6.561280,3.459895,0.084183
422,423,THA-DP-rh,THA-DP,"Thalamus, dorso-posterior part",subcortex,Thalamus,R,9.753954,0.001825,877.630167,836.721772,27.892002,8.930779,3.123132,0.207169
262,263,7Networks_RH_SomMot_33,7networks_rh_sommot,7Networks_RH_SomMot,somatomotor,somatomotor,R,8.023007,0.004683,384.050325,360.732872,15.335571,5.414163,2.832491,0.311026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,287,7Networks_RH_DorsAttn_Post_17,7networks_rh_dorsattn_post,7Networks_RH_DorsAttn_Post,dorsal attention,posterior,R,0.000233,0.987821,1793.487135,1743.588830,-0.328597,21.522707,-0.015267,0.995519
193,194,7Networks_LH_Default_pCunPCC_5,7networks_lh_default_pcunpcc,7Networks_LH_Default_pCunPCC,default,precuneus posterior cingulate cortex,L,0.000165,0.989764,1051.303169,1017.800130,0.202331,15.768884,0.012831,0.995519
398,399,7Networks_RH_Default_pCunPCC_8,7networks_rh_default_pcunpcc,7Networks_RH_Default_pCunPCC,default,precuneus posterior cingulate cortex,R,0.000066,0.993536,707.296795,684.295119,-0.077928,9.616747,-0.008103,0.995519
362,363,7Networks_RH_Default_Par_2,7networks_rh_default_par,7Networks_RH_Default_Par,default,parietal,R,0.000044,0.994720,1957.817965,1888.177012,-0.158181,23.898424,-0.006619,0.995519


In [104]:
import numpy as np
import pandas as pd
import nibabel as nb
from nilearn import surface, datasets
from surfplot import Plot
from matplotlib.colors import TwoSlopeNorm  # nice diverging colours
from neuromaps.datasets import fetch_fslr
from brainspace.datasets import load_parcellation


# Volumetric atlas image.  It is loaded here but not referenced by the surface
# plotting cells shown in this notebook section.
atlas_img = nib.load(nifti_matlab)

# ---------------------------------------------------------------------
# 2.  FETCH A STANDARD SURFACE  (fsLR template via neuromaps, default density)
# ---------------------------------------------------------------------
surfaces = fetch_fslr()
lh, rh = surfaces["veryinflated"]


# ---------------------------------------------------------------------
# 3.  LOAD THE SCHAEFER PARCELLATION ON THE SURFACE
#     Precomputed per-vertex labels from brainspace; nothing is resampled
#     from the volumetric atlas here.  The scale is taken from the ATLAS
#     name so the surface labels follow the configured atlas.
#     NOTE(review): presumably vertex-matched to the fsLR surface fetched
#     above -- TODO confirm.
# ---------------------------------------------------------------------
lh_parc, rh_parc = load_parcellation("schaefer", scale=int(ATLAS.split("_")[1]))

In [105]:
# ---------------------------------------------------------------------
# 4.  MAP REGION IDs → METRIC VALUES
#     Parcels that fail the thresholds, and vertex labels with no entry in
#     the map, get NaN.
# ---------------------------------------------------------------------
p_threshold = 0.05
value_threshold = 0

# Cortical parcels: region id -> value.  Parcels with an id above the atlas'
# parcel count are collected separately as subcortical records.
value_map = {}
value_map_subcortex = {"region": [], "value": [], "Hemisphere": []}

key = "gm_vol"  # metric to visualise

vis_df = athletes_vs_naive_results[key].copy()
p_column = "adjusted_p_value"
value_column = "t_statistic"

for i, row in vis_df.iterrows():
    label = row[region_col]
    # Keep the statistic only when it passes both thresholds.
    passes = (row[p_column] < p_threshold) and (np.abs(row[value_column]) > value_threshold)
    value = row[value_column] if passes else np.nan
    if "schaefer" not in ATLAS:
        continue
    if label > int(ATLAS.split("_")[1]):
        value_map_subcortex["region"].append(label)
        value_map_subcortex["value"].append(value)
        value_map_subcortex["Hemisphere"].append(row["hemisphere"])
    else:
        value_map[label] = value


def _lookup_value(parcel_id):
    """Return the mapped value for a vertex label, NaN when it has none."""
    return value_map.get(parcel_id, np.nan)


# Project the per-parcel values onto the per-vertex label arrays.
vec = np.vectorize(_lookup_value)
data_lh = vec(lh_parc)
data_rh = vec(rh_parc)

In [None]:
# Symmetric colour range for the plotted statistic
vmin = -3
vmax = 3

# One figure per hemisphere.  Each hemisphere's surface is handed to surfplot
# under its own keyword (surf_lh / surf_rh) and only that hemisphere's values
# and parcel labels are added as layers, so the right-hemisphere figure shows
# right-hemisphere data.  The loop variables are named so that they do not
# overwrite `key`, which holds the metric being visualised.
for hemi, hemi_name, hemi_surf, hemi_values, hemi_parc in (
    ("L", "left", lh, data_lh, lh_parc),
    ("R", "right", rh, data_rh, rh_parc),
):
    # ---------------------------------------------------------------------
    # 5.  BUILD THE PLOT
    # ---------------------------------------------------------------------
    surf_kwargs = {"surf_lh": hemi_surf} if hemi_name == "left" else {"surf_rh": hemi_surf}
    p = Plot(
        **surf_kwargs,
        size=(800, 300),  # px; change as needed
        zoom=1.6,
        layout="row",
        mirror_views=True,
        views=["medial", "lateral"],
    )

    # ---- main data layer -------------------------------------------------
    p.add_layer(
        {hemi_name: hemi_values},
        cmap="coolwarm",
        color_range=(vmin, vmax),
        cbar_label=value_column,
    )
    # ---- outline layer ---------------------------------------------------
    # Re-use the label array; surfplot draws parcel borders when as_outline=True
    p.add_layer({hemi_name: hemi_parc}, cmap="gray", as_outline=True, cbar=False)

    fig = p.build()

    # save figure
    # savefig_nice(fig, OUTPUT_DIR / f"fig2_{key}_{hemi}_{value_column}.png", dpi=400)