In [1]:
"""
Generate descriptive SNBB figures that match the slide template colours.
--------------------------------------------------------------------------

Prereqs:
    pip install matplotlib pandas numpy seaborn
    (seaborn only used for its convenient styling helper)

Replace CSV_PATH if your file lives elsewhere.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from matplotlib import font_manager as fm

In [2]:
SOFTWARE = "mrtrix3"
ATLAS = "schaefer2018tian2020_400_7"
RECONSTRUCTION = "SDStream"
SCALE = "None"
WEIGHT = "SIFT2"


DISTRIBUTION_METRIC = "qfmean"
DIFFUSION_MEASURES = ["adc", "fa", "rd", "ad"]
STRUCTURAL_MEASURES = ["gm_vol", "wm_vol", "csf_vol"]
CONNECTOME_MEASURES = [
    "degree",
    "eigenvector_centrality",
    "betweenness_centrality",
    "degree_norm",
    "regional_efficiency",
]

In [3]:
sessions = pd.read_csv("~/Projects/plasticityhub/sessions.csv")
sessions["subject_code"] = (
    sessions["subject_code"]
    .astype(str)
    .str.replace("-", "")
    .str.replace(" ", "")
    .str.replace("_", "")
    .str.zfill(4)
)
sessions.loc[sessions["filled"], "weekly_workouts"] = sessions.loc[
    sessions["filled"], "weekly_workouts"
].fillna(0)

# sessions.loc[sessions["filled"], "weekly_workouts"].fillna(0, inplace=True)
# sessions["weekly_workouts"] = sessions["weekly_workouts"].fillna(0)
sessions["weekly_workouts"] = sessions["weekly_workouts"].replace({"10 שעות ויותר": 10})
sessions.loc[sessions["weekly_workouts"].notna(), "weekly_workouts"] = sessions.loc[
    sessions["weekly_workouts"].notna(), "weekly_workouts"
].astype(int)
sessions.loc[~sessions["filled"], "weekly_workouts"] = np.nan

In [4]:
sessions

Unnamed: 0,subject_code,subject_id,dob,age_at_scan,sex,session_id,study,group,condition,path,...,pet_duration,home_cooking_times_per_week,recreational_activity_times_per_week,working_with_computer_times_per_week,screen_time,situational_phobia,social_phobia,animal_based_phobia,medical_phobia,total_phobia
0,BAL01,312350267,1994-04-25,24.55,F,201811111233,Plasticity,Balance,Control,,...,,,,,,,,,,
1,B001,319123659,1998-04-16,20.57,M,201811111343,Brainbank,Ya_Bb,,,...,,,,,,,,,,
2,BAL01,312350267,1994-04-25,24.55,F,201811111445,Plasticity,Balance,Control,,...,,,,,,,,,,
3,BAL02,307336487,1986-09-25,32.13,M,201811121221,Plasticity,Balance,Learning,,...,,,,,,,,,,
4,BAL02,307336487,1986-09-25,32.13,M,201811121449,Plasticity,Balance,Learning,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4195,BB01502,327557161,2005-03-24,20.49,M,202509191219,Brain Bank,Snbb,,/mnt/62/Raw_Data/20250919_1219,...,3-5 years,0.0,1.0,10.0,1.0,0.0,0.0,2.0,0.0,2.0
4196,YA1297,325840346,2003-12-31,21.72,M,202509191336,Plasticity,Music,Professional,/mnt/62/Raw_Data/20250919_1336,...,5-10 years,1.0,1.0,11.0,2.0,0.0,0.0,0.0,0.0,0.0
4197,GYML35,200459691,1988-02-01,37.64,M,202509211603,Gymnastics,Gymnastics,Learning,,...,,1.0,3.0,7.0,5.0,1.0,2.0,0.0,0.0,3.0
4198,GYML31,301242020,1987-09-13,38.02,F,202509211828,Gymnastics,Gymnastics,Learning,,...,,,,,,,,,,


In [5]:
sessions["weekly_workouts"].quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0])

0.25     0.0
0.50     0.0
0.75     2.0
0.90     4.0
0.95     5.0
0.99     7.0
1.00    14.0
Name: weekly_workouts, dtype: object

In [6]:
MAX_NUMBER_OF_WORKOUTS = 0

# write a query to detect atheletes (study == "sport", condition != "learning")
ATHLETES_QUERY = "(condition in ['Professional', 'Control']) & (group in ['Climbing', 'Bjj'])"
LEARNERS_QUERY = "(condition in ['Learning']) & (group in ['Climbing', 'Bjj'])"

target_subjects = sessions.query(ATHLETES_QUERY)["subject_code"].unique()
learner_subjects = sessions.query(LEARNERS_QUERY)["subject_code"].unique()

# locate subjects that have more than MAX_NUMBER_OF_WORKOUTS
subjects_to_drop = sessions[
    (~sessions["subject_code"].isin(target_subjects))
    & (~sessions["subject_code"].isin(learner_subjects))
    & (
        (sessions["weekly_workouts"] > MAX_NUMBER_OF_WORKOUTS)
        | (sessions["weekly_workouts"].isna())
    )
]["subject_code"].unique()
valid_learners = sessions[
    (sessions["subject_code"].isin(learner_subjects))
    & (sessions["weekly_workouts"] <= MAX_NUMBER_OF_WORKOUTS)
]["subject_code"].unique()

print(f"Target subject (N): {len(target_subjects)}")
print(f"Valid learners (N): {len(valid_learners)}")
print(f"Subjects to drop (N): {len(subjects_to_drop)}")

Target subject (N): 151
Valid learners (N): 113
Subjects to drop (N): 1386


In [7]:
# sessions[~sessions["subject_code"].isin(target_subjects) & (sessions["subject_code"].isin(learner_subjects)) & (sessions["weekly_workouts"] > MAX_NUMBER_OF_WORKOUTS)].shape
# len(subjects_to_drop)

In [8]:
source = Path("/media/storage/phd//athletes_brain/data/interim")
destination = Path("/media/storage/phd//athletes_brain/data/processed")

# unique_measures = DIFFUSION_MEASURES + STRUCTURAL_MEASURES + CONNECTOME_MEASURES
unique_measures = DIFFUSION_MEASURES + STRUCTURAL_MEASURES


results = {m: {} for m in unique_measures}

for m in unique_measures:
    print(f"Processing {m}")

    data = pd.read_csv(source / f"{m}.csv", index_col=0)
    data["subject_code"] = (
        data["subject_code"]
        .astype(str)
        .str.replace("-", "")
        .str.replace(" ", "")
        .str.replace("_", "")
        .str.zfill(4)
    )
    # data = data.drop_duplicates(subset=["subject_code", "index"], keep="last")
    # tag athletes
    data["target"] = data["subject_code"].isin(target_subjects)
    data["learner"] = data["subject_code"].isin(learner_subjects)
    data["cs_valid"] = True
    data.loc[
        (data["learner"]) & (~data["target"]) & (~data["subject_code"].isin(valid_learners)),
        "cs_valid",
    ] = False
    # break
    # drop learners that have more than one session
    # learners = data[data["condition"].isin(["Learning", "Novice Control"]) & (data["study"] == "Sport")]
    # learners = learners[learners.duplicated(subset=["subject_code"], keep=False)]
    # data = data.drop(learners.index)
    # data = data.drop(
    #     data[
    #         (data["condition"].isin(["Learning", "Novice Control"])) & (data["study"] == "Sport")
    #     ].index
    # )
    # data = data.drop(data[(data["target"]) & (~data["group"].isin(["Climbing","Bjj"]))].index)
    # data = data.drop(
    #     data[(data["exercise_frequency"] > MAX_NUMBER_OF_WORKOUTS) & (~data["target"])].index
    # )
    # drop "subjects_to_drop"
    # data = data[~data["subject_code"].isin(subjects_to_drop)]
    data.loc[data["subject_code"].isin(subjects_to_drop), "cs_valid"] = False
    # data[data["exercise_frequency"] < MAX_NUMBER_OF_WORKOUTS, "target"] = False
    data.to_csv(destination / f"{m}.csv")
    print(f"Found {data['subject_code'].nunique()} subjects with {m} data")

Processing adc


  data = pd.read_csv(source / f"{m}.csv", index_col=0)


Found 2778 subjects with adc data
Processing fa


  data = pd.read_csv(source / f"{m}.csv", index_col=0)


Found 2778 subjects with fa data
Processing rd


  data = pd.read_csv(source / f"{m}.csv", index_col=0)


Found 2778 subjects with rd data
Processing ad


  data = pd.read_csv(source / f"{m}.csv", index_col=0)


Found 2778 subjects with ad data
Processing gm_vol


  data = pd.read_csv(source / f"{m}.csv", index_col=0)


Found 2779 subjects with gm_vol data
Processing wm_vol


  data = pd.read_csv(source / f"{m}.csv", index_col=0)


Found 2779 subjects with wm_vol data
Processing csf_vol


  data = pd.read_csv(source / f"{m}.csv", index_col=0)


Found 2779 subjects with csf_vol data


In [9]:
# data["target"] = data["subject_code"].isin(target_subjects)
# data["learner"] = data["subject_code"].isin(learner_subjects)
# data["valid"] = True
# data.loc[data["learner"] & (~data["subject_code"].isin(valid_learners)), "valid"] = False
data.drop_duplicates(subset=["subject_code", "index"], keep="last")["target"].value_counts(
    normalize=True
)
# data["index"].unique()

target
False    0.949622
True     0.050378
Name: proportion, dtype: float64

In [10]:
# data["target"] = data["subject_code"].isin(target_subjects)
# data["learner"] = data["subject_code"].isin(learner_subjects)
# data["valid"] = True
# data.loc[data["learner"] & (~data["subject_code"].isin(valid_learners)), "valid"] = False
data[data["cs_valid"]].drop_duplicates(subset=["subject_code", "index"], keep="last")[
    "target"
].value_counts(normalize=True)
# data["index"].unique()

target
False    0.910141
True     0.089859
Name: proportion, dtype: float64

In [11]:
d = data.drop_duplicates(subset=["session_id"], keep="first")

In [12]:
# d[d["learner"]].drop_duplicates(subset="subject_code")["valid"].value_counts()
d.drop_duplicates(subset="subject_code")["cs_valid"].value_counts()

cs_valid
True     1556
False    1220
Name: count, dtype: int64