In [25]:
import pathlib 
import os
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots

In [12]:
# Directory that contains the metadata CSV files loaded below.
folder_to_metadata = (
    "/net/data.isilon/ag-cherrmann/nschmidt/project/"
    "parse_xml_for_VAE/metadata_20250110"
)

In [13]:
def combine_dfs(paths: list):
    """Load several CSV files and stack their rows into one pandas DataFrame.

    Every file is read with row 0 as the header and the first column as the
    index. The frames are concatenated row-wise with ``join="inner"``, so only
    the columns shared by all files are kept. The row index of each file is
    carried over unchanged, which means index labels may repeat in the result.

    Parameters
    ----------
    paths : list
        Paths of the CSV files to combine, in the order their rows should
        appear in the result. A single path is allowed and simply returns
        that file's contents.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, restricted to shared columns.

    Raises
    ------
    ValueError
        If ``paths`` is empty.
    """
    if len(paths) == 0:
        raise ValueError("combine_dfs needs at least one path, got an empty list.")

    # Read everything first and concatenate once instead of growing the frame
    # inside a loop; this also makes the single-file case work.
    frames = [pd.read_csv(path, header=[0], index_col=0) for path in paths]

    # join="inner" keeps only the column labels present in every frame.
    return pd.concat(frames, join="inner")

In [14]:
# Full paths of the metadata CSV files that get merged into one table.
path_to_metadata = [
    f"{folder_to_metadata}/{csv_name}"
    for csv_name in (
        "full_data_train_valid_test.csv",
        "meta_data_NSS_all_variables.csv",
        "meta_data_whiteCAT_all_variables.csv",
    )
]

In [18]:
# Merge all metadata CSV files into one DataFrame and display it.
metadata_df = combine_dfs(path_to_metadata)
metadata_df

Unnamed: 0_level_0,Filename,Dataset,Diagnosis,Age,Sex,Usage_original,Sex_int
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,IXI426-IOP-1011-T1,IXI,HC,41.200000,Female,training,0
1,IXI571-IOP-1154-T1,IXI,HC,56.600000,Female,training,0
2,IXI170-Guys-0843-T1,IXI,HC,50.200000,Female,training,0
3,IXI054-Guys-0707-T1,IXI,HC,60.800000,Female,training,0
4,IXI196-Guys-0805-T1,IXI,HC,47.800000,Female,training,0
...,...,...,...,...,...,...,...
157,sub-whiteCAT130_ses-01_T1w,whiteCAT,SCHZ,27.167693,Female,testing,0
158,sub-whiteCAT132_ses-01_T1w,whiteCAT,CTT,18.817248,Female,testing,0
159,sub-whiteCAT133_ses-01_T1w,whiteCAT,SCHZ,59.737166,Female,testing,0
160,sub-whiteCAT134_ses-01_T1w,whiteCAT,CTT,22.729637,Female,testing,0


In [24]:
# Histogram restricted to the rows whose Age is at most 5.
low_age_mask = metadata_df["Age"] <= 5
fig = px.histogram(metadata_df.loc[low_age_mask], x="Age")
fig.show()

In [37]:
# Age distribution per diagnosis: one histogram per subplot row.
conditions = ["HC", "MDD", "SCHZ", "CTT"]

total_fig = make_subplots(rows=4, cols=1, subplot_titles=conditions)
total_fig.update_layout(height=800)

for row, cond in enumerate(conditions, start=1):
    cond_rows = metadata_df[metadata_df["Diagnosis"] == cond]
    fig = px.histogram(data_frame=cond_rows, x="Age", title=cond)
    # Only the histogram trace is reused; it is placed in this diagnosis' row.
    total_fig.add_trace(fig.data[0], row=row, col=1)
    # Identical axis ranges in every panel keep the subplots comparable.
    total_fig.update_yaxes(range=[0.0, 300.0], row=row, col=1)
    total_fig.update_xaxes(range=[0.0, 90.0], row=row, col=1)

total_fig.show()

In [None]:
# Sanity check: list every row with Age == 0 that does NOT belong to the NU dataset.
# An empty result confirms that NU is the only dataset with Age == 0
# (Age == 0 appears to stand for missing age information).
metadata_df[(metadata_df["Age"] == 0) & (metadata_df["Dataset"] != "NU")]

Unnamed: 0_level_0,Filename,Dataset,Diagnosis,Age,Sex,Usage_original,Sex_int
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [None]:
# Sex distribution per diagnosis: one bar chart per subplot column.
conditions = ["HC", "MDD", "SCHZ", "CTT"]

total_fig_sexes = make_subplots(rows=1, cols=len(conditions), subplot_titles=conditions)
total_fig_sexes.update_layout(height=800)

for idx, cond in enumerate(conditions):
    # Count the subjects per sex for this diagnosis so every bar is one total.
    sex_counts = metadata_df.loc[metadata_df["Diagnosis"] == cond, "Sex"].value_counts()
    # plotly.express has no `barplot`; the bar chart constructor is `px.bar`.
    fig = px.bar(x=sex_counts.index, y=sex_counts.values, title=cond)
    # The grid is 1 row x 4 columns, so the diagnosis selects the column.
    for trace in fig.data:
        total_fig_sexes.add_trace(trace, row=1, col=idx + 1)

# Show the figure built in this cell (not the age figure from the earlier cell).
total_fig_sexes.show()