In [10]:
import pandas as pd
import pathlib
import ipytest
ipytest.autoconfig()
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os
import sys 
sys.path.append("/net/data.isilon/ag-cherrmann/nschmidt/project/parse_xml_for_VAE/xml_data/")
sys.path.append("/net/data.isilon/ag-cherrmann/nschmidt/project/parse_xml_for_VAE/xml_data_t/")
import numpy as np

In [11]:
# The two directories that hold the CSV exports used throughout this notebook.
path_to_xml = "/net/data.isilon/ag-cherrmann/nschmidt/project/parse_xml_for_VAE/xml_data/"
path_to_xml_t = "/net/data.isilon/ag-cherrmann/nschmidt/project/parse_xml_for_VAE/xml_data_t/"

# Collect every "*.csv" file located directly inside each directory
# (non-recursive; the order is whatever the filesystem yields).
xml_dir = pathlib.Path(path_to_xml)
xml_dir_t = pathlib.Path(path_to_xml_t)
paths = list(xml_dir.glob("*.csv"))
paths_t = list(xml_dir_t.glob("*.csv"))


In [12]:
def plot_distribution(paths: list, measurement: str, height: int = 1200):
    """Show one row of per-ROI box plots for every CSV file in ``paths``.

    Each CSV is read with its first column as the index, filtered to the rows
    whose ``'Volume'`` column equals ``measurement``, and every column after
    the first one is drawn as a box (with all individual points shown) in the
    subplot row that belongs to that file.

    Args:
        paths: CSV file paths (anything ``pd.read_csv`` accepts). Must not be
            empty.
        measurement: Value of the ``'Volume'`` column to keep, e.g. ``"Vgm"``.
        height: Total figure height in pixels; defaults to the previously
            hard-coded 1200.

    Raises:
        ValueError: If ``paths`` is empty (a figure needs at least one row).
    """
    if not paths:
        # make_subplots(rows=0) fails with a far less helpful message.
        raise ValueError("plot_distribution needs at least one CSV path, got none")

    total_fig = make_subplots(
        rows=len(paths),
        cols=1,
        # Label every row with its file name so the rows can be told apart.
        subplot_titles=[pathlib.Path(str(path)).name for path in paths],
    )

    for row, path in enumerate(paths, start=1):
        df = pd.read_csv(path, index_col=[0])
        df = df.loc[df['Volume'] == measurement]
        # Everything after the first column is treated as an ROI column
        # (presumably the first column is 'Volume' -- TODO confirm).
        rois = df.columns[1:]

        for roi in rois:
            # Traces go straight into the combined figure. The original built a
            # throw-away figure per file whose layout (boxmode='group') was
            # never transferred, so only the trace settings are kept here.
            total_fig.add_trace(
                go.Box(y=df[roi], name=roi, boxpoints='all', jitter=.3),
                row=row,
                col=1,
            )

    total_fig.update_layout(height=height)
    total_fig.show()

# Draw the "Vgm" distributions for every CSV collected in paths_t.
plot_distribution(paths=paths_t, measurement="Vgm")


In [13]:
# One box plot per ROI column of Aggregated_suit_t.csv, laid out side by side.
path = "/net/data.isilon/ag-cherrmann/nschmidt/project/parse_xml_for_VAE/xml_data_t/Aggregated_suit_t.csv"
df = pd.read_csv(path, index_col=[0])

# The first two columns of the loaded frame are skipped; the rest are plotted.
rois = df.columns[2:]

fig = make_subplots(rows=1, cols=len(rois))
for i, roi in enumerate(rois):
    roi_box = go.Box(y=df[roi], name=roi)
    # Subplot columns are 1-based, the enumeration index is 0-based.
    fig.add_trace(roi_box, row=1, col=i + 1)

# Show every individual data point next to its box.
fig.update_traces(boxpoints='all', jitter=.3)

# The bare expression displays the figure as the cell output.
fig

In [14]:
def analyze_df(df: pd.DataFrame) -> dict:
    """Compute summary statistics for every column of ``df`` except the first.

    Args:
        df: Frame whose first column is skipped; all remaining columns are
            summarised.

    Returns:
        A dict keyed by column name. Each value is a dict with the keys
        ``"mean"``, ``"median"``, ``"quantiles"`` (the 0.25 / 0.5 / 0.75
        quantiles as a list), ``"min"``, ``"max"`` and ``"var"``.
    """

    def _summarise(values: pd.Series) -> dict:
        # Key order matches the order in which the statistics are listed above.
        return {
            "mean": values.mean(),
            "median": values.median(),
            "quantiles": values.quantile(q=[0.25, 0.5, 0.75]).to_list(),
            "min": values.min(),
            "max": values.max(),
            "var": values.var(),
        }

    return {roi: _summarise(df[roi]) for roi in df.columns[1:]}

def analyze_features(paths: list) -> dict:
    """Summarise the ROI columns of every CSV in ``paths`` per measurement.

    Each file is read with its first column as the index. For each of the
    measurements ``"Vgm"`` and ``"Vwm"`` the rows whose ``'Volume'`` column
    equals that measurement are passed to ``analyze_df``; measurements with no
    matching rows are left out of the result.

    Args:
        paths: CSV file paths (strings or ``pathlib.Path`` objects).

    Returns:
        ``{str(path): {measurement: analyze_df(...) result}}``. A file without
        any matching rows maps to an empty dict.
    """
    all_data = {}
    # Iterate over the argument. The original looped over the global `paths_t`,
    # silently ignoring whatever the caller passed in.
    for path in paths:
        df = pd.read_csv(path, index_col=[0])

        per_measurement = {}
        for measurement in ["Vgm", "Vwm"]:
            df_m = df.loc[df['Volume'] == measurement]

            if df_m.shape[0] == 0:
                continue

            per_measurement[measurement] = analyze_df(df_m)

        # Keyed by the string form of the path so callers can look results up
        # with a plain string.
        all_data[str(path)] = per_measurement
    return all_data

In [15]:
all_data = analyze_features(paths=paths_t)

# Pick the file and measurement whose ROI variances should be ranked.
atlas = "suit"
measurement = "Vgm"
path = f"/net/data.isilon/ag-cherrmann/nschmidt/project/parse_xml_for_VAE/xml_data_t/Aggregated_{atlas}_t.csv"

# all_data is keyed by the string form of each CSV path, then by measurement.
roi_stats = all_data[path][measurement]
rois = list(roi_stats)
variances = [roi_stats[name]["var"] for name in rois]

# Order the ROI names from lowest to highest variance.
sort_index = np.argsort(variances)
sorted_rois = [rois[position] for position in sort_index]

print(f"Variances: {variances}")
print(f"ROIs: {rois}")
print(f"Sorted ROIs (ascending): {sorted_rois}")

Variances: [0.07415033891386619, 0.0051135801710090805, 0.2033034992976461, 0.00982183085930438, 0.47657264938041105, 0.0007869548466094154, 0.12837874484198744, 0.003011577537953975, 1.960948880166815e-06, 0.5688185428455882, 0.055180145744071915, 0.0011531533936690623, 0.024900710056218977, 0.011529476001349532, 1.7280330555198054e-06, 0.03178970355642277, 0.041009108221522025, 0.01153491037995968, 0.021014910775747464, 0.09719599083564273, 0.0010406121860175555, 0.01167237129490256, 0.0664321720735913, 0.0017230048156997944, 0.05921800162576032, 0.0007711174948790572, 6.736678881600844e-05, 0.00012582634955519457]
ROIs: ['Left I IV', 'Right I IV', 'Left V', 'Right V', 'Left VI', 'Vermis VI', 'Right VI', 'Left CrusI', 'Vermis CrusI', 'Right CrusI', 'Left CrusII', 'Vermis CrusII', 'Right CrusII', 'Left VIIb', 'Vermis VIIb', 'Right VIIb', 'Left VIIIa', 'Vermis VIIIa', 'Right VIIIa', 'Left VIIIb', 'Vermis VIIIb', 'Right VIIIb', 'Left IX', 'Vermis IX', 'Right IX', 'Left X', 'Vermis X', '

In [None]:
import pandas as pd
import numpy as np 

def _z_scale_rows(block: pd.DataFrame) -> pd.DataFrame:
    """Z-transform every row of ``block``; rows without a positive std become 0."""
    row_mean = block.mean(axis=1)
    row_std = block.std(axis=1)
    scaled = block.sub(row_mean, axis=0).div(row_std, axis=0)
    # A std of 0 (constant row) or NaN (fewer than two values) cannot be used
    # as a divisor; such rows are set to 0 instead of NaN/inf.
    flat_rows = ~(row_std > 0).to_numpy()
    scaled.loc[flat_rows, :] = 0.0
    return scaled


def normalize_and_scale_df(df: pd.DataFrame) -> pd.DataFrame:
    """Log-normalize each column, then z-scale each row.

    Step 1 (per column): ``ln(10000 * value / column_sum + 1)``.
    Step 2 (per row): z-transformation ``(x - mean) / std``. When the columns
    form a MultiIndex, the z-transformation is done separately for every value
    of the last column level, so only columns of the same type are compared
    with each other; with flat columns all columns are scaled together. Rows
    whose standard deviation is not positive are set to 0.

    According to the original author's notes the columns hold patient volumes
    and the rows are ROIs.

    Previously the z-scaling was only printed for debugging and the function
    returned the merely log-normalized frame.

    Args:
        df: Numeric frame to transform. It is not modified.

    Returns:
        A new frame with the same index and columns as ``df``.
    """
    column_sums = df.sum()

    # NOTE(review): a column whose sum is 0 yields NaN/inf here -- confirm that
    # such columns cannot occur in the input.
    transformed = np.log1p((10000 * df) / column_sums)

    n_cols = transformed.shape[1]
    if n_cols == 0:
        return transformed

    if isinstance(transformed.columns, pd.MultiIndex):
        # One group of column positions per distinct label of the last level.
        last_level = transformed.columns.get_level_values(-1)
        groups = [np.flatnonzero(last_level == label) for label in last_level.unique()]
    else:
        groups = [np.arange(n_cols)]

    scaled = transformed.copy()
    for positions in groups:
        # Positional assignment keeps the original column order and is not
        # confused by duplicate labels.
        scaled.iloc[:, positions] = _z_scale_rows(transformed.iloc[:, positions]).to_numpy()
    return scaled

def z_scale(row):
    """Z-transform a single row: ``(row - mean) / std``.

    Args:
        row: A ``pd.Series`` of numeric values.

    Returns:
        A Series with the same index as ``row``. When the standard deviation
        is not positive (constant row, or NaN because there are fewer than two
        values) a Series of zeros is returned.
    """
    sd = row.std()
    mean = row.mean()
    if sd > 0:
        return (row - mean) / sd
    # Keep the labels of `row`: a default RangeIndex would not line up with the
    # column labels when this is used via DataFrame.apply(axis="columns").
    return pd.Series(0.0, index=row.index)



# Load the aggregated thalamic-nuclei table; its first two rows form a
# two-level column header and its first column is the row index.
path_to_csv = "/net/data.isilon/ag-cherrmann/nschmidt/project/parse_xml_for_VAE/xml_data/Aggregated_thalamic_nuclei.csv"
df = pd.read_csv(
    path_to_csv,
    index_col=0,
    header=[0, 1],
)

# Run the normalization and show the result as the cell output.
df_norm = normalize_and_scale_df(df)
df_norm


In [30]:
# Demo: add a new top-level column group to a DataFrame that has a two-level
# (Filename, Volume) column header.

# Seeded generator so the printed numbers are the same on every run.
rng = np.random.default_rng(seed=0)
# A flat list gives an ordinary row index; the previous nested list
# ([["a", ...]]) built a one-level MultiIndex by accident.
row_labels = ["a", "b", "c", "d", "e"]

data = rng.random((5, 4))
columns = pd.MultiIndex.from_tuples([('A', 'val1'), ('A', 'val2'), ('B', 'val1'), ('B', 'val2')], names = ["Filename", "Volume"])
df_multi = pd.DataFrame(data, columns=columns, index=row_labels)

data_new = rng.random((5, 1))
col_new = pd.MultiIndex.from_tuples([('C', 'val1')], names = ["Filename", "Volume"])
df_new = pd.DataFrame(data_new, columns=col_new, index=row_labels)

# Only append the new group if its top-level name ("Filename") is not present yet.
if df_new.columns[0][0] not in df_multi.columns.get_level_values("Filename"):
    df_concat = pd.concat([df_multi, df_new], axis = 1, join="inner")
    print(df_concat)




Filename         A                   B                   C
Volume        val1      val2      val1      val2      val1
a         0.165125  0.242904  0.806972  0.546446  0.535031
b         0.448052  0.878168  0.871098  0.427538  0.754350
c         0.604342  0.619856  0.212962  0.391066  0.944979
d         0.629539  0.630045  0.168249  0.276480  0.003716
e         0.958453  0.482889  0.850215  0.169090  0.940190
