In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.decomposition import PCA
from pathlib import Path


# Output locations for this analysis. The results directory is a fixed path;
# swap in the commented line below to write under the current directory.
# results_dir = os.getcwd()
results_dir = "/net/trapnell/vol1/home/mdcolon/proj/morphseq/results/mcolon/20251013"
data_dir, plot_dir = (
    os.path.join(results_dir, subdir) for subdir in ("data", "plots")
)

print(f"Results directory: {results_dir}")
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)


# Repository root: use MORPHSEQ_REPO_ROOT when it is set, otherwise fall back
# to the default checkout location.
morphseq_root = os.environ.get(
    'MORPHSEQ_REPO_ROOT',
    "/net/trapnell/vol1/home/mdcolon/proj/morphseq",
)

print(f"MORPHSEQ_REPO_ROOT: {morphseq_root}")
# Every later cell runs with the repository root as the working directory
# (presumably so repo-relative imports/paths resolve -- confirm).
os.chdir(morphseq_root)

# from src.functions.embryo_df_performance_metrics import *
# from src.functions.spline_morph_spline_metrics import *

# Experiment IDs to load, grouped into the named lists below.
WT_experiments = ["20230615", "20230531", "20230525", "20250912"]

b9d2_experiments = ["20250519", "20250520"]

cep290_experiments = ["20250305", "20250416", "20250512", "20250515_part2", "20250519"]

tmem67_experiments = ["20250711"]

# "20250519" is listed under both b9d2 and cep290. Concatenating the lists
# naively would load that experiment's file twice and duplicate all of its
# rows, so drop repeats while keeping first-seen order.
experiments = list(dict.fromkeys(
    WT_experiments + b9d2_experiments + cep290_experiments + tmem67_experiments
))

# Directory holding the per-experiment "df03_final_output_with_latents_*.csv" tables.
build06_dir = "/net/trapnell/vol1/home/mdcolon/proj/morphseq/morphseq_playground/metadata/build06_output"

# Load each experiment's table, tagging rows with the experiment they came
# from. Loading is best-effort: an experiment that cannot be read is reported
# and skipped so the remaining ones can still be analysed.
dfs = []
for exp in experiments:
    file_path = f"{build06_dir}/df03_final_output_with_latents_{exp}.csv"
    try:
        df = pd.read_csv(file_path)
        df['source_experiment'] = exp
        print(df['genotype'].value_counts())
    except FileNotFoundError:
        print(f"Missing: {exp}")
        continue
    except Exception as err:
        # Not a missing file (e.g. unreadable CSV or no 'genotype' column):
        # still skip, but report the real cause instead of calling it missing.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the loop.
        print(f"Skipping {exp}: {type(err).__name__}: {err}")
        continue
    dfs.append(df)
    print(f"Loaded {exp}: {len(df)} rows")

# Combine all data
if not dfs:
    # pd.concat([]) would raise ValueError too, but without saying why.
    raise ValueError(f"No experiment tables could be loaded from {build06_dir}")
combined_df = pd.concat(dfs, ignore_index=True)
print(f"\nTotal: {len(combined_df)} rows from {len(dfs)} experiments")



Results directory: /net/trapnell/vol1/home/mdcolon/proj/morphseq/results/mcolon/20251013
MORPHSEQ_REPO_ROOT: /net/trapnell/vol1/home/mdcolon/proj/morphseq


In [None]:
def bin_by_embryo_time(
    df,
    time_col="predicted_stage_hpf",
    z_cols=None,
    bin_width=2.0,
    suffix="_binned"
):
    """
    Bin VAE embeddings by predicted time and embryo.

    Always averages embeddings per embryo_id × time_bin, then attaches the
    remaining (non-latent) columns, taking the first row seen for each embryo.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing 'embryo_id', `time_col`, and latent columns.
        It is not modified.
    time_col : str
        Column name to bin by.
    z_cols : sequence of str or None
        Columns to average. If None, auto-detect those containing 'z_mu_b'.
    bin_width : float
        Width of time bins (same units as time_col). Must be positive.
    suffix : str
        Suffix to append to averaged latent column names.

    Returns
    -------
    pd.DataFrame
        One row per (embryo_id, time_bin), sorted by those two columns, with
        columns: 'embryo_id', 'time_bin', the averaged latent columns (renamed
        with `suffix`), then the per-embryo metadata columns. `time_col` itself
        is not included. 'time_bin' holds the left edge of the bin: integers
        when `bin_width` is a whole number, floats otherwise.

    Raises
    ------
    ValueError
        If `bin_width` is not positive, if no latent columns are found, or if
        `time_col` contains missing values.
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be positive.")

    df = df.copy()

    # detect latent columns
    if z_cols is None:
        z_cols = [c for c in df.columns if "z_mu_b" in c]
        if not z_cols:
            raise ValueError("No latent columns found matching pattern 'z_mu_b'.")
    else:
        # accept any sequence (tuple, Index, ...), not only lists
        z_cols = list(z_cols)

    if df[time_col].isna().any():
        raise ValueError(f"Column '{time_col}' contains missing values; cannot assign time bins.")

    # create time bins (left edge of each bin); only cast to int when the bin
    # edges are whole numbers, otherwise fractional widths would be truncated
    # and neighbouring bins silently merged
    time_bin = np.floor(df[time_col] / bin_width) * bin_width
    if float(bin_width).is_integer():
        time_bin = time_bin.astype(int)
    df["time_bin"] = time_bin

    # average latent vectors per embryo × time_bin
    agg = (
        df.groupby(["embryo_id", "time_bin"], as_index=False)[z_cols]
        .mean()
        .rename(columns={c: f"{c}{suffix}" for c in z_cols})
    )

    # per-embryo metadata (first row per embryo). 'time_bin' must be excluded:
    # it already lives in `agg`, and a second copy would make the merge emit
    # time_bin_x / time_bin_y and break the sort below.
    excluded = set(z_cols) | {time_col, "time_bin"}
    meta_cols = [c for c in df.columns if c not in excluded]
    meta_df = df[meta_cols].drop_duplicates(subset=["embryo_id"])

    # merge metadata back in
    out = agg.merge(meta_df, on="embryo_id", how="left")

    # ensure sorting
    out = out.sort_values(["embryo_id", "time_bin"]).reset_index(drop=True)

    return out


def get_z_columns(df, z_cols=None, suffix="_binned"):
    """
    Resolve which columns of `df` hold the latent (embedding) values.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe whose column names are inspected.
    z_cols : list or None
        Explicit column list; returned as-is when given and non-empty.
        When None, every column whose name ends with `suffix` or contains
        'z_mu_b' is selected, in dataframe column order.
    suffix : str
        Name ending that marks a latent column (default '_binned').

    Returns
    -------
    list
        Names of latent columns.

    Raises
    ------
    ValueError
        If the resulting selection is empty.
    """
    selected = z_cols
    if selected is None:
        selected = []
        for name in df.columns:
            if name.endswith(suffix) or "z_mu_b" in name:
                selected.append(name)

    if selected:
        return selected
    raise ValueError("No latent columns detected for analysis.")



In [None]:
# -- helper stats --
from itertools import combinations
from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd
def energy_distance(X, Y):
    """
    Energy-distance statistic between two samples.

    Computes 2*mean(d(X, Y)) - mean(d(X, X)) - mean(d(Y, Y)), where each
    d(., .) is the full pairwise distance matrix from `cdist` (default
    metric) and the mean runs over all of its entries, self-pairs included.

    Parameters
    ----------
    X, Y : array-like, one row per observation
        Samples to compare; they must have the same number of columns.

    Returns
    -------
    float
        The statistic described above.
    """
    def _mean_pairwise(A, B):
        # average over every entry of the A-vs-B distance matrix
        return cdist(A, B).mean()

    between = _mean_pairwise(X, Y)
    within_x = _mean_pairwise(X, X)
    within_y = _mean_pairwise(Y, Y)
    return 2*between - within_x - within_y

def energy_perm_test(X, Y, n_perm=500, rng=None):
    """
    Permutation test built on `energy_distance`.

    The rows of X and Y are pooled and reshuffled `n_perm` times; each
    reshuffle is split back into groups of the original sizes and the
    statistic is recomputed on that split.

    Parameters
    ----------
    X, Y : array-like, one row per observation
        The two samples.
    n_perm : int
        Number of reshuffles.
    rng : None, int, or numpy Generator
        Passed to `np.random.default_rng`.

    Returns
    -------
    tuple
        (observed statistic, p-value), where the p-value is
        (#reshuffled statistics >= observed + 1) / (n_perm + 1).
    """
    generator = np.random.default_rng(rng)
    observed = energy_distance(X, Y)
    pooled = np.vstack([X, Y])  # fresh array, so shuffling never touches X or Y
    split = len(X)

    null_stats = np.empty(n_perm)
    for i in range(n_perm):
        # in-place row shuffle; each iteration reshuffles the previous order
        generator.shuffle(pooled)
        null_stats[i] = energy_distance(pooled[:split], pooled[split:])

    n_as_extreme = np.sum(null_stats >= observed)
    p_value = (n_as_extreme + 1) / (n_perm + 1)
    return observed, p_value

def hotellings_T2(X, Y):
    """
    Two-sample Hotelling-style T^2 statistic using Ledoit-Wolf covariances.

    Each sample's covariance is estimated with `LedoitWolf`, the two are
    pooled with weights (n-1) and (m-1), and the pooled matrix is inverted
    with the pseudo-inverse. Only the statistic is returned; no p-value is
    computed here.

    Parameters
    ----------
    X, Y : array-like, one row per observation
        The two samples; they must have the same number of columns.

    Returns
    -------
    float
        (n*m)/(n+m) * d^T pinv(S_pooled) d, with d the difference of the
        column means and n, m the sample sizes.
    """
    n_x = len(X)
    n_y = len(Y)
    delta = X.mean(0) - Y.mean(0)

    cov_x = LedoitWolf().fit(X).covariance_
    cov_y = LedoitWolf().fit(Y).covariance_
    pooled_cov = ((n_x - 1) * cov_x + (n_y - 1) * cov_y) / (n_x + n_y - 2)

    # pseudo-inverse rather than inv, so a singular pooled matrix is tolerated
    pooled_inv = np.linalg.pinv(pooled_cov)

    scale = (n_x * n_y) / (n_x + n_y)
    quad_form = float(delta @ pooled_inv @ delta)
    return scale * quad_form
