In [31]:
import pandas as pd

In [32]:
import yaml

# Path to your YAML file
file_path = "/net/vast-storage.ib.cluster/scratch/vast/lhtsai/rraju/data/moseq_data/moseq2-index.yaml"

# Open and load the YAML file
with open(file_path, "r") as f:
    data = yaml.safe_load(f)

# Inspect the contents
#print(data)

# If you want a prettier view
import pprint
pprint.pprint(data)


{'files': [{'group': 'default',
            'metadata': {'ColorDataType': 'Byte[]',
                         'ColorResolution': [512, 424],
                         'DepthDataType': 'UInt16[]',
                         'DepthResolution': [512, 424],
                         'IsLittleEndian': True,
                         'NidaqChannels': 0,
                         'NidaqSamplingRate': 0.0,
                         'SessionName': 'Mom1_030624',
                         'StartTime': '2024-03-06T11:41:43.2004382-05:00',
                         'SubjectName': 'Mom1_LNB'},
            'path': ['/om2/user/rraju/data/moseq_data/aggregate_results/2024-03-06_11-41-43_mom1_030624_mom1_lnb_results_00.h5',
                     '/om2/user/rraju/data/moseq_data/aggregate_results/2024-03-06_11-41-43_mom1_030624_mom1_lnb_results_00.yaml'],
            'uuid': '9c7c5d54-67eb-49d4-aeb0-2039b53102a1'},
           {'group': 'default',
            'metadata': {'ColorDataType': 'Byte[]',
                

In [33]:
import os, json, re
from pathlib import Path
from dateutil import parser as dtparser

MOSEQ_BASE = Path("/om2/user/rraju/data/moseq_data")

# ---- index the session_* dirs on disk
session_rows = []
for sdir in MOSEQ_BASE.iterdir():
    if sdir.is_dir() and sdir.name.startswith("session_"):
        meta_path = sdir / "metadata.json"
        meta = {}
        if meta_path.exists():
            try:
                meta = json.loads(meta_path.read_text())
            except Exception:
                pass
        session_rows.append({
            "moseq_id": sdir.name,  # e.g., session_20240306113848
            "disk_SessionName": meta.get("SessionName"),
            "disk_SubjectName": meta.get("SubjectName"),
            "disk_StartTime": meta.get("StartTime"),
        })

sessions_df = pd.DataFrame(session_rows)

# ---- extract from YAML (uuid, SessionName, StartTime, and a timestamp from path)
def path_to_digits(paths):
    if not paths: return None
    m = re.search(r'(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})', " ".join(paths))
    if not m: return None
    return (m.group(1)+m.group(2)).replace("-", "")  # YYYYMMDDHHMMSS

def iso_to_digits(s):
    if not s: return None
    try:
        return dtparser.isoparse(s).strftime("%Y%m%d%H%M%S")
    except Exception:
        return None

yaml_rows = []
for f in data.get("files", []):
    meta = f.get("metadata", {}) or {}
    dig_from_path = path_to_digits(f.get("path", []))
    dig_from_iso  = iso_to_digits(meta.get("StartTime"))
    yaml_rows.append({
        "yaml_uuid": f.get("uuid"),
        "yaml_SessionName": meta.get("SessionName"),
        "yaml_SubjectName": meta.get("SubjectName"),
        "yaml_StartTime": meta.get("StartTime"),
        "ts_digits_from_path": dig_from_path,
        "ts_digits_from_iso": dig_from_iso,
    })

yaml_df = pd.DataFrame(yaml_rows)

# sessions_df also needs a ts_digits to align by time
def disk_iso_to_digits(s):
    if not s: return None
    try:
        return dtparser.isoparse(s).strftime("%Y%m%d%H%M%S")
    except Exception:
        return None

sessions_df["ts_digits_disk"] = sessions_df["disk_StartTime"].map(disk_iso_to_digits)

# ---- 3 passes to connect YAML -> session_*
# A) by timestamp from YAML path
m_by_path = yaml_df.merge(
    sessions_df, left_on="ts_digits_from_path", right_on="ts_digits_disk", how="left"
)

# B) fill remaining by StartTime ISO
mask_unmatched = m_by_path["moseq_id"].isna()
m_by_iso = m_by_path.loc[mask_unmatched].drop(columns=["moseq_id","disk_SessionName","disk_SubjectName","disk_StartTime","ts_digits_disk"]).merge(
    sessions_df, left_on="ts_digits_from_iso", right_on="ts_digits_disk", how="left"
)
m_by_path.loc[mask_unmatched, ["moseq_id","disk_SessionName","disk_SubjectName","disk_StartTime","ts_digits_disk"]] = \
    m_by_iso[["moseq_id","disk_SessionName","disk_SubjectName","disk_StartTime","ts_digits_disk"]].values

# C) fill remaining by SessionName exact
mask_unmatched = m_by_path["moseq_id"].isna() & m_by_path["yaml_SessionName"].notna()
m_by_name = m_by_path.loc[mask_unmatched].drop(columns=["moseq_id","disk_SessionName","disk_SubjectName","disk_StartTime","ts_digits_disk"]).merge(
    sessions_df, left_on="yaml_SessionName", right_on="disk_SessionName", how="left"
)
m_by_path.loc[mask_unmatched, ["moseq_id","disk_SessionName","disk_SubjectName","disk_StartTime","ts_digits_disk"]] = \
    m_by_name[["moseq_id","disk_SessionName","disk_SubjectName","disk_StartTime","ts_digits_disk"]].values

# This is our YAML→MoSeq lookup
yaml_to_moseq = m_by_path[[
    "yaml_uuid", "yaml_SessionName", "yaml_SubjectName", "yaml_StartTime", "moseq_id"
]].rename(columns={"yaml_uuid":"uuid"})
print("Built YAML→MoSeq table. Rows:", len(yaml_to_moseq))
print("Matched sessions:", yaml_to_moseq['moseq_id'].notna().sum())


Built YAML→MoSeq table. Rows: 76
Matched sessions: 76


In [34]:
moseq_df = pd.read_csv('../../data/moseq_df.csv')

# Males

In [2]:
males_df = pd.read_csv('../../data/analysis_output/constructed_fingerprints/Males/offspring_PCA25_embedding.csv.gz')

In [3]:
males_df

Unnamed: 0,uuid,category,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25
0,b337bf67-7270-42f5-8b72-78dd050c5145,LNB,-10.488839,-3.002693,2.905568,-1.128349,0.443569,-1.59762,3.778469,2.007455,...,-0.904756,-0.27716,7.351598,-2.582574,0.951303,-0.743961,-3.552512,-3.200069,0.518018,1.143666
1,744530d8-f277-48b3-8627-39c69e04bcde,LNB,6.768376,-6.057944,-0.305252,-2.608399,-4.24953,1.001843,-3.45249,2.527666,...,-0.963755,-1.507169,-2.058443,-0.053575,-0.451844,-2.301998,1.769055,-0.044093,-2.469546,-0.191657
2,5083529c-320b-4bea-9ceb-d119b2594f13,NGH,-4.023883,-4.397324,-0.216554,-2.016008,-4.304283,1.271473,7.915787,-3.023034,...,6.002137,0.993517,-0.176488,-0.07351,2.526731,0.709873,-0.754943,4.016852,2.676931,1.084237
3,6c06a0b7-7ca1-45ef-bb49-6a4589681bf4,NGH,1.565465,-0.103222,-4.465333,2.642781,-3.660184,3.492425,3.454603,-2.102157,...,-1.734018,0.792291,-0.778199,2.755474,1.871567,3.304623,-0.016427,-2.875565,-0.03104,-3.489109
4,72b39b1e-5973-43e2-bedb-0f00ace4848b,NGH,-4.905071,4.32494,8.160634,-5.190738,-4.809472,-5.84477,0.919767,7.314685,...,0.737489,5.315772,-0.986452,0.230265,2.404228,4.712797,1.619337,1.378117,0.613474,1.505021
5,be2c69ec-d004-4d52-b844-13e9634737ed,SI,-4.389551,-1.807877,1.482355,8.286113,9.191115,5.18331,3.2522,3.286561,...,1.651801,-4.201803,-0.908862,0.195122,-1.477991,2.606475,-1.532929,0.329716,0.156254,0.606976
6,8425e949-99ab-4e71-baf9-5e653eb68223,NGH,0.503875,-3.760428,-4.447832,-5.21991,4.4292,0.663011,-0.743466,-2.966234,...,-0.392355,1.993094,-3.357,-1.674644,-1.867511,0.481841,1.835003,-1.359058,-2.269827,7.296103
7,e235873b-f363-4a61-8a8c-3e935da3ed3d,LNB,1.223713,-5.039052,-3.133227,0.24915,-3.392989,0.04399,1.714035,-1.361532,...,-1.671581,-1.957676,-0.31636,-0.10957,-1.605514,2.85044,2.692665,-1.46447,1.651568,-1.986215
8,a30299ff-63d0-4c08-9b26-0cc143eef34b,LNB,8.607919,-1.724041,-6.601534,-8.492956,7.224147,-3.659863,-4.007939,-1.888752,...,5.789947,-2.991796,0.858533,2.752391,-1.884266,-1.394457,-2.550549,0.370215,1.550691,0.353148
9,774ba130-08f1-45ee-9c61-636e6de87827,SI,-13.315558,-6.063129,3.998677,4.385434,5.828232,-0.810497,3.810909,-1.307332,...,-2.198445,3.617548,-1.531329,-0.453886,0.532233,-3.945409,0.148655,1.17064,-3.714841,-2.219023


In [23]:
from pathlib import Path
import json
import pandas as pd
from dateutil import parser as dtparser

MOSEQ_BASE = Path("/om2/user/rraju/data/moseq_data")

# ── 1) Build SessionName -> session_* (dedup to one per name, pick latest by StartTime)
rows = []
for sdir in MOSEQ_BASE.iterdir():
    if sdir.is_dir() and sdir.name.startswith("session_"):
        meta_path = sdir / "metadata.json"
        if not meta_path.exists():
            continue
        try:
            meta = json.loads(meta_path.read_text())
        except Exception:
            continue
        sess_name = meta.get("SessionName")
        start_iso = meta.get("StartTime")
        try:
            start_dt = dtparser.isoparse(start_iso) if start_iso else None
        except Exception:
            start_dt = None
        if sess_name:
            rows.append({
                "SessionName": sess_name,
                "moseq_id": sdir.name,
                "Start_dt": start_dt,
            })

sessions_df = pd.DataFrame(rows)
if sessions_df.empty:
    raise RuntimeError("No session_* metadata found under MOSEQ_BASE")

# Normalize SessionName to avoid tiny formatting mismatches
sessions_df["SessionName_norm"] = sessions_df["SessionName"].str.strip().str.lower()

# For duplicate names, keep the latest by StartTime
sessions_df = (
    sessions_df.sort_values(["SessionName_norm","Start_dt"])
               .groupby("SessionName_norm", as_index=False)
               .tail(1)
               .loc[:, ["SessionName_norm","moseq_id"]]
)

# ── 2) Reduce moseq_df to ONE row per uuid -> SessionName (resolve duplicates)
# Keep only the two columns we need, drop exact dup rows, normalize names.
mmap = (moseq_df.loc[:, ["uuid","SessionName"]]
                 .dropna()
                 .drop_duplicates())
mmap["SessionName_norm"] = mmap["SessionName"].str.strip().str.lower()

# How many uuids map to >1 (normalized) names?
dup_counts = mmap.groupby("uuid")["SessionName_norm"].nunique()
conflict_uuids = dup_counts[dup_counts > 1].index.tolist()

if conflict_uuids:
    # Resolve conflicts by selecting the SessionName that actually exists on disk;
    # if multiple exist, prefer the one whose session_* is the latest (already encoded in sessions_df).
    candidates = (mmap[mmap["uuid"].isin(conflict_uuids)]
                    .merge(sessions_df, on="SessionName_norm", how="left"))

    # Rank: keep those that matched a session_*, drop non-matching; then pick one per uuid
    candidates["has_session"] = candidates["moseq_id"].notna()
    resolved = (candidates[candidates["has_session"]]
                .drop_duplicates(subset=["uuid","SessionName_norm"])
                .groupby("uuid", as_index=False)
                .first()[["uuid","SessionName_norm"]])

    # Replace conflicting uuids in mmap with the resolved pick
    keep_simple = mmap[~mmap["uuid"].isin(conflict_uuids)]
    mmap = pd.concat([keep_simple, resolved], ignore_index=True)

# Ensure one row per uuid now
assert mmap["uuid"].is_unique, "Still not one row per uuid after resolution."

# ── 3) Map uuid -> session_* via SessionName_norm
uuid_to_moseq = mmap.merge(sessions_df, on="SessionName_norm", how="left")[["uuid","moseq_id"]]

# Optional sanity checks
n_total = len(males_df)
n_have_uuid = males_df["uuid"].notna().sum()
n_resolvable = uuid_to_moseq["moseq_id"].notna().sum()
print(f"Unique uuid in mapping: {len(uuid_to_moseq)} (with moseq_id for {n_resolvable})")

# ── 4) Final one-to-one merge onto males_df (and validate)
# pandas validate raises if many-to-many would occur
result = males_df.merge(uuid_to_moseq, on="uuid", how="left", validate="one_to_one")

print(f"Rows in males_df: {n_total}")
print(f"Mapped to session_*: {result['moseq_id'].notna().sum()}/{n_total}")

# Show any that failed to map
if result["moseq_id"].isna().any():
    print("\nUnmapped uuids (head):")
    print(result.loc[result["moseq_id"].isna(), "uuid"].head())


Unique uuid in mapping: 65 (with moseq_id for 65)
Rows in males_df: 33
Mapped to session_*: 33/33


In [24]:
result

Unnamed: 0,uuid,category,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,moseq_id
0,b337bf67-7270-42f5-8b72-78dd050c5145,LNB,-10.488839,-3.002693,2.905568,-1.128349,0.443569,-1.59762,3.778469,2.007455,...,-0.27716,7.351598,-2.582574,0.951303,-0.743961,-3.552512,-3.200069,0.518018,1.143666,session_20240501141330
1,744530d8-f277-48b3-8627-39c69e04bcde,LNB,6.768376,-6.057944,-0.305252,-2.608399,-4.24953,1.001843,-3.45249,2.527666,...,-1.507169,-2.058443,-0.053575,-0.451844,-2.301998,1.769055,-0.044093,-2.469546,-0.191657,session_20240501150902
2,5083529c-320b-4bea-9ceb-d119b2594f13,NGH,-4.023883,-4.397324,-0.216554,-2.016008,-4.304283,1.271473,7.915787,-3.023034,...,0.993517,-0.176488,-0.07351,2.526731,0.709873,-0.754943,4.016852,2.676931,1.084237,session_20240501173310
3,6c06a0b7-7ca1-45ef-bb49-6a4589681bf4,NGH,1.565465,-0.103222,-4.465333,2.642781,-3.660184,3.492425,3.454603,-2.102157,...,0.792291,-0.778199,2.755474,1.871567,3.304623,-0.016427,-2.875565,-0.03104,-3.489109,session_20240501123937
4,72b39b1e-5973-43e2-bedb-0f00ace4848b,NGH,-4.905071,4.32494,8.160634,-5.190738,-4.809472,-5.84477,0.919767,7.314685,...,5.315772,-0.986452,0.230265,2.404228,4.712797,1.619337,1.378117,0.613474,1.505021,session_20240322161944
5,be2c69ec-d004-4d52-b844-13e9634737ed,SI,-4.389551,-1.807877,1.482355,8.286113,9.191115,5.18331,3.2522,3.286561,...,-4.201803,-0.908862,0.195122,-1.477991,2.606475,-1.532929,0.329716,0.156254,0.606976,session_20240321164046
6,8425e949-99ab-4e71-baf9-5e653eb68223,NGH,0.503875,-3.760428,-4.447832,-5.21991,4.4292,0.663011,-0.743466,-2.966234,...,1.993094,-3.357,-1.674644,-1.867511,0.481841,1.835003,-1.359058,-2.269827,7.296103,session_20240501110509
7,e235873b-f363-4a61-8a8c-3e935da3ed3d,LNB,1.223713,-5.039052,-3.133227,0.24915,-3.392989,0.04399,1.714035,-1.361532,...,-1.957676,-0.31636,-0.10957,-1.605514,2.85044,2.692665,-1.46447,1.651568,-1.986215,session_20240501160602
8,a30299ff-63d0-4c08-9b26-0cc143eef34b,LNB,8.607919,-1.724041,-6.601534,-8.492956,7.224147,-3.659863,-4.007939,-1.888752,...,-2.991796,0.858533,2.752391,-1.884266,-1.394457,-2.550549,0.370215,1.550691,0.353148,session_20240501144135
9,774ba130-08f1-45ee-9c61-636e6de87827,SI,-13.315558,-6.063129,3.998677,4.385434,5.828232,-0.810497,3.810909,-1.307332,...,3.617548,-1.531329,-0.453886,0.532233,-3.945409,0.148655,1.17064,-3.714841,-2.219023,session_20240321170920


In [26]:
result.to_csv('../../data/analysis_output/constructed_fingerprints/Males/final_data/males_final_data.csv')

# Females

In [27]:
males_df = pd.read_csv('../../data/analysis_output/constructed_fingerprints/Females/offspring_PCA25_embedding.csv.gz')

In [28]:
from pathlib import Path
import json
import pandas as pd
from dateutil import parser as dtparser

MOSEQ_BASE = Path("/om2/user/rraju/data/moseq_data")

# ── 1) Build SessionName -> session_* (dedup to one per name, pick latest by StartTime)
rows = []
for sdir in MOSEQ_BASE.iterdir():
    if sdir.is_dir() and sdir.name.startswith("session_"):
        meta_path = sdir / "metadata.json"
        if not meta_path.exists():
            continue
        try:
            meta = json.loads(meta_path.read_text())
        except Exception:
            continue
        sess_name = meta.get("SessionName")
        start_iso = meta.get("StartTime")
        try:
            start_dt = dtparser.isoparse(start_iso) if start_iso else None
        except Exception:
            start_dt = None
        if sess_name:
            rows.append({
                "SessionName": sess_name,
                "moseq_id": sdir.name,
                "Start_dt": start_dt,
            })

sessions_df = pd.DataFrame(rows)
if sessions_df.empty:
    raise RuntimeError("No session_* metadata found under MOSEQ_BASE")

# Normalize SessionName to avoid tiny formatting mismatches
sessions_df["SessionName_norm"] = sessions_df["SessionName"].str.strip().str.lower()

# For duplicate names, keep the latest by StartTime
sessions_df = (
    sessions_df.sort_values(["SessionName_norm","Start_dt"])
               .groupby("SessionName_norm", as_index=False)
               .tail(1)
               .loc[:, ["SessionName_norm","moseq_id"]]
)

# ── 2) Reduce moseq_df to ONE row per uuid -> SessionName (resolve duplicates)
# Keep only the two columns we need, drop exact dup rows, normalize names.
mmap = (moseq_df.loc[:, ["uuid","SessionName"]]
                 .dropna()
                 .drop_duplicates())
mmap["SessionName_norm"] = mmap["SessionName"].str.strip().str.lower()

# How many uuids map to >1 (normalized) names?
dup_counts = mmap.groupby("uuid")["SessionName_norm"].nunique()
conflict_uuids = dup_counts[dup_counts > 1].index.tolist()

if conflict_uuids:
    # Resolve conflicts by selecting the SessionName that actually exists on disk;
    # if multiple exist, prefer the one whose session_* is the latest (already encoded in sessions_df).
    candidates = (mmap[mmap["uuid"].isin(conflict_uuids)]
                    .merge(sessions_df, on="SessionName_norm", how="left"))

    # Rank: keep those that matched a session_*, drop non-matching; then pick one per uuid
    candidates["has_session"] = candidates["moseq_id"].notna()
    resolved = (candidates[candidates["has_session"]]
                .drop_duplicates(subset=["uuid","SessionName_norm"])
                .groupby("uuid", as_index=False)
                .first()[["uuid","SessionName_norm"]])

    # Replace conflicting uuids in mmap with the resolved pick
    keep_simple = mmap[~mmap["uuid"].isin(conflict_uuids)]
    mmap = pd.concat([keep_simple, resolved], ignore_index=True)

# Ensure one row per uuid now
assert mmap["uuid"].is_unique, "Still not one row per uuid after resolution."

# ── 3) Map uuid -> session_* via SessionName_norm
uuid_to_moseq = mmap.merge(sessions_df, on="SessionName_norm", how="left")[["uuid","moseq_id"]]

# Optional sanity checks
n_total = len(males_df)
n_have_uuid = males_df["uuid"].notna().sum()
n_resolvable = uuid_to_moseq["moseq_id"].notna().sum()
print(f"Unique uuid in mapping: {len(uuid_to_moseq)} (with moseq_id for {n_resolvable})")

# ── 4) Final one-to-one merge onto males_df (and validate)
# pandas validate raises if many-to-many would occur
result = males_df.merge(uuid_to_moseq, on="uuid", how="left", validate="one_to_one")

print(f"Rows in males_df: {n_total}")
print(f"Mapped to session_*: {result['moseq_id'].notna().sum()}/{n_total}")

# Show any that failed to map
if result["moseq_id"].isna().any():
    print("\nUnmapped uuids (head):")
    print(result.loc[result["moseq_id"].isna(), "uuid"].head())


Unique uuid in mapping: 65 (with moseq_id for 65)
Rows in males_df: 33
Mapped to session_*: 33/33


In [29]:
result

Unnamed: 0,uuid,category,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,moseq_id
0,add943f9-2caf-47ff-80d1-b825d3419b37,NGH,-1.481707,24.608028,-7.444133,-11.30782,1.527056,-4.153057,7.359201,-0.383851,...,-2.731629,-1.335388,-0.482134,-0.03063,-1.714362,2.355093,-0.883881,-0.103883,-0.97722,session_20240321133542
1,6271b7ca-21da-475e-aa9a-021f3fbfe725,NGH,1.161096,-6.299882,-6.602607,5.839993,5.535388,-0.690111,0.295096,-1.636198,...,0.703756,-0.172945,2.881554,2.121607,0.410574,1.502224,0.272877,4.66384,1.60872,session_20240501120816
2,d5fe97bf-c8c6-4400-bd91-1a355888d818,SI,-3.384283,-1.459206,0.773665,1.158345,4.387953,3.026842,0.819021,-0.984005,...,2.586884,1.67008,-1.215563,0.020143,-0.200605,2.224844,-0.746591,0.503098,-1.329001,session_20240322121426
3,f32cc80e-36a8-4cf6-823e-478f8eb4af38,NGH,2.422226,-2.227157,-3.07528,-0.414893,2.584938,-2.128171,0.813936,1.743914,...,0.667373,-4.405123,0.008528,3.984557,-3.07295,-3.70411,-1.519545,3.643564,-2.591354,session_20240501170444
4,3ac8d60e-8a2c-4244-9761-658ef5682856,LNB,-0.51902,-12.114787,1.317857,-4.017846,-3.581938,-0.405399,1.543203,0.491111,...,-2.898086,-0.316968,0.722288,-0.885345,-2.015325,1.81144,-0.525129,0.207496,-0.29286,session_20240501131605
5,2a84130d-6477-4fcc-a644-eb0527024917,NGH,2.086985,-1.455639,5.885801,2.743016,-2.743319,0.50626,-2.984237,6.758544,...,-1.035384,-2.275319,2.28653,0.409325,-4.236482,-0.516619,-4.801927,-0.078378,2.174755,session_20240322155235
6,9af80187-1f8b-4a78-bdf6-9cdcaea7c7a8,EE,7.423226,1.347147,-5.62654,3.149399,-0.528119,0.709819,-3.528038,1.813236,...,-0.052459,1.749549,-3.414038,-2.853426,3.965361,-2.831411,0.609777,1.692262,1.688363,session_20240321143422
7,774ba130-08f1-45ee-9c61-636e6de87827,SI,-13.315558,-6.063129,3.998677,4.385434,5.828232,-0.810497,3.810909,-1.307332,...,3.617548,-1.531329,-0.453886,0.532233,-3.945409,0.148655,1.17064,-3.714841,-2.219023,session_20240321170920
8,3f11e55d-f6ca-4f1e-889b-194e312985a1,LNB,-5.336033,-7.604247,-1.471839,-2.585326,-2.429485,-1.84839,-1.461386,-0.093698,...,1.533632,-0.696546,-0.835785,-0.835849,0.573735,1.199711,-1.053951,-2.261023,-1.460987,session_20240501134504
9,5083529c-320b-4bea-9ceb-d119b2594f13,NGH,-4.023883,-4.397324,-0.216554,-2.016008,-4.304283,1.271473,7.915787,-3.023034,...,0.993517,-0.176488,-0.07351,2.526731,0.709873,-0.754943,4.016852,2.676931,1.084237,session_20240501173310


In [30]:
result.to_csv('../../data/analysis_output/constructed_fingerprints/Females/final_data/females_final_data.csv')

# Moms

In [36]:
males_df = pd.read_csv('../../data/analysis_output/constructed_fingerprints/Moms/offspring_PCA8_embedding.csv.gz')

In [37]:
from pathlib import Path
import json
import pandas as pd
from dateutil import parser as dtparser

MOSEQ_BASE = Path("/om2/user/rraju/data/moseq_data")

# ── 1) Build SessionName -> session_* (dedup to one per name, pick latest by StartTime)
rows = []
for sdir in MOSEQ_BASE.iterdir():
    if sdir.is_dir() and sdir.name.startswith("session_"):
        meta_path = sdir / "metadata.json"
        if not meta_path.exists():
            continue
        try:
            meta = json.loads(meta_path.read_text())
        except Exception:
            continue
        sess_name = meta.get("SessionName")
        start_iso = meta.get("StartTime")
        try:
            start_dt = dtparser.isoparse(start_iso) if start_iso else None
        except Exception:
            start_dt = None
        if sess_name:
            rows.append({
                "SessionName": sess_name,
                "moseq_id": sdir.name,
                "Start_dt": start_dt,
            })

sessions_df = pd.DataFrame(rows)
if sessions_df.empty:
    raise RuntimeError("No session_* metadata found under MOSEQ_BASE")

# Normalize SessionName to avoid tiny formatting mismatches
sessions_df["SessionName_norm"] = sessions_df["SessionName"].str.strip().str.lower()

# For duplicate names, keep the latest by StartTime
sessions_df = (
    sessions_df.sort_values(["SessionName_norm","Start_dt"])
               .groupby("SessionName_norm", as_index=False)
               .tail(1)
               .loc[:, ["SessionName_norm","moseq_id"]]
)

# ── 2) Reduce moseq_df to ONE row per uuid -> SessionName (resolve duplicates)
# Keep only the two columns we need, drop exact dup rows, normalize names.
mmap = (moseq_df.loc[:, ["uuid","SessionName"]]
                 .dropna()
                 .drop_duplicates())
mmap["SessionName_norm"] = mmap["SessionName"].str.strip().str.lower()

# How many uuids map to >1 (normalized) names?
dup_counts = mmap.groupby("uuid")["SessionName_norm"].nunique()
conflict_uuids = dup_counts[dup_counts > 1].index.tolist()

if conflict_uuids:
    # Resolve conflicts by selecting the SessionName that actually exists on disk;
    # if multiple exist, prefer the one whose session_* is the latest (already encoded in sessions_df).
    candidates = (mmap[mmap["uuid"].isin(conflict_uuids)]
                    .merge(sessions_df, on="SessionName_norm", how="left"))

    # Rank: keep those that matched a session_*, drop non-matching; then pick one per uuid
    candidates["has_session"] = candidates["moseq_id"].notna()
    resolved = (candidates[candidates["has_session"]]
                .drop_duplicates(subset=["uuid","SessionName_norm"])
                .groupby("uuid", as_index=False)
                .first()[["uuid","SessionName_norm"]])

    # Replace conflicting uuids in mmap with the resolved pick
    keep_simple = mmap[~mmap["uuid"].isin(conflict_uuids)]
    mmap = pd.concat([keep_simple, resolved], ignore_index=True)

# Ensure one row per uuid now
assert mmap["uuid"].is_unique, "Still not one row per uuid after resolution."

# ── 3) Map uuid -> session_* via SessionName_norm
uuid_to_moseq = mmap.merge(sessions_df, on="SessionName_norm", how="left")[["uuid","moseq_id"]]

# Optional sanity checks
n_total = len(males_df)
n_have_uuid = males_df["uuid"].notna().sum()
n_resolvable = uuid_to_moseq["moseq_id"].notna().sum()
print(f"Unique uuid in mapping: {len(uuid_to_moseq)} (with moseq_id for {n_resolvable})")

# ── 4) Final one-to-one merge onto males_df (and validate)
# pandas validate raises if many-to-many would occur
result = males_df.merge(uuid_to_moseq, on="uuid", how="left", validate="one_to_one")

print(f"Rows in males_df: {n_total}")
print(f"Mapped to session_*: {result['moseq_id'].notna().sum()}/{n_total}")

# Show any that failed to map
if result["moseq_id"].isna().any():
    print("\nUnmapped uuids (head):")
    print(result.loc[result["moseq_id"].isna(), "uuid"].head())


Unique uuid in mapping: 65 (with moseq_id for 65)
Rows in males_df: 11
Mapped to session_*: 0/11

Unmapped uuids (head):
0    9f45197b-4e21-4ee3-9e84-3fde2831d7d4
1    d58916e7-fff3-4763-8ac0-8637f3fe512f
2    365247e5-d810-4242-87f4-1c144f1a211f
3    06f178be-3ba4-4c4b-b795-05703f6a3e0e
4    0c39c4f8-ffce-4ba9-b5c0-6fa221762363
Name: uuid, dtype: object
