In [8]:
import os
import pandas as pd
from pathlib import Path

# Define the directory path
data_dir = Path("../aligned_timeseries_FINS")

In [None]:
# Get all CSV files matching the pattern
matching_files = [f for f in data_dir.glob("FINS_*c_*_filtered.csv")]

# List to store files with post_chat
files_with_post_chat = []

# Check each file for post_chat in condition column
for file_path in matching_files:
    try:
        df = pd.read_csv(file_path)
        if 'condition' in df.columns:
            if 'post_chat' in df['condition'].values:
                files_with_post_chat.append(file_path.name)
    except Exception as e:
        print(f"Error reading {file_path.name}: {e}")

# Print results
print(f"Total matching files: {len(matching_files)}")
print(f"Files with 'post_chat': {len(files_with_post_chat)}")
print("\nFiles containing 'post_chat' in condition column:")
for filename in sorted(files_with_post_chat):
    print(f"  - {filename}")

In [28]:
sample_df = pd.read_csv('../aligned_timeseries_FINS/FINS_056c_aligned_timeseries_filtered.csv')

## Plan for fitting two independent HMMs (child1 HMM, child2 HMM), then compare state sequences

### Step 1 — Load both CSVs

### Step 2 — Filter to `post_chat`

### Step 3 — Select HbO columns only → build X

### Step 4 — EDA on filtered data

* duration (sec/min)
* sampling rate (~10 Hz)
* missingness rate (fraction of NaNs)
* number of HbO channels

### Step 5 — Standardize X (mean 0, std 1 per HbO channel)

### Step 6 — Fit HMM on each file (same K)

* start with K = 4..10
* pick K later by BIC/stability; for now pick a reasonable K like 6

### Step 7 — Align state labels across the two HMMs (Hungarian)

* align by similarity of state mean patterns

### Step 8 — Compare aligned outputs

* fractional occupancy per aligned state
* dwell time per aligned state

Focused on:

* `condition == "post_chat"`
* **HbO only**
* EDA: minutes, sampling rate, missingness
* standardization (recommended for HMM stability)
* fit HMM to each file
* align states between the two HMMs using **Hungarian algorithm**

In [29]:
!pip -q install hmmlearn scikit-learn scipy pandas numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from hmmlearn.hmm import GaussianHMM
from scipy.optimize import linear_sum_assignment

### EDA Helpers

In [10]:
def get_hbo_columns(df):
    # Matches columns ending with ' hbo' (case-insensitive)
    return [c for c in df.columns if c.lower().endswith(" hbo")]

def missingness_rate(X):
    # fraction of NaNs in the whole matrix
    return float(np.isnan(X).mean())

def eda_postchat_hbo(df, name="file"):
    """
    Filter to post_chat condition, extract HbO columns, and report EDA stats.
    Assumes 10 Hz sampling rate.
    """
    if "condition" not in df.columns:
        raise ValueError(f"{name}: no 'condition' column found.")
    
    # Show available conditions for debugging
    print(f"\nAvailable conditions in {name}: {list(pd.unique(df['condition']))}")
    
    # Filter to post_chat
    d = df[df["condition"] == "post_chat"].copy()
    
    if len(d) == 0:
        raise ValueError(f"{name}: no rows found with condition == 'post_chat'.")
    
    # Get HbO columns
    hbo_cols = get_hbo_columns(d)
    if len(hbo_cols) == 0:
        raise ValueError(f"{name}: no HbO columns found after filtering to post_chat.")
    
    # Extract data
    X = d[hbo_cols].astype(float).values
    
    # Calculate duration from row count (10 Hz sampling)
    sampling_hz = 10.0
    duration_sec = len(d) / sampling_hz
    duration_min = duration_sec / 60.0
    
    # Calculate missingness
    miss = missingness_rate(X)
    
    # Print EDA summary
    print(f"\n=== EDA (post_chat, HbO only): {name} ===")
    print(f"Total rows in file: {len(df)}")
    print(f"Post_chat rows: {len(d)}")
    print(f"Percentage: {100 * len(d) / len(df):.1f}%")
    print(f"HbO channels (D): {len(hbo_cols)}")
    print(f"Duration: {duration_sec:.2f} sec (~{duration_min:.2f} min)")
    print(f"Missingness rate (NaN fraction): {miss:.6f}")
    
    # Return time column for compatibility (though not used downstream)
    t = d["time_sec"].values if "time_sec" in d.columns else np.arange(len(d)) / sampling_hz
    
    return d, hbo_cols, X, t

### Loading files + run post_chat HbO EDA

In [32]:
file_a = f"{data_dir}/FINS_056c_aligned_timeseries_filtered.csv"
file_b = f"{data_dir}/FINS_057c_aligned_timeseries_filtered.csv"

df_a = pd.read_csv(file_a)
df_b = pd.read_csv(file_b)

d_a, hbo_a, Xa, ta = eda_postchat_hbo(df_a, name="FINS_056c")
d_b, hbo_b, Xb, tb = eda_postchat_hbo(df_b, name="FINS_057c")

# Ensure we use the same HbO channels in both (intersection if needed)
common_hbo = sorted(list(set(hbo_a).intersection(set(hbo_b))))
print("\nCommon HbO channels:", len(common_hbo))

Xa = d_a[common_hbo].astype(float).values
Xb = d_b[common_hbo].astype(float).values


Available conditions in FINS_056c: ['pre_chat', 'easy_talking', 'easy_silence', 'med_talking', 'med_silence', 'hard_talking', 'hard_silence', 'post_chat']

=== EDA (post_chat, HbO only): FINS_056c ===
Total rows in file: 5679
Post_chat rows: 799
Percentage: 14.1%
HbO channels (D): 22
Duration: 79.90 sec (~1.33 min)
Missingness rate (NaN fraction): 0.000000

Available conditions in FINS_057c: ['pre_chat', 'easy_talking', 'easy_silence', 'med_talking', 'med_silence', 'hard_talking', 'hard_silence', 'post_chat']

=== EDA (post_chat, HbO only): FINS_057c ===
Total rows in file: 5699
Post_chat rows: 688
Percentage: 12.1%
HbO channels (D): 22
Duration: 68.80 sec (~1.15 min)
Missingness rate (NaN fraction): 0.000000

Common HbO channels: 22


### Standardize

In [33]:
def standardize(X):
    scaler = StandardScaler()
    Xz = scaler.fit_transform(X)
    return Xz, scaler

Xa_z, scaler_a = standardize(Xa)
Xb_z, scaler_b = standardize(Xb)


### Fit HMM (no PCA)

In [34]:
def fit_hmm(Xz, K, seed=0, n_iter=500):
    model = GaussianHMM(
        n_components=K,
        covariance_type="diag",
        n_iter=n_iter,
        tol=1e-3,
        random_state=seed
    )
    model.fit(Xz)
    z = model.predict(Xz)
    return model, z

K = 6  # starter value; we can sweep K later

model_a, z_a = fit_hmm(Xa_z, K=K, seed=0)
model_b, z_b = fit_hmm(Xb_z, K=K, seed=0)

### Hungarian alignment (align B’s states to A’s labels)
We align using cosine similarity of the state mean vectors (`model.means_`), which are in the standardized feature space here.

In [35]:
def cosine_similarity_matrix(A, B, eps=1e-9):
    A_norm = A / (np.linalg.norm(A, axis=1, keepdims=True) + eps)
    B_norm = B / (np.linalg.norm(B, axis=1, keepdims=True) + eps)
    return A_norm @ B_norm.T

def hungarian_align(means_a, means_b):
    S = cosine_similarity_matrix(means_a, means_b)
    cost = -S  # maximize similarity
    row_ind, col_ind = linear_sum_assignment(cost)
    # col_ind[a_state] = matched b_state
    return col_ind, S

def remap_states_b_to_a(z_b, mapping_a_to_b):
    # Build inverse map: inv[b_state] = a_state
    inv = np.zeros_like(mapping_a_to_b)
    for a_state, b_state in enumerate(mapping_a_to_b):
        inv[b_state] = a_state
    return inv[z_b]

mapping_a_to_b, sim = hungarian_align(model_a.means_, model_b.means_)
z_b_aligned = remap_states_b_to_a(z_b, mapping_a_to_b)

print("Mapping (A_state -> B_state):", mapping_a_to_b.tolist())
print("Avg matched similarity:", float(np.mean([sim[a, mapping_a_to_b[a]] for a in range(K)])))

Mapping (A_state -> B_state): [4, 5, 3, 2, 1, 0]
Avg matched similarity: 0.27999338086159187


### Compare aligned occupancy + dwell

In [36]:
def fractional_occupancy(z, K):
    return np.bincount(z, minlength=K) / len(z)

def mean_dwell(z, K):
    dw = {k: [] for k in range(K)}
    s, run = z[0], 1
    for i in range(1, len(z)):
        if z[i] == s:
            run += 1
        else:
            dw[s].append(run)
            s, run = z[i], 1
    dw[s].append(run)
    return np.array([np.mean(dw[k]) if dw[k] else 0.0 for k in range(K)])

fo_a = fractional_occupancy(z_a, K)
fo_b = fractional_occupancy(z_b_aligned, K)

dt_a = mean_dwell(z_a, K)
dt_b = mean_dwell(z_b_aligned, K)

print("\nFO A:", np.round(fo_a, 3))
print("FO B (aligned):", np.round(fo_b, 3))
print("Mean dwell A (steps):", np.round(dt_a, 1))
print("Mean dwell B (aligned, steps):", np.round(dt_b, 1))


FO A: [0.218 0.242 0.081 0.091 0.215 0.153]
FO B (aligned): [0.113 0.215 0.126 0.188 0.166 0.192]
Mean dwell A (steps): [ 87.   96.5  32.5  24.3  57.3 122. ]
Mean dwell B (aligned, steps): [ 26.  148.   87.   21.5  28.5  22. ]


## HMM Tryout Baselines

In [11]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from hmmlearn.hmm import GaussianHMM

from scipy.optimize import linear_sum_assignment
from numpy.linalg import norm

# ----------------------------
# Helpers: selection / loading
# ----------------------------
def load_postchat_hbo_matrix(csv_path: Path):
    """
    Returns:
      subject_id (str or None), T (int), hbo_cols (list[str]), X (np.ndarray shape T x D)
    """
    df = pd.read_csv(csv_path)
    if "condition" not in df.columns:
        raise ValueError(f"{csv_path.name}: no 'condition' column")

    d = df[df["condition"] == "post_chat"].copy()
    hbo_cols = get_hbo_columns(d)
    if len(hbo_cols) == 0:
        raise ValueError(f"{csv_path.name}: no HbO columns found")

    X = d[hbo_cols].astype(float).to_numpy()
    subject_id = None
    if "subject_id" in d.columns and len(d) > 0:
        subject_id = str(d["subject_id"].iloc[0])

    return subject_id, len(d), hbo_cols, X

def zscore_within_subject(X: np.ndarray):
    scaler = StandardScaler(with_mean=True, with_std=True)
    return scaler.fit_transform(X)

# ----------------------------
# Metrics from decoded states
# ----------------------------
def fractional_occupancy(z: np.ndarray, K: int):
    fo = np.bincount(z, minlength=K).astype(float)
    fo /= max(len(z), 1)
    return fo

def dwell_times_seconds(z: np.ndarray, fs: float, K: int):
    """
    Mean dwell time per state in seconds.
    """
    if len(z) == 0:
        return np.full(K, np.nan)

    dwell_lists = [[] for _ in range(K)]
    run_state = z[0]
    run_len = 1
    for s in z[1:]:
        if s == run_state:
            run_len += 1
        else:
            dwell_lists[run_state].append(run_len)
            run_state = s
            run_len = 1
    dwell_lists[run_state].append(run_len)

    means = []
    for k in range(K):
        if len(dwell_lists[k]) == 0:
            means.append(np.nan)
        else:
            means.append(np.mean(dwell_lists[k]) / fs)
    return np.array(means, dtype=float)

def empirical_transition_matrix(z: np.ndarray, K: int):
    """
    Row-normalized empirical transitions from decoded state sequence z.
    """
    A = np.zeros((K, K), dtype=float)
    if len(z) < 2:
        return A
    for a, b in zip(z[:-1], z[1:]):
        A[a, b] += 1.0
    row_sums = A.sum(axis=1, keepdims=True)
    with np.errstate(divide="ignore", invalid="ignore"):
        A = np.divide(A, row_sums, out=np.zeros_like(A), where=row_sums > 0)
    return A

# ----------------------------
# Hungarian alignment + similarity
# ----------------------------
def cosine_similarity_matrix(M1: np.ndarray, M2: np.ndarray):
    """
    M1: K x D, M2: K x D
    returns K x K similarity matrix
    """
    K1, D1 = M1.shape
    K2, D2 = M2.shape
    assert D1 == D2

    # normalize rows
    M1n = M1 / (norm(M1, axis=1, keepdims=True) + 1e-12)
    M2n = M2 / (norm(M2, axis=1, keepdims=True) + 1e-12)
    return M1n @ M2n.T  # K1 x K2

def hungarian_match_means(means_ref: np.ndarray, means_other: np.ndarray):
    """
    Returns:
      perm: array length K mapping ref_state -> other_state
      avg_sim: average cosine similarity after matching
      sim_mat: KxK similarity matrix
    """
    sim = cosine_similarity_matrix(means_ref, means_other)  # maximize
    cost = -sim
    r, c = linear_sum_assignment(cost)
    perm = c[np.argsort(r)]  # ensure in ref state order
    avg_sim = sim[np.arange(sim.shape[0]), perm].mean()
    return perm, float(avg_sim), sim

# ----------------------------
# HMM fitting per seed
# ----------------------------
def fit_gaussian_hmm(X: np.ndarray, K: int, seed: int, n_iter: int = 300):
    model = GaussianHMM(
        n_components=K,
        covariance_type="diag",
        n_iter=n_iter,
        tol=1e-3,
        random_state=seed,
        verbose=False,
    )
    model.fit(X)
    logL = model.score(X)
    z = model.predict(X)
    return model, float(logL), z

# ----------------------------
# Stability across seeds (within subject)
# ----------------------------
def stability_across_seeds(models_means: list[np.ndarray]):
    """
    Compute average similarity across seed runs by aligning each run to the best run.
    Returns dict with mean/min similarities and the per-run similarities.
    """
    # pick reference as first means (caller will pass best first), or use best externally
    ref = models_means[0]
    sims = []
    for m in models_means[1:]:
        _, avg_sim, _ = hungarian_match_means(ref, m)
        sims.append(avg_sim)
    if len(sims) == 0:
        return {"stability_mean": np.nan, "stability_min": np.nan, "stability_list": []}
    return {
        "stability_mean": float(np.mean(sims)),
        "stability_min": float(np.min(sims)),
        "stability_list": sims,
    }

# ----------------------------
# Baseline A runner
# ----------------------------
def run_baseline_A_individual(
    data_dir: Path,
    pattern: str = "FINS_*c_aligned_timeseries_filtered.csv",
    K: int = 3,
    seeds: list[int] = None,
    fs: float = 10.0,
    min_seconds: float = 45.0,
    n_iter: int = 300,
    output_dir: Path = None
):
    if seeds is None:
        seeds = list(range(10))
    min_T = int(min_seconds * fs)

    csv_paths = sorted(data_dir.glob(pattern))
    if len(csv_paths) == 0:
        raise ValueError(f"No files matched {pattern} in {data_dir}")

    if output_dir is None:
        output_dir = data_dir / "baselineA_outputs"
    output_dir.mkdir(parents=True, exist_ok=True)

    rows_summary = []
    per_subject_state_means = {}  # subject -> best means KxD

    for p in csv_paths:
        subject_id, T, hbo_cols, X = load_postchat_hbo_matrix(p)

        # identify child files if role exists; otherwise run all "*c*" is already child by name
        # (you can add a role check here later if needed)

        if T < min_T:
            rows_summary.append({
                "file": p.name,
                "subject_id": subject_id,
                "T": T,
                "seconds": T / fs,
                "included": False,
                "reason": f"T<{min_T} (<{min_seconds}s)",
            })
            continue

        # standardize within subject
        Xs = zscore_within_subject(X)

        # fit multiple seeds
        seed_results = []
        for s in seeds:
            try:
                model, logL, z = fit_gaussian_hmm(Xs, K=K, seed=s, n_iter=n_iter)
                seed_results.append((s, logL, model, z))
            except Exception as e:
                seed_results.append((s, -np.inf, None, None))

        # pick best run by log-likelihood
        seed_results_sorted = sorted(seed_results, key=lambda x: x[1], reverse=True)
        best_seed, best_logL, best_model, best_z = seed_results_sorted[0]

        if best_model is None or best_z is None or not np.isfinite(best_logL):
            rows_summary.append({
                "file": p.name,
                "subject_id": subject_id,
                "T": T,
                "seconds": T / fs,
                "included": False,
                "reason": "all seeds failed",
            })
            continue

        # metrics for best run
        fo = fractional_occupancy(best_z, K)
        dwell_sec = dwell_times_seconds(best_z, fs=fs, K=K)
        A_emp = empirical_transition_matrix(best_z, K)

        # stability: align each run to the best run's state means
        # gather means for successful runs, with best first
        means_best = best_model.means_
        means_list = [means_best]
        for (s, logL, model, z) in seed_results_sorted[1:]:
            if model is not None and np.isfinite(logL):
                means_list.append(model.means_)
        stab = stability_across_seeds(means_list)

        # record summary row
        row = {
            "file": p.name,
            "subject_id": subject_id,
            "T": T,
            "seconds": T / fs,
            "included": True,
            "K": K,
            "best_seed": best_seed,
            "best_logL": best_logL,
            "logL_mean": float(np.mean([x[1] for x in seed_results if np.isfinite(x[1])])),
            "logL_std": float(np.std([x[1] for x in seed_results if np.isfinite(x[1])], ddof=0)),
            "stability_mean": stab["stability_mean"],
            "stability_min": stab["stability_min"],
        }

        # add FO and dwell per state as separate columns
        for k in range(K):
            row[f"FO_s{k}"] = float(fo[k])
            row[f"dwell_s{k}_sec"] = float(dwell_sec[k]) if np.isfinite(dwell_sec[k]) else np.nan

        rows_summary.append(row)

        # save per-subject artifacts
        subj_key = subject_id if subject_id is not None else p.stem
        per_subject_state_means[subj_key] = means_best

        # transition matrix
        A_df = pd.DataFrame(A_emp, columns=[f"s{j}" for j in range(K)], index=[f"s{i}" for i in range(K)])
        A_df.to_csv(output_dir / f"{subj_key}_K{K}_transitions_empirical.csv")

        # decoded states (optional, useful for later)
        states_df = pd.DataFrame({"state": best_z})
        states_df.to_csv(output_dir / f"{subj_key}_K{K}_decoded_states.csv", index=False)

    # write master summary
    summary_df = pd.DataFrame(rows_summary)
    summary_path = output_dir / f"baselineA_individual_summary_K{K}.csv"
    summary_df.to_csv(summary_path, index=False)

    return summary_df, per_subject_state_means, summary_path

In [13]:
summary_df, means_dict, summary_path = run_baseline_A_individual(
    data_dir=data_dir,
    K=3,
    seeds=list(range(10)),
    fs=10.0,
    min_seconds=45.0,
    n_iter=300
)

print("Wrote:", summary_path)
print(summary_df.head())

Model is not converging.  Current: -4002.447418273866 is not greater than -4002.447416724521. Delta is -1.5493446881009731e-06
Model is not converging.  Current: -4002.4474182738604 is not greater than -4002.447416723286. Delta is -1.5505743249377701e-06
Model is not converging.  Current: -4002.4474182738722 is not greater than -4002.447416723276. Delta is -1.5505961528106127e-06
Model is not converging.  Current: -4002.447418273859 is not greater than -4002.447416723513. Delta is -1.5503460417676251e-06
Model is not converging.  Current: -4002.4474182738627 is not greater than -4002.4474167232643. Delta is -1.5505984265473671e-06
Model is not converging.  Current: -9986.403756858894 is not greater than -9986.40374199816. Delta is -1.486073415435385e-05
Model is not converging.  Current: -9986.403809619793 is not greater than -9986.403759144814. Delta is -5.047497870691586e-05
Model is not converging.  Current: -8892.170466512895 is not greater than -8892.170220609867. Delta is -0.0002

Wrote: ../aligned_timeseries_FINS/baselineA_outputs/baselineA_individual_summary_K3.csv
                                        file subject_id     T  seconds  \
0  FINS_001c_aligned_timeseries_filtered.csv       None     0      0.0   
1  FINS_004c_aligned_timeseries_filtered.csv       004c   671     67.1   
2  FINS_006c_aligned_timeseries_filtered.csv       006c   718     71.8   
3  FINS_007c_aligned_timeseries_filtered.csv       007c   736     73.6   
4  FINS_008c_aligned_timeseries_filtered.csv       008c  1220    122.0   

   included          reason    K  best_seed     best_logL     logL_mean  \
0     False  T<450 (<45.0s)  NaN        NaN           NaN           NaN   
1      True             NaN  3.0        4.0 -14379.065612 -14379.065618   
2      True             NaN  3.0        1.0 -16923.980814 -16937.660198   
3      True             NaN  3.0        0.0 -15386.537234 -15438.796410   
4      True             NaN  3.0        9.0 -29461.374691 -29769.187958   

     logL_std  s

# Debug and Testing

### DEBUG PER FILE post_chat Length

In [38]:
file_a = f"{data_dir}/FINS_040c_aligned_timeseries_filtered.csv"
# # file_a = f"{data_dir}/FINS_056c_aligned_timeseries_filtered.csv"
# # file_b = f"{data_dir}/FINS_057c_aligned_timeseries_filtered.csv"

df_a = pd.read_csv(file_a)
print(pd.unique(df_a['condition']))

# def eda_postchat_hbo(df, name="file"):
#     if "condition" not in df.columns:
#         raise ValueError(f"{name}: no 'condition' column found.")
#     if "time_sec_rel" not in df.columns:
#         raise ValueError(f"{name}: no 'time_sec_rel' column found.")
    
#     print(pd.unique(df['condition']))

#     d = df[df["condition"] == "post_chat"].copy()
#     # d.to_csv('post_chat_only.csv')
#     print("all cols + post chat values", d.shape)
#     hbo_cols = get_hbo_columns(d)
#     if len(hbo_cols) == 0:
#         raise ValueError(f"{name}: no HbO columns found after filtering to post_chat.")
    
#     X = d[hbo_cols].astype(float).values
#     print(X.shape)
    
#     duration_sec = len(d)/10.0
#     duration_min = duration_sec / 60.0
#     # med_dt, hz = estimate_sampling_rate(t)
#     miss = missingness_rate(X)
    
#     print(f"\n=== EDA (post_chat, HbO only): {name} ===")
#     print("Rows (time points):", len(d))
#     print("HbO channels (D):", len(hbo_cols))
#     print(f"Duration: {duration_sec:.2f} sec (~{duration_min:.2f} min)")
#     # print(f"Estimated sampling: dt≈{med_dt:.4f}s  ->  {hz:.2f} Hz")
#     print(f"Missingness rate (NaN fraction): {miss:.6f}")

#     print(f"Total rows in file: {len(df_a)}")
#     print(f"Post_chat rows: {len(d)}")
#     print(f"Percentage: {100 * len(d) / len(df_a):.1f}%")
    
#     return d, hbo_cols, X, d['time_sec']

# a,b,c,d = eda_postchat_hbo(df_a)

['pre_chat' 'easy_talking' 'easy_silence' 'med_talking' 'med_silence'
 'hard_talking' 'hard_silence' 'post_chat']


### Verifying matching dyad existance

In [1]:
import os
import pandas as pd
from pathlib import Path

# Define the directory path
data_dir = Path("../aligned_timeseries_FINS")

# Get all CSV files matching child and parent patterns
child_files = sorted(data_dir.glob("FINS_*c_*_filtered.csv"))
parent_files = sorted(data_dir.glob("FINS_*p_*_filtered.csv"))

# Create dictionaries to store dyad information
child_dyads = {}  # {dyad_number: filename}
parent_dyads = {}  # {dyad_number: filename}

# Extract dyad numbers from filenames
for file_path in child_files:
    # Extract dyad number (e.g., "040" from "FINS_040c_aligned_timeseries_filtered.csv")
    dyad_num = file_path.stem.split('_')[1].replace('c', '')
    child_dyads[dyad_num] = file_path.name

for file_path in parent_files:
    # Extract dyad number (e.g., "040" from "FINS_040p_aligned_timeseries_filtered.csv")
    dyad_num = file_path.stem.split('_')[1].replace('p', '')
    parent_dyads[dyad_num] = file_path.name

# Find dyads that have both child and parent files
matched_dyads = set(child_dyads.keys()).intersection(set(parent_dyads.keys()))
# print(sorted(matched_dyads))

# Lists to store files with post_chat (for matched dyads only)
child_files_with_post_chat = []
parent_files_with_post_chat = []
matched_pairs = []

# Check matched dyads for post_chat condition
for dyad_num in sorted(matched_dyads):
    child_file = data_dir / child_dyads[dyad_num]
    parent_file = data_dir / parent_dyads[dyad_num]
    
    child_has_post_chat = False
    parent_has_post_chat = False
    child_dyad_id = None
    parent_dyad_id = None
    
    try:
        # Check child file
        df_child = pd.read_csv(child_file)
        if 'condition' in df_child.columns and 'post_chat' in df_child['condition'].values:
            child_has_post_chat = True
            child_files_with_post_chat.append(child_dyads[dyad_num])
            if 'dyad_id' in df_child.columns:
                child_dyad_id = df_child['dyad_id'].iloc[0]
        
        # Check parent file
        df_parent = pd.read_csv(parent_file)
        if 'condition' in df_parent.columns and 'post_chat' in df_parent['condition'].values:
            parent_has_post_chat = True
            parent_files_with_post_chat.append(parent_dyads[dyad_num])
            if 'dyad_id' in df_parent.columns:
                parent_dyad_id = df_parent['dyad_id'].iloc[0]
        
        # Verify dyad_id matches
        dyad_id_match = (child_dyad_id == parent_dyad_id) if (child_dyad_id and parent_dyad_id) else "N/A"
        
        # If both have post_chat, add to matched pairs
        if child_has_post_chat and parent_has_post_chat:
            matched_pairs.append({
                'dyad_num': dyad_num,
                'child_file': child_dyads[dyad_num],
                'parent_file': parent_dyads[dyad_num],
                'dyad_id_match': dyad_id_match,
                'dyad_id': child_dyad_id if child_dyad_id else parent_dyad_id
            })
    
    except Exception as e:
        print(f"Error reading dyad {dyad_num}: {e}")

# Print results
print(f"Total child files: {len(child_files)}")
print(f"Total parent files: {len(parent_files)}")
print(f"Matched dyads (have both child & parent): {len(matched_dyads)}")
print(f"\nChild files with 'post_chat': {len(child_files_with_post_chat)}")
print(f"Parent files with 'post_chat': {len(parent_files_with_post_chat)}")
print(f"Complete pairs with 'post_chat' in both: {len(matched_pairs)}")

print("\n=== Matched pairs with post_chat ===")
for pair in matched_pairs:
    print(f"\nDyad {pair['dyad_num']} (dyad_id: {pair['dyad_id']}, match: {pair['dyad_id_match']})")
    print(f"  Child:  {pair['child_file']}")
    print(f"  Parent: {pair['parent_file']}")
    break

Total child files: 44
Total parent files: 44
Matched dyads (have both child & parent): 44

Child files with 'post_chat': 41
Parent files with 'post_chat': 41
Complete pairs with 'post_chat' in both: 41

=== Matched pairs with post_chat ===

Dyad 004 (dyad_id: 4, match: True)
  Child:  FINS_004c_aligned_timeseries_filtered.csv
  Parent: FINS_004p_aligned_timeseries_filtered.csv


### Finding condition lenghts for each child

In [45]:
data_dir = Path("../aligned_timeseries_FINS")

# Get all CSV files matching the pattern
matching_files = [f for f in data_dir.glob("FINS_*c_*_filtered.csv")]

# List to store files with post_chat
files_with_post_chat = []

# Check each file for post_chat in condition column
for file_path in matching_files:
    try:
        df = pd.read_csv(file_path)
        if 'condition' in df.columns:
            if 'post_chat' in df['condition'].values:
                pc_df = df[df['condition'] == 'post_chat']
                duration = len(pc_df)/10.0
                print(f"{file_path.name.split('_')[1]}: {duration} secs {len(pc_df)} rows")
    except Exception as e:
        print(f"Error reading {file_path.name}: {e}")

022c: 79.7 secs 797 rows
053c: 76.8 secs 768 rows
015c: 74.0 secs 740 rows
011c: 122.0 secs 1220 rows
057c: 68.8 secs 688 rows
039c: 72.6 secs 726 rows
048c: 61.0 secs 610 rows
019c: 122.0 secs 1220 rows
006c: 71.8 secs 718 rows
040c: 61.0 secs 610 rows
044c: 65.4 secs 654 rows
043c: 61.0 secs 610 rows
032c: 80.7 secs 807 rows
029c: 11.1 secs 111 rows
036c: 78.7 secs 787 rows
021c: 61.0 secs 610 rows
016c: 61.0 secs 610 rows
050c: 66.6 secs 666 rows
009c: 94.7 secs 947 rows
025c: 80.1 secs 801 rows
054c: 68.3 secs 683 rows
045c: 67.0 secs 670 rows
018c: 61.0 secs 610 rows
007c: 73.6 secs 736 rows
056c: 79.9 secs 799 rows
010c: 61.0 secs 610 rows
027c: 72.7 secs 727 rows
049c: 69.4 secs 694 rows
038c: 75.8 secs 758 rows
014c: 122.0 secs 1220 rows
052c: 32.8 secs 328 rows
023c: 73.6 secs 736 rows
013c: 122.0 secs 1220 rows
055c: 70.3 secs 703 rows
024c: 69.4 secs 694 rows
051c: 65.9 secs 659 rows
017c: 71.2 secs 712 rows
008c: 122.0 secs 1220 rows
037c: 72.8 secs 728 rows
033c: 84.2 secs

In [55]:
df = pd.read_csv('/Users/ineshtandon/Documents/GitHub/REN_fNIRS_State_Analysis/aligned_timeseries_FINS/FINS_029c_aligned_timeseries_filtered.csv')
print(len(df))
pd.unique(df['condition'])

4295


array(['pre_chat', 'easy_talking', 'easy_silence', 'med_talking',
       'med_silence', 'hard_talking', 'hard_silence', 'post_chat'],
      dtype=object)