In [None]:
# Before the MANOVA, check univariate normality of each response variable
# with the Shapiro-Wilk test.
# NOTE: this cell needs `df` and `response_cols` to exist already; in the
# visible part of this notebook they are created by the MANOVA cell, so that
# cell has to be run first.

from scipy.stats import shapiro

for col in response_cols:
    # Test only the observed values: a single NaN in the input would make
    # both W and p come back as NaN.
    values = df[col].dropna()
    if len(values) < 3:
        # Shapiro-Wilk is not defined for fewer than three observations.
        print(f"{col}: skipped, only {len(values)} non-missing value(s)")
        continue
    stat, p = shapiro(values)
    print(f"{col}: W={stat:.3f}, p={p:.3g}")

In [None]:
# then execute the MANOVA across the two ROIs
# this should output stats for both amygdala and MD

import pandas as pd
import re
from statsmodels.multivariate.manova import MANOVA

# 1. Load the dataset from disk
df = pd.read_csv("all_data_for_LDA.csv")

# 2. Store the grouping column label_e as a pandas categorical
df = df.astype({'label_e': 'category'})

# 3. Collapse every run of non-word characters in a column name into a single
#    underscore, so the names can be written into the formula string below
df = df.rename(columns=lambda col: re.sub(r'\W+', '_', col))

# 4. Response variables: every column other than label_e and movie_index
response_cols = [
    col for col in df.columns
    if col != 'label_e' and col != 'movie_index'
]

def run_manova(df_subset, levels, name):
    """Fit a MANOVA of all response columns on ``label_e`` and print it.

    The response variables come from the module-level ``response_cols``
    list; every one of them goes on the left-hand side of the formula.

    Args:
        df_subset: DataFrame holding the response columns and a categorical
            ``label_e`` column (the ``.cat`` accessor is used on it).
        levels: Sequence whose first two items are shown in the printed
            header; it is not used to filter the data.
        name: Label for this comparison, used in the printed output and in
            the error message.

    Raises:
        ValueError: If fewer than two ``label_e`` levels remain once rows
            with missing values have been dropped.
    """
    # Keep only rows that are complete in every response column and label_e.
    required = response_cols + ['label_e']
    df_clean = df_subset.dropna(subset=required).copy()

    # Forget categories with no remaining rows so that only the levels that
    # are actually present take part in the model.
    present = df_clean['label_e'].cat.remove_unused_categories()
    df_clean['label_e'] = present

    # Report the group sizes and refuse to continue with fewer than two.
    counts = df_clean['label_e'].value_counts()
    print(f"\n{name} subset counts after cleaning:\n{counts}\n")
    if len(counts) < 2:
        raise ValueError(f"Only one level present in {name} after cleaning—cannot run MANOVA.")

    # Formula has the shape "resp_1 + resp_2 + ... ~ label_e".
    lhs = ' + '.join(response_cols)
    formula = f"{lhs} ~ label_e"
    print(f"=== MANOVA for {name} ({levels[0]} vs {levels[1]}) ===")
    results = MANOVA.from_formula(formula, data=df_clean).mv_test()
    print(results)

# 5. First comparison: rows labelled AM_high or AM_low
am_levels = ['AM_high', 'AM_low']
df_am = df.loc[df['label_e'].isin(am_levels)]
run_manova(df_subset=df_am, levels=am_levels, name="AM")

# 6. Second comparison: rows labelled MD_high or MD_low
md_levels = ['MD_high', 'MD_low']
df_md = df.loc[df['label_e'].isin(md_levels)]
run_manova(df_subset=df_md, levels=md_levels, name="MD")
