In [3]:
import os
import pandas as pd
from glob import glob
import numpy as np
from collections import defaultdict

# Root directory of the raw annotation CSVs; load_annotation_files() looks
# for one sub-folder per modality directly beneath it.
annotation_base = "/Volumes/conflab-mm/v4/release/annotations/actions/laughing"
# Directory that consolidate_modality() writes the consolidated CSVs into.
output_dir = "./annotations_csvs"
# Create the output directory up front; exist_ok=True keeps re-runs harmless.
os.makedirs(output_dir, exist_ok=True)

def load_annotation_files(modality):
    """List every annotation CSV of one modality, in sorted path order.

    Files are searched directly inside ``<annotation_base>/<modality>``;
    sub-folders are not traversed.
    """
    pattern = os.path.join(annotation_base, modality, "*.csv")
    csv_paths = glob(pattern)
    csv_paths.sort()
    return csv_paths

def _read_annotation_csv(path):
    """Load one annotator CSV as an all-integer DataFrame, tolerating bad files.

    pandas does the parsing when it can; if it raises, the file is re-read
    line by line. Non-numeric and missing cells become 0. An empty file
    yields an empty DataFrame.
    """
    try:
        df = pd.read_csv(path, encoding='latin1', on_bad_lines='skip', engine='python')
    except Exception:
        # Deliberately broad: any parser failure falls back to a manual
        # parse instead of aborting the whole consolidation.
        with open(path, 'r', encoding='latin1', errors='ignore') as f:
            lines = f.readlines()
        if not lines:
            # Nothing to parse (not even a header): contributes no votes.
            return pd.DataFrame()
        header = lines[0].strip().split(',')
        width = len(header)
        data = []
        for line in lines[1:]:
            if ',' not in line:
                continue
            fields = line.strip().split(',')
            if len(fields) > width:
                # Over-long rows are dropped, mirroring on_bad_lines='skip'.
                continue
            # Short rows are padded so the DataFrame constructor cannot fail
            # on a column-count mismatch; the padding is coerced to 0 below.
            data.append(fields + [None] * (width - len(fields)))
        df = pd.DataFrame(data, columns=header)
    # coerce to int 0/1 (anything unparsable counts as "no vote")
    return df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)


def consolidate_modality(modality, participant_ids=range(1,51), *,
                         annotation_dir=None, out_dir=None):
    """
    Consolidate all annotator CSVs of one modality into a single CSV.

    Files are grouped by the first two underscore-separated fields of their
    name (used as video id and segment; the third field is taken to be the
    annotator and is otherwise ignored). Within each group a union vote is
    applied per participant and frame: the label is 1 if any annotator has a
    non-zero value in the column named ``str(participant)`` at that row,
    else 0. Shorter annotations are zero-padded to the longest one in the
    group, and a participant missing from a file counts as all zeros.

    The result is written to ``annotations_<modality.lower()>.csv`` with the
    columns modality, video_id, segment, participant, frame, label
    (participant-major, frame-minor within each segment). The header is
    written even when no annotation files are found.

    Parameters
    ----------
    modality : str
        Name of the modality sub-folder to read.
    participant_ids : iterable
        Participant identifiers to emit; any iterable (including a
        one-shot iterator) is accepted.
    annotation_dir : str, optional
        Root folder containing the modality sub-folders. Defaults to the
        module-level ``annotation_base`` via ``load_annotation_files``.
    out_dir : str, optional
        Folder for the consolidated CSV. Defaults to the module-level
        ``output_dir``. Created if missing.

    Raises
    ------
    ValueError
        If a file name does not consist of exactly three
        underscore-separated fields.
    """
    columns = ["modality", "video_id", "segment", "participant", "frame", "label"]

    # Materialise once: the ids are reused for every segment, which would
    # silently exhaust a generator after the first one.
    pids = list(participant_ids)
    pid_cols = [str(pid) for pid in pids]

    if annotation_dir is None:
        paths = load_annotation_files(modality)
    else:
        paths = sorted(glob(os.path.join(annotation_dir, modality, "*.csv")))
    if out_dir is None:
        out_dir = output_dir
    os.makedirs(out_dir, exist_ok=True)

    # Group the per-annotator frames by (video, segment).
    seg_ann = defaultdict(list)
    for path in paths:
        stem = os.path.splitext(os.path.basename(path))[0]
        parts = stem.split('_')
        if len(parts) != 3:
            raise ValueError(
                f"Unexpected annotation file name {path!r}: expected three "
                f"underscore-separated fields, got {len(parts)}"
            )
        vid, seg, _annotator = parts
        seg_ann[(vid, seg)].append(_read_annotation_csv(path))

    # Build one DataFrame per segment from whole arrays instead of one
    # Python dict per output row (millions of rows per modality).
    frames = []
    for (vid, seg), dfs in seg_ann.items():
        max_len = max(len(df) for df in dfs)
        # Rows are participants, columns are frames; False == zero padding.
        union = np.zeros((len(pids), max_len), dtype=bool)
        for df in dfs:
            for row, col in enumerate(pid_cols):
                if col in df.columns:
                    union[row, :len(df)] |= df[col].to_numpy() != 0
        if union.size == 0:
            continue
        frames.append(pd.DataFrame({
            "modality": modality,
            "video_id": vid,
            "segment": seg,
            "participant": np.repeat(pids, max_len),
            "frame": np.tile(np.arange(max_len), len(pids)),
            # C-order ravel keeps the participant-major row order.
            "label": union.ravel().astype(int),
        }, columns=columns))

    if frames:
        df_out = pd.concat(frames, ignore_index=True)
    else:
        # Keep the schema even when there is nothing to consolidate.
        df_out = pd.DataFrame(columns=columns)

    csv_path = os.path.join(out_dir, f"annotations_{modality.lower()}.csv")
    df_out.to_csv(csv_path, index=False)
    print(f"✅ Saved {csv_path} — {len(df_out)} rows")


In [4]:

# Consolidate each of the three annotation modalities in turn
for modality in ("No_Audio", "Only_Audio", "With_Audio"):
    consolidate_modality(modality)


✅ Saved ./annotations_csvs/annotations_no_audio.csv — 8083500 rows
✅ Saved ./annotations_csvs/annotations_only_audio.csv — 8141250 rows
✅ Saved ./annotations_csvs/annotations_with_audio.csv — 8145450 rows
