# Build Manifest

In [1]:
import os
import glob
import re
import json
from collections import Counter
import pandas as pd

In [2]:
# --- Paths ---
POS_BASE = "positive_examples_anonymous_chunks"
NEG_BASE = "negative_examples_anonymous_chunks"

# Using regex to extract subject and chunk number
SUBJECT_RE = re.compile(r"subject(\d+)", re.IGNORECASE)
CHUNK_RE   = re.compile(r"_(\d+)$")   # trailing _<chunkid>

In [3]:
def parse_name(filename):
    """
    Parse filenames like 'train_subject7488_1.xml'
    Returns:
        tuple: (subject_id, chunk_id)
    """
    basename_no_ext = os.path.splitext(filename)[0]  # e.g. 'train_subject7488_1'
    parts = basename_no_ext.split("_")               # ['train', 'subject7488', '1']

    subject_id = parts[1]   # 'subject7488'
    chunk_id = int(parts[2])  # '1' -> 1

    return subject_id, chunk_id

In [4]:
def build_manifest(base_folder, label):
    """
    Build a manifest DataFrame:
    - Each row = one subject
    - Columns: subject_id, chunks (list of file paths), label
    """
    manifest = {}

    # find ALL xml files recursively
    pattern = os.path.join(base_folder, "**", "*.xml")
    all_files = glob.glob(pattern, recursive=True)

    for filepath in all_files:
        filename = os.path.basename(filepath)
        subject_id, chunk_id = parse_name(filename)

        if subject_id not in manifest:
            manifest[subject_id] = []
        manifest[subject_id].append(filepath)

    # build DataFrame
    df = pd.DataFrame([
        {"subject_id": subject, "chunks": sorted(files), "label": label}
        for subject, files in manifest.items()
    ])

    return df


In [5]:
df_pos = build_manifest(POS_BASE, "positive")
df_neg = build_manifest(NEG_BASE, "negative")

df_all = pd.concat([df_pos, df_neg], ignore_index=True)

print(df_all.head())
print(df_all.info())



    subject_id                                             chunks     label
0  subject6760  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
1   subject127  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
2  subject7326  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
3  subject2712  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
4  subject2252  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486 entries, 0 to 485
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   subject_id  486 non-null    object
 1   chunks      486 non-null    object
 2   label       486 non-null    object
dtypes: object(3)
memory usage: 11.5+ KB
None
