# Analyze dEchorate Metadata (CSV)
Load the CSV metadata, filter by room/source/signal, and prepare outputs for the HDF5 notebook.


In [74]:
# 0) Imports
from pathlib import Path
import numpy as np
import pandas as pd


## 1) Configuration


In [75]:
# 1) User config (EDIT THESE)
METADATA_PATH = Path("../../data/dEchorate/raw/dEchorate_database.csv")
PROCESSED_DIR = Path("../../data/dEchorate/processed")

# Shared settings used by the HDF5 notebook cache
H5_PATH = Path("../../data/dEchorate/raw/dEchorate_rirs_gzip7.hdf5")


In [76]:
# 2) Helpers: metadata loading + light validation


def load_metadata(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Metadata file not found: {path.resolve()}")
    if path.suffix.lower() != ".csv":
        raise ValueError(
            f"Unsupported metadata file type: {path.suffix}. Expected .csv"
        )

    df = pd.read_csv(path, sep=None, engine="python")
    df.columns = [str(c).strip() for c in df.columns]

    drop_cols = [c for c in df.columns if c == "" or str(c).startswith("Unnamed")]
    if drop_cols:
        df = df.drop(columns=drop_cols)

    print(f"Loaded metadata: {path.name} | rows={len(df):,} cols={len(df.columns)}")
    return df

## 2) Load and Inspect


In [77]:
# Load metadata
metadata_df = load_metadata(METADATA_PATH)
print(
    "Metadata columns (first 30): "
    + ", ".join(map(str, list(metadata_df.columns)[:30]))
)
print("Preview:")
display(metadata_df.head(5))


Loaded metadata: dEchorate_database.csv | rows=10,912 cols=41
Metadata columns (first 30): filename, src_id, src_ch, src_type, src_signal, src_pos_x, src_pos_y, src_pos_z, room_code, room_rfl_floor, room_rfl_ceiling, room_rfl_west, room_rfl_south, room_rfl_east, room_rfl_north, room_fornitures, room_temperature, rec_silence_dB, rec_artifacts, mic_type, mic_id, mic_ch, mic_pos_x, mic_pos_y, mic_pos_z, array_id, array_bar_x, array_bar_y, array_bar_z, array_bar_pos_x
Preview:


Unnamed: 0,filename,src_id,src_ch,src_type,src_signal,src_pos_x,src_pos_y,src_pos_z,room_code,room_rfl_floor,...,array_bar_pos_z,array_bar_view_x,array_bar_view_y,array_bar_view_z,mic_view_x,mic_view_y,mic_view_z,src_view_x,src_view_y,src_view_z
0,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
1,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
2,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
3,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
4,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,


## 3) Filter Metadata


In [78]:
# 3) Filter metadata
ROOM_CODES = [0, 1, 2]
SRC_IDS = [0, 1, 2]
SIGNAL_NAMES = ["chirp"]

required_cols = [
    "room_code",
    "src_id",
    "src_signal",
    "mic_id",
    "mic_pos_x",
    "mic_pos_y",
    "mic_pos_z",
]
missing = [c for c in required_cols if c not in metadata_df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

subset_df = metadata_df.query(
    "room_code in @ROOM_CODES and src_id in @SRC_IDS and "
    "src_signal.str.lower() in @SIGNAL_NAMES"
).copy()

print(
    f"Filtered case -> rooms={ROOM_CODES}, src={SRC_IDS}, sig={SIGNAL_NAMES} "
    f"| rows={len(subset_df)}"
)


needed_cols = [
    "room_code",
    "src_id",
    "src_signal",
    "mic_id",
    "mic_pos_x",
    "mic_pos_y",
    "mic_pos_z",
    "room_rfl_west",
    "room_rfl_east",
    "room_rfl_north",
    "room_rfl_south",
    "room_rfl_ceiling",
    "room_rfl_floor",
]

subset_df = subset_df[needed_cols].reset_index(drop=True)
display(subset_df.head(5))


Filtered case -> rooms=[0, 1, 2], src=[0, 1, 2], sig=['chirp'] | rows=186


Unnamed: 0,room_code,src_id,src_signal,mic_id,mic_pos_x,mic_pos_y,mic_pos_z,room_rfl_west,room_rfl_east,room_rfl_north,room_rfl_south,room_rfl_ceiling,room_rfl_floor
0,0.0,0.0,chirp,0.0,0.80316092,383.141.445,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,chirp,1.0,0.8406819,384.527.719,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,chirp,2.0,0.88758314,386.260.561,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,chirp,3.0,0.94855474,388.513.257,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,chirp,4.0,10.423.572,391.978.942,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0


## 4) Save Filtered CSV


In [79]:
# 3.1) Save filtered metadata for reuse
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
FILTERED_PATH = PROCESSED_DIR / "dEchorate_metadata_filtered.csv"
subset_df.to_csv(FILTERED_PATH, index=False)
print(f"Saved filtered metadata: {FILTERED_PATH}")


Saved filtered metadata: ../../data/dEchorate/processed/dEchorate_metadata_filtered.csv


## 5) Prepare HDF5 Inputs


In [None]:
# 5) Build mic indices and design matrix for HDF5 extraction
# Clean numeric columns before saving/using
from unittest import case


cols = ["mic_id", "mic_pos_x", "mic_pos_y", "mic_pos_z"]


clean_df = subset_df.copy()

# convert mic_col to numeric
clean_df[cols] = pd.to_numeric(clean_df[cols])

# if clean_df[pos_cols].isna().any().any():
#     clean_df[pos_cols] = clean_df[pos_cols].fillna(clean_df[pos_cols].median())

clean_df = (
    clean_df.dropna().drop_duplicates().sort_values(by="mic_col").reset_index(drop=True)
)


print(f"Cleaned rows: {len(clean_df):,} (dropped {len(subset_df)-len(clean_df):,})")


ValueError: Unable to parse string "10.423.572" at position 4

In [None]:
display(case.head(5))

Unnamed: 0,room_code,src_id,src_signal,mic_id,mic_pos_x,mic_pos_y,mic_pos_z,room_rfl_west,room_rfl_east,room_rfl_north,room_rfl_south,room_rfl_ceiling,room_rfl_floor
0,0.0,0.0,chirp,0,0.803161,175.5075,0.968617,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,chirp,1,0.840682,175.5075,0.968617,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,chirp,2,0.887583,175.5075,0.968617,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,chirp,3,0.948555,175.5075,0.968617,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,chirp,4,0.847024,175.5075,0.968617,0.0,0.0,0.0,0.0,0.0,0.0


## 6) Cache for HDF5 Notebook


In [None]:
# 5) Save cache for the HDF5 notebook
np.savez(
    "dechorate_metadata_cache.npz",
    mic_indices=mic_indices,
    use_positions=use_positions,
    X_design=X_design,
    ROOM_CODE=ROOM_CODE,
    SRC_ID=SRC_ID,
    SIGNAL_NAME=SIGNAL_NAME,
    ROOM_DIMS=ROOM_DIMS,
    ID_BASE_OFFSET=ID_BASE_OFFSET,
)
print("Saved cache: dechorate_metadata_cache.npz")


Saved cache: dechorate_metadata_cache.npz
