# Analyze dEchorate Metadata (CSV)
Load the CSV metadata, filter by room/source/signal, and prepare outputs for the HDF5 notebook.


In [129]:
# 0) Imports
import re
from pathlib import Path

import numpy as np
import pandas as pd


## 1) Configuration


In [130]:
# 1) User config
# Both paths are relative, so they resolve against the kernel's working directory.
# Raw metadata CSV read by the "Load metadata" cell.
METADATA_PATH = Path("../../data/dEchorate/raw/dEchorate_database.csv")
# Destination of the cleaned, filtered table written by the final cell.
PROCESSED_METADATA_PATH = Path("../../data/dEchorate/processed/dEchorate_database_cleaned.csv")


In [131]:
# 2) Helpers: metadata loading + light validation


def load_metadata(path: Path) -> pd.DataFrame:
    """Read the metadata CSV at ``path`` into a DataFrame.

    The delimiter is auto-detected (``sep=None`` with the Python parser).
    Column names are stripped of surrounding whitespace, and every column
    whose name is empty or starts with ``"Unnamed"`` is removed. A one-line
    summary (file name, row count, column count) is printed.

    Args:
        path: Location of the metadata file; its suffix must be ``.csv``
            (compared case-insensitively).

    Returns:
        The loaded table without the empty-named / ``Unnamed*`` columns.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file suffix is not ``.csv``.
    """
    # Guard clauses: fail early with an explicit message.
    if not path.exists():
        raise FileNotFoundError(f"Metadata file not found: {path.resolve()}")
    if path.suffix.lower() != ".csv":
        raise ValueError(
            f"Unsupported metadata file type: {path.suffix}. Expected .csv"
        )

    table = pd.read_csv(path, sep=None, engine="python")
    table = table.rename(columns=lambda name: str(name).strip())

    # Header cells that were blank in the file come back as "Unnamed: N".
    unwanted = [
        name for name in table.columns if not name or name.startswith("Unnamed")
    ]
    if unwanted:
        table = table.drop(columns=unwanted)

    n_rows, n_cols = table.shape
    print(f"Loaded metadata: {path.name} | rows={n_rows:,} cols={n_cols}")
    return table

## 2) Load and Inspect


In [132]:
# Load the raw metadata table and print a short overview of it.
metadata_df = load_metadata(METADATA_PATH)

# Only the first 30 column names are listed to keep the output compact.
leading_columns = [str(col) for col in metadata_df.columns[:30]]
print("Metadata columns (first 30): " + ", ".join(leading_columns))

print("Preview:")
display(metadata_df.head(5))


Loaded metadata: dEchorate_database.csv | rows=10,912 cols=41
Metadata columns (first 30): filename, src_id, src_ch, src_type, src_signal, src_pos_x, src_pos_y, src_pos_z, room_code, room_rfl_floor, room_rfl_ceiling, room_rfl_west, room_rfl_south, room_rfl_east, room_rfl_north, room_fornitures, room_temperature, rec_silence_dB, rec_artifacts, mic_type, mic_id, mic_ch, mic_pos_x, mic_pos_y, mic_pos_z, array_id, array_bar_x, array_bar_y, array_bar_z, array_bar_pos_x
Preview:


Unnamed: 0,filename,src_id,src_ch,src_type,src_signal,src_pos_x,src_pos_y,src_pos_z,room_code,room_rfl_floor,...,array_bar_pos_z,array_bar_view_x,array_bar_view_y,array_bar_view_z,mic_view_x,mic_view_y,mic_view_z,src_view_x,src_view_y,src_view_z
0,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
1,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
2,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
3,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,
4,2020-01-22__22-48-02,99.0,99.0,silence,silence,10000,0.0,1.0,0.0,0.0,...,0.0,,,,,,,,,


## 3) Filter Metadata


In [133]:
# 3) Filter metadata
ROOM_CODES = [0, 1, 2]
SRC_IDS = [0, 1, 2]
SIGNAL_NAMES = ["chirp"]

# Columns kept for the downstream notebook: the filter keys, the microphone
# position, and the per-surface room_rfl_* columns.
needed_cols = [
    "room_code",
    "src_id",
    "src_signal",
    "mic_id",
    "mic_pos_x",
    "mic_pos_y",
    "mic_pos_z",
    "room_rfl_west",
    "room_rfl_east",
    "room_rfl_north",
    "room_rfl_south",
    "room_rfl_ceiling",
    "room_rfl_floor",
]

# Validate every column used below (filter keys *and* selected columns) in one
# place, so a missing one is reported by a single explicit error.
required_cols = list(needed_cols)
missing = [c for c in required_cols if c not in metadata_df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Signal names are matched case-insensitively: both the column values and the
# configured names are lower-cased before comparison.
signal_names_lower = [str(name).lower() for name in SIGNAL_NAMES]
row_mask = (
    metadata_df["room_code"].isin(ROOM_CODES)
    & metadata_df["src_id"].isin(SRC_IDS)
    & metadata_df["src_signal"].str.lower().isin(signal_names_lower)
)

# .loc with a mask and a column list returns a new frame, leaving metadata_df
# untouched; the index is renumbered from 0.
subset_df = metadata_df.loc[row_mask, needed_cols].reset_index(drop=True)

print(
    f"Filtered case -> rooms={ROOM_CODES}, src={SRC_IDS}, sig={SIGNAL_NAMES} "
    f"| rows={len(subset_df)}"
)

subset_df


Filtered case -> rooms=[0, 1, 2], src=[0, 1, 2], sig=['chirp'] | rows=186


Unnamed: 0,room_code,src_id,src_signal,mic_id,mic_pos_x,mic_pos_y,mic_pos_z,room_rfl_west,room_rfl_east,room_rfl_north,room_rfl_south,room_rfl_ceiling,room_rfl_floor
0,0.0,0.0,chirp,0.0,0.80316092,383.141.445,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,chirp,1.0,0.8406819,384.527.719,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,chirp,2.0,0.88758314,386.260.561,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,chirp,3.0,0.94855474,388.513.257,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,chirp,4.0,10.423.572,391.978.942,104.391.528,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,1.0,2.0,chirp,26.0,304.939.035,344.406.817,149.048.013,0.0,0.0,1.0,0.0,0.0,0.0
182,1.0,2.0,chirp,27.0,300.846.949,34.727.994,149.048.013,0.0,0.0,1.0,0.0,0.0,0.0
183,1.0,2.0,chirp,28.0,295.527.238,351.015,149.048.013,0.0,0.0,1.0,0.0,0.0,0.0
184,1.0,2.0,chirp,29.0,287.343.067,356.761.246,149.048.013,0.0,0.0,1.0,0.0,0.0,0.0


## 4) Prepare HDF5 Inputs


In [134]:
# 4) Clean numeric columns before saving/using

# Work on a copy so subset_df stays untouched and this cell can be re-run.
clean_df = subset_df.copy()


def parse_fixed_8(v):
    """Convert one coordinate cell to a float, undoing thousands-style grouping.

    The input is stringified, stripped, and commas are turned into dots. It is
    then interpreted by the first rule that applies:

    1. Grouped digits, e.g. ``"383.141.445"`` or ``"351.015"``: an optional
       sign, a leading group of 1-3 digits that does not start with ``0``,
       then one or more ``.ddd`` groups. The separators carry no decimal
       information, so the decimal point is reconstructed:

       * nine or more digits: the last eight are decimals
         (``"383.141.445"`` -> ``3.83141445``);
       * fewer digits: a single integer digit is assumed
         (``"10.423.572"`` -> ``1.0423572``).
         NOTE(review): this presumes such values are below 10 and merely lost
         trailing zeros - confirm against the dataset's coordinate range.

    2. Anything ``float()`` accepts (``"0.8406819"``, ``"-0.5"``, ``"1e-05"``)
       is parsed as-is; non-finite results become NaN.

    3. Otherwise (legacy behaviour) all non-digit characters are removed and
       the remaining digits are read with eight decimals, keeping a leading
       minus sign.

    The name is kept for compatibility with existing callers even though only
    rules 1 and 3 use the eight-decimal interpretation.

    Args:
        v: Cell value of any type; it is converted with ``str()``.

    Returns:
        The parsed ``float``, or ``np.nan`` when no number can be recovered.
    """
    text = str(v).strip().replace(",", ".")

    # Rule 1: thousands-grouped digit string.
    grouped = re.fullmatch(r"([+-]?)([1-9][0-9]{0,2}(?:\.[0-9]{3})+)", text)
    if grouped is not None:
        sign = -1.0 if grouped.group(1) == "-" else 1.0
        digits = grouped.group(2).replace(".", "")
        if len(digits) >= 9:
            return sign * (int(digits) / 1e8)
        # Fewer than nine digits would yield a value below 1 with eight
        # decimals, which a grouped string (non-zero leading digit) cannot be.
        return sign * float(f"{digits[0]}.{digits[1:]}")

    # Rule 2: already a well-formed number - do not rescale it.
    try:
        value = float(text)
    except ValueError:
        value = None
    if value is not None:
        return value if np.isfinite(value) else np.nan

    # Rule 3: legacy fallback for anything else that still contains digits.
    digits = re.sub(r"[^0-9]", "", text)
    if digits == "":
        return np.nan
    sign = -1.0 if text.startswith("-") else 1.0
    return sign * (int(digits) / 1e8)


# Convert the microphone coordinates cell by cell with parse_fixed_8 and store
# them as float32.
for coord_col in ("mic_pos_x", "mic_pos_y", "mic_pos_z"):
    clean_df[coord_col] = clean_df[coord_col].apply(parse_fixed_8).astype("float32")

# mic_id and room_rfl_south are coerced to float32 as well; pd.to_numeric raises
# if a value in either column cannot be parsed as a number.
numeric_cols = ["mic_id", "room_rfl_south"]
clean_df[numeric_cols] = clean_df[numeric_cols].apply(pd.to_numeric).astype("float32")

# Drop incomplete and duplicated rows, then order by microphone. kind="stable"
# keeps rows that share a mic_id in their incoming order, so the result (and
# the CSV saved from it) has a well-defined row order instead of whatever the
# default non-stable sort happens to produce.
clean_df = (
    clean_df.dropna()
    .drop_duplicates()
    .sort_values(by="mic_id", kind="stable")
    .reset_index(drop=True)
)


print(f"Cleaned rows: {len(clean_df):,} (dropped {len(subset_df)-len(clean_df):,})")


Cleaned rows: 180 (dropped 6)


In [135]:
# Quick look at the first five rows of the cleaned table.
cleaned_preview = clean_df.head(5)
display(cleaned_preview)

Unnamed: 0,room_code,src_id,src_signal,mic_id,mic_pos_x,mic_pos_y,mic_pos_z,room_rfl_west,room_rfl_east,room_rfl_north,room_rfl_south,room_rfl_ceiling,room_rfl_floor
0,0.0,0.0,chirp,0.0,0.803161,3.831414,1.043915,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,chirp,0.0,0.803161,3.831414,1.043915,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,chirp,0.0,0.803161,3.831414,1.043915,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,chirp,0.0,0.803161,3.831414,1.043915,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,2.0,chirp,0.0,0.803161,3.831414,1.043915,0.0,0.0,1.0,0.0,0.0,0.0


## 5) Save Cleaned Data


In [136]:
# 5) Save cleaned data for downstream notebooks

# Create the output folder first: to_csv does not create missing directories,
# so the save would fail when the processed/ folder does not exist yet.
PROCESSED_METADATA_PATH.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(PROCESSED_METADATA_PATH, index=False)
print(f"Saved cleaned data: {PROCESSED_METADATA_PATH}")

Saved cleaned data: ../../data/dEchorate/processed/dEchorate_database_cleaned.csv
