In [16]:
# 0) Imports
from pathlib import Path
import numpy as np
import h5py
import matplotlib.pyplot as plt
import pandas as pd


# 1) Configuration

In [17]:
# 1) User config (EDIT THESE)
# All dataset files sit under one dEchorate data directory; only the
# sub-folder and the file name differ between them.
DECHORATE_DIR = Path("../../data/dEchorate")

# Metadata tables (CSV) under the raw/ and processed/ sub-folders
METADATA_PATH = DECHORATE_DIR / "raw" / "dEchorate_database.csv"
PROCESSED_METADATA_PATH = DECHORATE_DIR / "processed" / "dEchorate_database_cleaned.csv"

# HDF5 file that the later cells open to look for the RIR dataset
H5_PATH = DECHORATE_DIR / "raw" / "dEchorate_rirs_gzip7.hdf5"


# 2) Read metadata and inspect

In [18]:
# Read the cleaned metadata table, report where it came from, and preview
# the first rows as the cell's rich output.
metadata_df = pd.read_csv(filepath_or_buffer=PROCESSED_METADATA_PATH)
print(f"Loaded cleaned metadata from {PROCESSED_METADATA_PATH}")
metadata_df.head(n=5)

Loaded cleaned metadata from ../../data/dEchorate/processed/dEchorate_database_cleaned.csv


Unnamed: 0,room_code,src_id,src_signal,mic_id,mic_pos_x,mic_pos_y,mic_pos_z,room_rfl_west,room_rfl_east,room_rfl_north,room_rfl_south,room_rfl_ceiling,room_rfl_floor
0,0.0,0.0,chirp,0.0,0.803161,3.831415,1.043915,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,chirp,0.0,0.803161,3.831415,1.043915,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,chirp,0.0,0.803161,3.831415,1.043915,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,chirp,0.0,0.803161,3.831415,1.043915,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,2.0,chirp,0.0,0.803161,3.831415,1.043915,0.0,0.0,1.0,0.0,0.0,0.0


## Metadata selection
Select room/source/signal and build mic indices + positions for HDF5 lookup.


In [None]:
# 3) Metadata selection + design matrix
T0 = 4096  # number of leading RIR samples kept per microphone

# mic-position normalization constants (approx room size used in dEchorate)
ROOM_DIMS = np.array([6.0, 6.0, 2.4], dtype=np.float32)
ID_BASE_OFFSET = 0  # typical is 0, but some datasets use 1

# Select a single room/src/signal for extraction
ROOM_CODE = 0
SRC_ID = 0
SIGNAL_NAME = "chirp"

# Fail early, with the full list, if the metadata table lacks any column
# this cell reads.
required_cols = [
    "room_code",
    "src_id",
    "src_signal",
    "mic_id",
    "mic_pos_x",
    "mic_pos_y",
    "mic_pos_z",
]
missing = [c for c in required_cols if c not in metadata_df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Plain boolean masks instead of DataFrame.query: the filter needs a string
# accessor (`.str.lower()`) and a method call on a Python variable, and masks
# evaluate both the same way regardless of the eval engine/parser in use.
# Rows with missing values compare as False and are therefore excluded.
selection_mask = (
    (metadata_df["room_code"] == ROOM_CODE)
    & (metadata_df["src_id"] == SRC_ID)
    & (metadata_df["src_signal"].str.lower() == SIGNAL_NAME.lower())
)
subset_df = metadata_df.loc[selection_mask].copy()

# One row per microphone, ordered by mic id, with complete position info.
mic_meta = (
    subset_df.dropna(subset=["mic_id", "mic_pos_x", "mic_pos_y", "mic_pos_z"])
    .drop_duplicates(subset=["mic_id"])
    .sort_values("mic_id")
)

# An empty selection would otherwise only surface later as an obscure
# "need at least one array to stack" error during RIR extraction.
if mic_meta.empty:
    raise ValueError(
        "Metadata selection is empty for "
        f"ROOM_CODE={ROOM_CODE} SRC_ID={SRC_ID} SIGNAL_NAME={SIGNAL_NAME}. "
        "Check the selection constants against the values in metadata_df."
    )

mic_indices = mic_meta["mic_id"].astype(int).to_numpy()
use_positions = True
# Design matrix: one (x, y, z) microphone position per row, as float32.
X_design = (
    mic_meta[["mic_pos_x", "mic_pos_y", "mic_pos_z"]].to_numpy().astype(np.float32)
)

print(
    f"Filtered rows: {len(subset_df)} | mics: {len(mic_indices)} | "
    f"ROOM_CODE={ROOM_CODE} SRC_ID={SRC_ID} SIGNAL_NAME={SIGNAL_NAME}"
)


KeyError: "Missing required variables: mic_indices, use_positions, X_design, ROOM_CODE, SRC_ID, SIGNAL_NAME. Run `notebooks/dechorte_analysis/01_metadata_csv_analysis.ipynb` and optionally save a cache: np.savez('dechorate_metadata_cache.npz', ...)."

## Explore HDF5 contents
List dataset candidates and infer axes.


In [None]:
# 4) Quick HDF5 overview (datasets + shapes)
with h5py.File(H5_PATH, "r") as f:
    print(f"Top-level keys: {list(f.keys())}")
    datasets = []

    def _collect(name, obj):
        """visititems callback: record path, shape and dtype of each dataset."""
        if not isinstance(obj, h5py.Dataset):
            return
        datasets.append({"path": name, "shape": obj.shape, "dtype": str(obj.dtype)})

    f.visititems(_collect)
    if not datasets:
        print("No datasets found in HDF5 file.")
    else:
        # Tabulate what was found (sorted by path) and show at most 20 rows.
        ds_df = pd.DataFrame(datasets).sort_values("path")
        print(f"Found {len(ds_df)} datasets")
        print(ds_df.head(20).to_string(index=False))
        # Read a one-element slab along every axis of the first dataset.
        first_path = ds_df["path"].iloc[0]
        sample_ds = f[first_path]
        sample = sample_ds[(slice(0, 1),) * sample_ds.ndim]
        print(f"Sample from {first_path}: shape={sample.shape}")


In [10]:
# 4) HDF5: inspect keys, find RIR dataset, infer axis order
def list_h5_tree(h5obj, prefix=""):
    """Print one line per group/dataset reachable from ``h5obj``, depth first.

    Datasets are printed with their shape and dtype; any other child is
    treated as a group, printed with a trailing slash, and descended into.
    ``prefix`` is prepended to every printed name.
    """
    for key in h5obj.keys():
        child = h5obj[key]
        if not isinstance(child, h5py.Dataset):
            print(f"H5 group:   {prefix}{key}/")
            list_h5_tree(child, prefix=f"{prefix}{key}/")
            continue
        print(f"H5 dataset: {prefix}{key} shape={child.shape} dtype={child.dtype}")


def find_candidate_rir_datasets(h5obj):
    """Collect paths of datasets whose name suggests an RIR tensor.

    A dataset qualifies when its lower-cased name contains "rir" (or is
    exactly "ir") and it has at least 3 dimensions. Paths are returned in
    depth-first traversal order, joined with "/".
    """

    def _iter_datasets(group, parent=""):
        # Yield (lower-cased name, full path, dataset) for every dataset
        # below ``group``; anything that is not a dataset is descended into.
        for key in group.keys():
            child = group[key]
            child_path = f"{parent}/{key}" if parent else key
            if isinstance(child, h5py.Dataset):
                yield key.lower(), child_path, child
            else:
                yield from _iter_datasets(child, child_path)

    return [
        path
        for name, path, dataset in _iter_datasets(h5obj)
        if ("rir" in name or name == "ir") and dataset.ndim >= 3
    ]


def infer_axes(shape, expected_mics=(30,), expected_srcs=(6,), expected_rooms=(11,)):
    """
    Guess which axis of ``shape`` is time, mic, source and room from sizes alone.

    The largest dimension is taken as the time axis (first one on ties).
    Among the other axes, the first whose size appears in ``expected_mics``
    becomes the mic axis, then likewise for ``expected_srcs`` and
    ``expected_rooms``; an axis is never assigned twice.

    Returns a dict with keys "time", "mic", "src", "room"; a role that
    could not be matched maps to None.
    """
    dims = list(shape)
    time_ax = int(np.argmax(dims))
    unassigned = [ax for ax in range(len(dims)) if ax != time_ax]

    result = {"time": time_ax}
    role_sizes = (
        ("mic", expected_mics),
        ("src", expected_srcs),
        ("room", expected_rooms),
    )
    for role, sizes in role_sizes:
        # First still-unassigned axis whose length is one of the expected sizes.
        match = next((ax for ax in unassigned if dims[ax] in sizes), None)
        if match is not None:
            unassigned.remove(match)
        result[role] = match

    return result


def extract_rir_segment(rirs_ds, axes, room_idx, src_idx, mic_idx, T0):
    """Slice a single RIR and return its first T0 samples as 1D float32.

    Parameters
    ----------
    rirs_ds : array-like with ``ndim`` and tuple indexing (e.g. an h5py
        dataset or a NumPy array).
    axes : dict with keys "time", "mic", "src", "room" giving axis positions
        in ``rirs_ds``; "mic"/"src"/"room" may be None, in which case that
        axis is not indexed.
    room_idx, src_idx, mic_idx : integer positions along the respective axes.
    T0 : number of leading time samples to keep.

    Returns
    -------
    1D float32 array with at most T0 samples.
    """
    sl = [slice(None)] * rirs_ds.ndim
    for role, idx in (("room", room_idx), ("src", src_idx), ("mic", mic_idx)):
        ax = axes[role]
        if ax is not None:
            sl[ax] = int(idx)

    x = np.asarray(rirs_ds[tuple(sl)])

    # Integer indexing removes the indexed axes from the result, so the time
    # axis position must be shifted by the number of removed axes that came
    # before it. Using the original position raised AxisError whenever the
    # time axis was not the first axis of the dataset.
    time_ax = int(axes["time"])
    if time_ax < 0:
        time_ax += rirs_ds.ndim
    n_removed_before = sum(1 for s in sl[:time_ax] if isinstance(s, int))

    # Move time axis to front, flatten any leftovers
    x = np.moveaxis(x, time_ax - n_removed_before, 0)
    x = x.reshape(x.shape[0], -1)
    x = x[:, 0]  # pick first if extra (un-indexed) dims remain
    return x[:T0].astype(np.float32, copy=False)


## Load HDF5 and extract early RIRs
Pick a dataset, infer axes, and build `Y_echo` for all mics in the subset.


In [11]:
# 4) Open HDF5, find RIR dataset, infer axes
# Fail before touching the file if the metadata selection produced no mics;
# np.stack on an empty list would otherwise raise a much less helpful error.
if len(mic_indices) == 0:
    raise ValueError(
        "mic_indices is empty: nothing to extract. Check the metadata selection."
    )

with h5py.File(H5_PATH, "r") as f:
    print(f"Opened HDF5: {H5_PATH.name}")
    # optional: uncomment if you want a full tree log
    # list_h5_tree(f)

    candidates = find_candidate_rir_datasets(f)
    if not candidates:
        # fallback: list top-level to help debugging
        print("No obvious RIR datasets found (by name). Top-level keys are:")
        print(list(f.keys()))
        raise KeyError(
            "Could not auto-detect RIR dataset. Inspect keys and set it manually."
        )

    print("Candidate RIR datasets:")
    for p in candidates:
        ds = f[p]
        print(f"  - {p} shape={ds.shape} dtype={ds.dtype}")

    # Choose the first candidate by default
    RIR_PATH = candidates[0]
    rirs_ds = f[RIR_PATH]
    print(f"Selected RIR dataset: {RIR_PATH} | shape={rirs_ds.shape}")

    axes = infer_axes(
        rirs_ds.shape, expected_mics=(30,), expected_srcs=(6,), expected_rooms=(11,)
    )
    print(f"Inferred axes: {axes} (None means 'not detected')")

    # If src/room axes weren't detected, we can still try slicing by assuming axis order,
    # but simplest is to print shape and set axes manually.
    if axes["mic"] is None or axes["src"] is None or axes["room"] is None:
        print("Could not infer all axes reliably.")
        print("Print rirs_ds.shape and set axes manually if needed.")
        print(f"rirs_ds.shape={rirs_ds.shape}")

    # 5) Extract Y_echo: early RIR segments for each mic
    # ID_BASE_OFFSET converts metadata ids to 0-based dataset positions; it is
    # applied to room, source AND mic ids so the three stay consistent
    # (with the default offset of 0 this is a no-op).
    room_idx = ROOM_CODE - ID_BASE_OFFSET
    src_idx = SRC_ID - ID_BASE_OFFSET

    Y_list = [
        extract_rir_segment(
            rirs_ds,
            axes,
            room_idx=room_idx,
            src_idx=src_idx,
            mic_idx=int(m) - ID_BASE_OFFSET,
            T0=T0,
        )
        for m in mic_indices
    ]

    Y_echo = np.stack(Y_list, axis=0)  # (N, T0)
    print(f"Extracted Y_echo (raw) | shape={Y_echo.shape} dtype={Y_echo.dtype}")


Opened HDF5: dEchorate_rirs_gzip7.hdf5


RuntimeError: Unable to get group info (addr overflow, addr = 2536, size = 328, eoa = 2048)

## Inspect raw RIRs
Quick waveform plots and basic statistics.


In [None]:
# Overlay the first few extracted segments, then print summary statistics
# of the whole Y_echo array.
print("Plotting a few raw RIR segments...")
fig, ax = plt.subplots(figsize=(10, 4))
for segment in Y_echo[:5]:  # at most five rows, fewer if Y_echo is shorter
    ax.plot(segment, alpha=0.8)
ax.set_title("Raw RIR segments (first few mics)")
ax.set_xlabel("Sample")
ax.set_ylabel("Amplitude")
fig.tight_layout()
plt.show()

print("Y_echo summary:")
print(f"  shape: {Y_echo.shape}")
print(f"  min/max: {Y_echo.min():.4g} / {Y_echo.max():.4g}")
print(f"  mean/std: {Y_echo.mean():.4g} / {Y_echo.std():.4g}")


## Normalize Y and X
Normalize the RIR segments and the microphone design matrix.


In [None]:
# 6) Normalization (simple, consistent)
# 6.1 Normalize Y_echo (waveform features)
# - remove the DC offset of each RIR segment (row-wise mean).
#   The centred data gets its own name so the raw Y_echo is left untouched:
#   re-running the "Inspect raw RIRs" cell still shows raw data, and this
#   cell always starts from the same input.
Y_echo_centered = Y_echo - Y_echo.mean(axis=1, keepdims=True)

# - global standardization across the entire subset (recommended)
mu = float(Y_echo_centered.mean())
sigma = float(Y_echo_centered.std() + 1e-8)  # epsilon guards against division by zero
Y_echo_norm = (Y_echo_centered - mu) / sigma

# optional clipping for stability
Y_echo_norm = np.clip(Y_echo_norm, -5.0, 5.0).astype(np.float32)

print(f"Y_echo normalization: mu={mu:.6g}, sigma={sigma:.6g}")
print(
    "Y_echo_norm stats: mean=%.4f, std=%.4f" % (Y_echo_norm.mean(), Y_echo_norm.std())
)

# 6.2 Normalize X_design
if use_positions:
    # simple room-dimension scaling: divide each coordinate by the room
    # extent along that axis (X_design itself is not modified)
    X_design_norm = X_design.astype(np.float32) / ROOM_DIMS.reshape(1, 3)
    print(f"X_design_norm (positions/ROOM_DIMS) | shape={X_design_norm.shape}")
else:
    # discrete labels
    X_design_norm = X_design
    print("X_design is discrete mic indices; no normalization applied.")


## Inspect mic positions
Ranges and a 3D scatter when positions are available.


In [None]:
# Report per-axis coordinate ranges and draw the microphone positions in 3D;
# without position data only a notice is printed.
if not use_positions:
    print("No mic positions available; using discrete mic indices.")
else:
    print("Mic position ranges (meters):")
    for col, axis_name in enumerate("xyz"):
        coords = X_design[:, col]
        print(f"  {axis_name}: [{coords.min():.3f}, {coords.max():.3f}]")

    # simple 3D scatter (matplotlib default colors)
    fig = plt.figure(figsize=(6, 5))
    ax = fig.add_subplot(111, projection="3d")
    xs, ys, zs = X_design[:, 0], X_design[:, 1], X_design[:, 2]
    ax.scatter(xs, ys, zs, s=30)
    ax.set_title("Mic positions (meters) for the selected room+source")
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")
    plt.tight_layout()
    plt.show()


## Final tensors
Ready to train inverse models. Optionally save to disk.


In [None]:
# Final shapes of the two tensors produced by this notebook.
print(
    "READY FOR TRAINING:",
    f"  Inverse input  (Y_echo_norm): {Y_echo_norm.shape}",
    f"  Inverse output (X_design_norm): {np.shape(X_design_norm)}",
    sep="\n",
)

# Optional: save for reuse
# np.save("Y_echo_norm.npy", Y_echo_norm)
# np.save("X_design_norm.npy", X_design_norm)
# print("Saved: Y_echo_norm.npy, X_design_norm.npy")
