In [None]:
from pathlib import Path
import pandas as pd
import json
import warnings
import soundfile as sf

# Default location of the dataset files read by load_amp_datasets().
DATA_DIR = Path("/Users/jia/datasets/amps")


def _read_or_path(reader, path, label):
    """Return reader(path); on any failure warn and return ``path`` itself."""
    try:
        return reader(path)
    except Exception as e:  # best-effort: a bad file must not abort the scan
        warnings.warn(f"{label} load failed for {path}: {e}")
        return path


def _read_json_any(path):
    """Read a .json file, trying ndjson, then pandas JSON, then json.load."""
    try:
        return pd.read_json(path, lines=True)
    except Exception:
        try:
            return pd.read_json(path)
        except Exception:
            # Last resort: plain Python objects. A failure here propagates
            # to the caller, which records the Path instead.
            with path.open("r", encoding="utf-8") as f:
                return json.load(f)


def _read_audio(path, sf):
    """Read an audio file with the soundfile module passed in as ``sf``."""
    data, sr = sf.read(str(path))
    return {"audio": data, "samplerate": sr}


def _load_file(p, sf):
    """Load one file according to its lower-cased suffix.

    ``sf`` is the soundfile module, or None when it could not be imported.
    Unknown suffixes (and audio files when ``sf`` is None) yield the Path.
    """
    suf = p.suffix.lower()
    if suf == ".csv":
        return pd.read_csv(p)
    if suf in (".tsv", ".tab"):
        return pd.read_csv(p, sep="\t")
    if suf == ".parquet":
        return _read_or_path(pd.read_parquet, p, "parquet")
    if suf == ".feather":
        return _read_or_path(pd.read_feather, p, "feather")
    if suf == ".json":
        return _read_json_any(p)
    if suf in (".ndjson", ".jsonl"):
        return pd.read_json(p, lines=True)
    if suf in (".pkl", ".pickle"):
        # NOTE: unpickling executes arbitrary code; only point this loader
        # at directories whose contents you trust.
        return _read_or_path(pd.read_pickle, p, "pickle")
    if suf in (".xls", ".xlsx"):
        return _read_or_path(pd.read_excel, p, "excel")
    if suf in (".txt", ".log"):
        return p.read_text(encoding="utf-8", errors="replace")
    if suf in (".wav", ".mp3", ".flac") and sf is not None:
        return _read_or_path(lambda q: _read_audio(q, sf), p, "audio")
    # unknown type: store Path so it can be handled later
    return p


def _unique_key(path, root, taken):
    """Pick a key for ``path`` that is not already present in ``taken``.

    Tries the stem, then the full file name, then the path relative to
    ``root``; if all are used, appends a counter to the relative path.
    """
    relative = path.relative_to(root).as_posix()
    for candidate in (path.stem, path.name, relative):
        if candidate not in taken:
            return candidate
    n = 2
    while f"{relative} ({n})" in taken:
        n += 1
    return f"{relative} ({n})"


def _describe(obj):
    """Return the short type/size description used in the printed summary."""
    if isinstance(obj, pd.DataFrame):
        return f"DataFrame {obj.shape}"
    if isinstance(obj, pd.Series):
        return f"Series {obj.shape}"
    if isinstance(obj, dict):
        # Only treat the dict as loaded audio when it really looks like one;
        # a JSON object may also happen to contain an "audio" key.
        if "samplerate" in obj and hasattr(obj.get("audio"), "shape"):
            return f"Audio {obj['audio'].shape} @ {obj['samplerate']}Hz"
        return f"dict len={len(obj)}"
    if isinstance(obj, (list, tuple)):
        return f"{type(obj).__name__} len={len(obj)}"
    if isinstance(obj, Path):
        return f"Path ({obj.suffix})"
    if isinstance(obj, str):
        return f"text len={len(obj)}"
    return type(obj).__name__


def load_amp_datasets(directory=DATA_DIR, recursive=False):
    """
    Load files from directory into a dict keyed by filename (stem).

    Supports CSV, TSV, JSON (ndjson or standard), parquet, feather, pickle,
    excel, text, and common audio (wav/mp3/flac) if soundfile is available.

    Parameters:
        directory: directory to scan; a ``str`` or ``Path``.
        recursive: when True, also descend into sub-directories.

    Returns:
        dict mapping a key to the loaded object (DataFrame, text, audio
        dict, parsed JSON) or to the file's Path when the type is unknown
        or loading failed. The key is the file stem; on a clash the file
        name, then the path relative to ``directory``, is used so that no
        entry is overwritten.

    Raises:
        FileNotFoundError: if ``directory`` does not exist.

    A short summary of what was loaded is printed to stdout.
    """
    directory = Path(directory)
    if not directory.exists():
        raise FileNotFoundError(f"{directory} does not exist")
    files = directory.rglob("*") if recursive else directory.iterdir()

    # optional audio loader: imported here so that a missing soundfile (or
    # a missing libsndfile, which surfaces as OSError) only disables audio
    try:
        import soundfile as sf
    except (ImportError, OSError):
        sf = None

    datasets = {}
    for p in sorted(files):
        if p.is_dir():
            continue
        try:
            obj = _load_file(p, sf)
        except Exception as e:  # keep scanning; remember the file by Path
            warnings.warn(f"Failed to load {p}: {e}")
            obj = p
        datasets[_unique_key(p, directory, datasets)] = obj

    # print a short summary
    print(f"Loaded {len(datasets)} items from {directory}:")
    for name, obj in datasets.items():
        print(" -", f"{name}: {_describe(obj)}")

    return datasets

# Run the loader with its defaults spelled out and keep the result in the
# notebook-level name `datasets`, which later cells read.
datasets = load_amp_datasets(directory=DATA_DIR, recursive=False)

: 