# EDA de grabaciones VizDoom
Este notebook inspecciona la **estructura y formato** de archivos de grabación: `.npz`, `.parquet`, `.json` y `.mkv`.

> Modo muestra por defecto: pocas sesiones y pocos archivos por tipo para evitar salida excesiva.

In [1]:
from pathlib import Path
from typing import Any
import json

import cv2
import numpy as np
import pandas as pd

try:
    from fastparquet import ParquetFile
except Exception:
    ParquetFile = None

# Sampling configuration
# Root folder; every sub-directory inside it is treated as one recording session.
RECORDINGS_PATH = Path("recordings")
SESSION = "session_20260205_194411"  # None to pick sessions by sampling
# Number of sessions inspected when SESSION is None; a value <= 0 means "all sessions".
MAX_SESSIONS = 1
# Number of files summarized per file type; a value <= 0 means "all files".
MAX_FILES_PER_TYPE = 2

In [2]:
def _numeric_stats(arr: np.ndarray) -> str:
    if arr.size == 0:
        return "empty"
    if np.issubdtype(arr.dtype, np.number):
        return f"min={arr.min()} max={arr.max()}"
    return "non-numeric"


def summarize_npz(npz_path: Path) -> dict[str, Any]:
    """Summarize every array stored in a ``.npz`` archive.

    The archive is opened with ``allow_pickle=False``. Each array is loaded in
    turn and described by its key, shape, dtype and value range.

    Returns:
        ``{"file": <file name>, "arrays": [<one dict per stored array>]}``
        where each array dict has the keys ``key``, ``shape``, ``dtype`` and
        ``stats``.
    """
    file_name = npz_path.name
    described: list[dict[str, Any]] = []
    with np.load(npz_path, allow_pickle=False) as archive:
        for name in archive.files:
            values = archive[name]
            described.append(
                dict(
                    key=name,
                    shape=tuple(values.shape),
                    dtype=str(values.dtype),
                    stats=_numeric_stats(values),
                )
            )
    return {"file": file_name, "arrays": described}


def summarize_json(json_path: Path) -> dict[str, Any]:
    """Describe the top-level structure of a UTF-8 JSON file.

    Returns:
        A dict that always holds ``file`` and ``root_type``. For an object
        root it adds ``num_keys``, ``keys_preview`` (first 15 keys) and
        ``value_types`` (type name of every value). For an array root it adds
        ``length`` and, when the array is non-empty, ``item_type`` (type name
        of the first element).
    """
    with json_path.open("r", encoding="utf-8") as handle:
        document = json.load(handle)

    report: dict[str, Any] = {
        "file": json_path.name,
        "root_type": type(document).__name__,
    }

    if isinstance(document, dict):
        report.update(
            num_keys=len(document),
            keys_preview=list(document)[:15],
            value_types={name: type(item).__name__ for name, item in document.items()},
        )
        return report

    if isinstance(document, list):
        report["length"] = len(document)
        if document:
            report["item_type"] = type(document[0]).__name__
    return report

In [3]:
def summarize_parquet(parquet_path: Path) -> dict[str, Any]:
    """Summarize row count, columns and dtypes of a Parquet file.

    When the module-level ``ParquetFile`` is available it is used and the
    result also carries ``num_row_groups``. When it is ``None`` the file is
    loaded into a DataFrame with ``pd.read_parquet`` instead, and
    ``num_row_groups`` is not reported.

    Returns:
        A dict with ``file``, ``rows``, ``columns``, ``dtypes`` and, on the
        ``ParquetFile`` path only, ``num_row_groups``.
    """
    file_name = parquet_path.name

    if ParquetFile is None:
        # Fallback: let pandas pick whichever Parquet engine it has.
        frame = pd.read_parquet(parquet_path)
        return {
            "file": file_name,
            "rows": len(frame),
            "columns": list(frame.columns),
            "dtypes": {name: str(kind) for name, kind in frame.dtypes.items()},
        }

    handle = ParquetFile(str(parquet_path))
    return {
        "file": file_name,
        "rows": int(handle.count()),
        "columns": list(handle.columns),
        "num_row_groups": len(handle.row_groups),
        "dtypes": {name: str(kind) for name, kind in handle.dtypes.items()},
    }


def summarize_mkv(video_path: Path) -> dict[str, Any]:
    """Summarize container properties and the first frame of a video file.

    The capture handle is released on every exit path, including the
    "could not open" early return and any exception raised while probing.

    Returns:
        A dict that always holds ``file``. If the video cannot be opened it
        only adds ``error``. Otherwise it adds ``frame_count``, ``fps``,
        ``resolution`` as ``(height, width)`` and ``duration_s`` (``None``
        when the reported fps is not positive). When a first frame can be
        read it also adds ``frame_shape``, ``frame_dtype`` and
        ``frame_stats``.
    """
    summary: dict[str, Any] = {"file": video_path.name}
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            summary["error"] = "No se pudo abrir el video"
            return summary

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = float(cap.get(cv2.CAP_PROP_FPS))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        summary["frame_count"] = frame_count
        summary["fps"] = fps
        # Stored rows-first so it lines up with the leading axes of frame_shape.
        summary["resolution"] = (height, width)
        # Guard against division by zero when the backend reports no fps.
        summary["duration_s"] = (frame_count / fps) if fps > 0 else None

        ok, frame = cap.read()
        if ok and frame is not None:
            summary["frame_shape"] = tuple(frame.shape)
            summary["frame_dtype"] = str(frame.dtype)
            summary["frame_stats"] = _numeric_stats(frame)
        return summary
    finally:
        # Always free the decoder/file handle, even on errors or early return.
        cap.release()

In [4]:
def _sample_files(files: list[Path], max_files: int) -> tuple[list[Path], int]:
    if max_files <= 0:
        return files, 0
    sampled = files[:max_files]
    omitted = max(0, len(files) - len(sampled))
    return sampled, omitted


def _print_npz_entry(path: Path) -> None:
    """Print the summary of one ``.npz`` file: its name and one line per array."""
    info = summarize_npz(path)
    print(f"- {info['file']}")
    for arr in info["arrays"]:
        print(
            f"  • key={arr['key']} shape={arr['shape']} dtype={arr['dtype']} {arr['stats']}"
        )


def _print_parquet_entry(path: Path) -> None:
    """Print the summary of one ``.parquet`` file: rows, row groups, columns, dtypes."""
    info = summarize_parquet(path)
    print(f"- {info['file']} | rows={info.get('rows')} | row_groups={info.get('num_row_groups', 'n/a')}")
    print(f"  • columns: {info.get('columns', [])}")
    print(f"  • dtypes: {info.get('dtypes', {})}")


def _print_json_entry(path: Path) -> None:
    """Print the summary of one ``.json`` file: root type plus object or array details."""
    info = summarize_json(path)
    print(f"- {info['file']} | root={info['root_type']}")
    if "num_keys" in info:
        print(f"  • num_keys={info['num_keys']}")
        print(f"  • keys_preview={info['keys_preview']}")
        print(f"  • value_types={info['value_types']}")
    if "length" in info:
        print(f"  • length={info['length']} item_type={info.get('item_type')}")


def _print_mkv_entry(path: Path) -> None:
    """Print the summary of one ``.mkv`` file, or its error if it could not be opened."""
    info = summarize_mkv(path)
    if "error" in info:
        print(f"- {info['file']} | error={info['error']}")
        return
    print(
        f"- {info['file']} | frames={info.get('frame_count')} fps={info.get('fps')} res={info.get('resolution')} duration_s={info.get('duration_s')}"
    )
    if "frame_shape" in info:
        print(
            f"  • sample_frame shape={info['frame_shape']} dtype={info['frame_dtype']} {info['frame_stats']}"
        )


def _print_section(header: str, label: str, files: list[Path], max_files: int, print_entry) -> None:
    """Print one file-type section: header, sampled entries, and the omitted count.

    Nothing is printed when ``files`` is empty. ``header`` goes inside the
    ``[...]`` title, ``label`` is the type name used in the "omitted" line,
    and ``print_entry`` is a callable that takes a single ``Path`` and prints
    its summary.
    """
    if not files:
        return
    print(f"\n[{header}]")
    sample, omitted = _sample_files(files, max_files)
    for path in sample:
        print_entry(path)
    if omitted:
        print(f"  • ... ({omitted} archivos {label} omitidos en la muestra)")


def inspect_session(session_path: Path, max_files_per_type: int) -> None:
    """Print a structural report of the recording files found in one session folder.

    Only files directly inside ``session_path`` are considered (``*.npz``,
    ``*.parquet``, ``*.json`` and ``*.mkv``), each group sorted by path. At
    most ``max_files_per_type`` files are summarized per type; the rest are
    reported as omitted. File types with no matches produce no section.

    Args:
        session_path: Directory of the session to inspect.
        max_files_per_type: Per-type sample size, passed on to ``_sample_files``.
    """
    print(f"\n=== SESSION: {session_path.name} ===")

    # Collect every group up front so the totals line precedes all sections.
    npz_files = sorted(session_path.glob("*.npz"))
    parquet_files = sorted(session_path.glob("*.parquet"))
    json_files = sorted(session_path.glob("*.json"))
    mkv_files = sorted(session_path.glob("*.mkv"))

    print(
        f"Archivos detectados -> npz: {len(npz_files)}, parquet: {len(parquet_files)}, json: {len(json_files)}, mkv: {len(mkv_files)}"
    )

    _print_section("NPZ", "NPZ", npz_files, max_files_per_type, _print_npz_entry)
    _print_section("PARQUET", "Parquet", parquet_files, max_files_per_type, _print_parquet_entry)
    _print_section("JSON", "JSON", json_files, max_files_per_type, _print_json_entry)
    _print_section("MKV", "MKV", mkv_files, max_files_per_type, _print_mkv_entry)


def resolve_sessions(recordings_path: Path, session: str | None, max_sessions: int) -> tuple[list[Path], int]:
    if session:
        explicit = recordings_path / session
        if explicit.exists() and explicit.is_dir():
            return [explicit], 0
        raise FileNotFoundError(f"No se encontró la sesión: {explicit}")

    sessions = sorted([entry for entry in recordings_path.iterdir() if entry.is_dir()])
    if not sessions:
        raise FileNotFoundError(f"No hay sesiones dentro de: {recordings_path}")
    if max_sessions <= 0:
        return sessions, 0
    sampled = sessions[:max_sessions]
    omitted = max(0, len(sessions) - len(sampled))
    return sampled, omitted

## Ejecutar inspección
Ajusta `SESSION`, `MAX_SESSIONS` o `MAX_FILES_PER_TYPE` en la celda de configuración y luego ejecuta esta celda.

In [5]:
# Fail fast when the recordings root is missing or is not a directory.
if not (RECORDINGS_PATH.exists() and RECORDINGS_PATH.is_dir()):
    raise FileNotFoundError(f"No se encontró la carpeta recordings: {RECORDINGS_PATH}")

# Resolve which sessions to report on, then print one report per session.
sessions, omitted_sessions = resolve_sessions(
    RECORDINGS_PATH,
    SESSION,
    MAX_SESSIONS,
)
for session_path in sessions:
    inspect_session(session_path, MAX_FILES_PER_TYPE)

# Mention sessions that were left out of the sample, if any.
if omitted_sessions:
    print(f"\n... ({omitted_sessions} sesiones omitidas en la muestra)")


=== SESSION: session_20260205_194411 ===
Archivos detectados -> npz: 22, parquet: 1, json: 1, mkv: 1

[NPZ]
- depth_chunk_000.npz
  • key=frames shape=(350, 240, 320) dtype=uint16 min=0 max=255
- depth_chunk_001.npz
  • key=frames shape=(350, 240, 320) dtype=uint16 min=0 max=255
  • ... (20 archivos NPZ omitidos en la muestra)

[PARQUET]
- meta.parquet | rows=3735 | row_groups=1
  • columns: ['t_index', 'action_bin', 'action_names', 'reward', 'is_terminal', 'is_timeout', 'lives', 'health', 'armor', 'killcount', 'timestamp_s', 'terminal_reason', 'doom_wad', 'doom_map', 'doom_skill', 'selected_weapon', 'selected_weapon_ammo', 'ammo1', 'ammo2', 'ammo3', 'ammo4', 'weapon1', 'weapon2', 'weapon3', 'weapon4', 'weapon5', 'weapon6', 'weapon7', 'cumulative_reward_video']
  • dtypes: {'t_index': 'int64', 'action_bin': 'object', 'action_names': 'object', 'reward': 'float64', 'is_terminal': 'bool', 'is_timeout': 'bool', 'lives': 'int64', 'health': 'float64', 'armor': 'float64', 'killcount': 'float