# Prepara CSV por arquivos

## CONSTANTES DE CONFIGURAÇÃO

In [73]:
from __future__ import annotations

import csv
import json
from pathlib import Path
from typing import Optional, Tuple, Iterator, Dict, List
from pynwb import NWBFile
import numpy as np
import pandas as pd
from collections import defaultdict
from pynwb import NWBHDF5IO
from pynwb.file import NWBFile

In [74]:
DATA_DIR = Path("arq")                      # onde estão os .nwb
INDEX_JSON = Path("index.json")             # índice que lista tasks -> arquivos .nwb
OUTPUT_ROOT = Path("outputs_by_task")       # saída por task (subpastas)
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
CONSOLIDATED_DIR = Path("nwb_consolidated_csv_by_task")
CONSOLIDATED_DIR.mkdir(parents=True, exist_ok=True)

## Funções Auxiliares

In [75]:
def list_nwb_files(data_dir: Path) -> List[Path]:
    """Lista arquivos .nwb na pasta (recursivo), ordenados."""
    return sorted([p for p in Path(data_dir).rglob("*.nwb")])

def open_nwb(nwb_path: Path) -> Tuple[NWBHDF5IO, NWBFile]:
    """
    Abre o NWB e retorna (io, nwbfile).
    IMPORTANTE: chame io.close() quando terminar.
    """
    io = NWBHDF5IO(str(nwb_path), mode="r", load_namespaces=True)
    nwb = io.read()
    return io, nwb



## Metadados

In [76]:
def get_epochs_df(nwbfile: NWBFile) -> Optional[pd.DataFrame]:
    """DataFrame CRU de epochs, ou None se não houver."""
    epochs = getattr(nwbfile, "epochs", None)
    return None if epochs is None else epochs.to_dataframe()

def get_units_table(nwbfile: NWBFile) -> Optional[pd.DataFrame]:
    """DataFrame CRU de units (todos os campos presentes no NWB), sem filtrar."""
    units = getattr(nwbfile, "units", None)
    return None if units is None else units.to_dataframe()

def get_session_meta(nwbfile: NWBFile) -> Dict[str, object]:
    """Metadados básicos da sessão (crus)."""
    subj = getattr(nwbfile, "subject", None)
    return {
        "identifier": nwbfile.identifier,
        "session_description": nwbfile.session_description,
        "session_start_time": nwbfile.session_start_time,
        "timestamps_reference_time": nwbfile.timestamps_reference_time,
        "subject_id": getattr(subj, "subject_id", None) if subj else None,
        "subject_description": getattr(subj, "description", None) if subj else None,
        "institution": getattr(nwbfile, "institution", None),
        "lab": getattr(nwbfile, "lab", None),
    }

# Handlers

In [77]:
def get_spike_handles(nwbfile: NWBFile) -> Tuple[Optional[object], Optional[object], Optional[np.ndarray]]:
    """
    Retorna *handles* HDF5 para leitura preguiçosa de spikes:
    (ds_times, ds_index, unit_ids) ou (None, None, None).
    """
    units = getattr(nwbfile, "units", None)
    if units is None or len(units) == 0:
        return None, None, None
    ds_times = units["spike_times"].data
    ds_index = units["spike_times_index"].data
    unit_ids = units.id.data[:]
    return ds_times, ds_index, unit_ids

def get_epochs_df(nwbfile: NWBFile) -> Optional[pd.DataFrame]:
    epochs = getattr(nwbfile, "epochs", None)
    return None if epochs is None else epochs.to_dataframe()


def iter_unit_spike_slices(nwbfile: NWBFile) -> Iterator[Tuple[int, int, int, int]]:
    """Itera (pos, unit_id, start_idx, stop_idx) para fatiar ds_times[start:stop]."""
    ds_times, ds_index, unit_ids = get_spike_handles(nwbfile)
    if ds_times is None:
        return
    start = 0
    for i, stop in enumerate(ds_index[:]):
        yield i, int(unit_ids[i]), int(start), int(stop)
        start = int(stop)

def load_unit_spike_times(ds_times, start_idx: int, stop_idx: int) -> np.ndarray:
    """Carrega os spikes de UMA unidade (fatia do dataset)."""
    return np.asarray(ds_times[start_idx:stop_idx])


def get_behavior_interfaces(nwbfile: NWBFile) -> Dict[str, object]:
    """Lista as interfaces de comportamento dentro de processing/behavior."""
    beh = nwbfile.processing.get("behavior", None)
    return {} if beh is None else dict(beh.data_interfaces.items())

def get_behavior_time_series(
    nwbfile: NWBFile,
    name_filter: Optional[str] = None
) -> List[Dict[str, object]]:
    """
    Retorna lista de séries de comportamento cruas.
    Cada item = {"name", "data", "timestamps", "rate", "raw_obj"}.
    """
    out: List[Dict[str, object]] = []
    for name, obj in get_behavior_interfaces(nwbfile).items():
        # Containers com múltiplas séries
        if hasattr(obj, "time_series") and obj.time_series:
            for ts_name, ts in obj.time_series.items():
                nm = ts.name or ts_name or name
                if name_filter and name_filter.lower() not in nm.lower():
                    continue
                data = np.asarray(ts.data)
                t = np.asarray(ts.timestamps) if ts.timestamps is not None else None
                out.append({"name": nm, "data": data, "timestamps": t,
                            "rate": getattr(ts, "rate", None), "raw_obj": ts})
        # Séries espaciais
        if hasattr(obj, "spatial_series") and obj.spatial_series:
            for ts_name, ts in obj.spatial_series.items():
                nm = ts.name or ts_name or name
                if name_filter and name_filter.lower() not in nm.lower():
                    continue
                data = np.asarray(ts.data)
                t = np.asarray(ts.timestamps) if ts.timestamps is not None else None
                out.append({"name": nm, "data": data, "timestamps": t,
                            "rate": getattr(ts, "rate", None), "raw_obj": ts})
        # Série direta
        if hasattr(obj, "data"):
            nm = getattr(obj, "name", None) or name
            if (name_filter is None) or (name_filter.lower() in (nm or "").lower()):
                data = np.asarray(obj.data)
                t = np.asarray(obj.timestamps) if getattr(obj, "timestamps", None) is not None else None
                out.append({"name": nm, "data": data, "timestamps": t,
                            "rate": getattr(obj, "rate", None), "raw_obj": obj})
    return out

## Spikes

In [78]:
def save_spike_times_to_csv(
    nwbfile: NWBFile,
    output_dir: Path,
    nwb_name: str,
    extra_cols: Optional[Dict[str, object]] = None,
) -> Optional[Path]:
    """
    Usa seus helpers:
      - get_spike_handles(nwbfile)
      - iter_unit_spike_slices(nwbfile)
      - load_unit_spike_times(ds_times, a, b)
    Gera: <nwb_name>_spike_times.csv com colunas [file_id, unit_id, spike_time, ...extras].
    """
    extra_cols = extra_cols or {}
    ds_times, ds_index, unit_ids = get_spike_handles(nwbfile)
    if ds_times is None or ds_index is None:
        print("   Sem spikes (spike_times/spike_times_index ausentes).")
        return None

    file_id = nwbfile.identifier or nwb_name
    rows: List[pd.DataFrame] = []

    for _, uid, a, b in iter_unit_spike_slices(nwbfile):
        st_u = load_unit_spike_times(ds_times, a, b)
        if st_u.size == 0:
            continue
        df_u = pd.DataFrame({"file_id": file_id, "unit_id": int(uid), "spike_time": st_u})
        for k, v in extra_cols.items():
            df_u[k] = v
        rows.append(df_u)

    if not rows:
        print("   Sem spikes para salvar (tabela vazia).")
        return None

    df = pd.concat(rows, ignore_index=True)
    out = Path(output_dir) / f"{nwb_name}_spike_times.csv"
    df.to_csv(out, index=False)
    print(f"   Spikes salvos: {len(df):,} -> {out.name}")
    return out

def save_behavior_series_to_csv(
    nwbfile: NWBFile,
    output_dir: Path,
    nwb_name: str,
    extra_cols: Optional[Dict[str, object]] = None,
    name_filter: Optional[str] = None,
) -> List[Path]:
    """
    Usa seu get_behavior_time_series(...) para exportar cada série:
    - Se 1D -> 1 coluna com o nome
    - Se 2D -> name_dim0, name_dim1, ...
    Inclui timestamps quando existirem.
    """
    extra_cols = extra_cols or {}
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    saved = []
    series_list = get_behavior_time_series(nwbfile, name_filter=name_filter)

    for s in series_list:
        name = s["name"]
        data = np.asarray(s["data"])
        t = np.asarray(s["timestamps"]) if s["timestamps"] is not None else None

        # constrói DF
        if t is not None:
            df = pd.DataFrame({"timestamp": t})
        else:
            df = pd.DataFrame({"idx": np.arange(len(data))})

        if data.ndim == 1:
            df[name] = data
        elif data.ndim == 2:
            for j in range(data.shape[1]):
                df[f"{name}_dim{j}"] = data[:, j]
        else:
            continue  # ignorar nd>2 para manter simples

        for k, v in extra_cols.items():
            df[k] = v

        safe = name.replace("/", "_").replace("\\", "_").replace(" ", "_")
        out = output_dir / f"{nwb_name}__beh_{safe}.csv"
        df.to_csv(out, index=False)
        saved.append(out)

    print(f"   Séries de comportamento salvas: {len(saved)} arquivo(s)")
    return saved

def save_epochs_to_csv(
    nwbfile: NWBFile,
    output_dir: Path,
    nwb_name: str,
    extra_cols: Optional[Dict[str, object]] = None,
) -> Optional[Path]:
    """
    Exporta a tabela de epochs (se existir) para CSV único.
    Usa seu get_epochs_df(nwbfile).
    Nome: <nwb_name>__epochs.csv
    """
    extra_cols = extra_cols or {}
    df = get_epochs_df(nwbfile)
    if df is None or df.empty:
        print("   Nenhum epoch para salvar.")
        return None

    # garante coluna id sequencial (o .to_dataframe geralmente já traz o index)
    df_out = df.reset_index(drop=False).rename(columns={"index": "epoch_id"})
    for k, v in extra_cols.items():
        df_out[k] = v

    out = Path(output_dir) / f"{nwb_name}__epochs.csv"
    df_out.to_csv(out, index=False)
    print(f"   Epochs salvos: {len(df_out):,} -> {out.name}")
    return out


## Loop principal: iterar por task e arquivo da task

In [79]:
import json

DATA_DIR = Path("arq")               # onde estão seus .nwb
INDEX_JSON = Path("index.json")      # mapeia task -> arquivos
OUTPUT_ROOT = Path("outputs_by_task")
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

def _slug(s: str) -> str:
    return (str(s).replace("/", "_").replace("\\", "_")
                 .replace(" ", "_").replace(":", "-"))

# Carrega índice
with open(INDEX_JSON, "r", encoding="utf-8") as f:
    idx = json.load(f)

task_index: dict = idx.get("task_index", {})

# Mapeia arquivos existentes
available = {p.name: p for p in list_nwb_files(DATA_DIR)}

for task_name, files_map in task_index.items():
    task_slug = _slug(task_name)
    out_dir = OUTPUT_ROOT / task_slug
    out_dir.mkdir(parents=True, exist_ok=True)

    expected = set(files_map.keys())
    missing = sorted(expected - set(available.keys()))
    if missing:
        print(f"[{task_name}] faltando {len(missing)} arquivo(s) em {DATA_DIR}: {missing[:3]} ...")

    for fname in sorted(expected & set(available.keys())):
        nwb_path = available[fname]
        nwb_name = nwb_path.stem
        print(f"\n=== Task: {task_name} | Arquivo: {fname} ===")

        io, nwb = open_nwb(nwb_path)
        try:
            # Metadados
            meta = get_session_meta(nwb)
            meta_df = pd.DataFrame([meta])
            meta_df.insert(0, "task", task_name)
            meta_df.insert(0, "source_file", fname)
            meta_df.to_csv(out_dir / f"{nwb_name}__task-{task_slug}__meta.csv", index=False)

            # Units CRU
            df_units = get_units_table(nwb)
            if df_units is not None and not df_units.empty:
                dfu = df_units.copy()
                dfu.insert(0, "task", task_name)
                dfu.insert(0, "source_file", fname)
                dfu.to_csv(out_dir / f"{nwb_name}__task-{task_slug}__units.csv", index=False)

            # Spikes
            save_spike_times_to_csv(
                nwbfile=nwb,
                output_dir=out_dir,
                nwb_name=f"{nwb_name}__task-{task_slug}",
                extra_cols={"source_file": fname, "task": task_name}
            )

            # Séries comportamento (opcional: filtrar por "speed" ou "position")
            save_behavior_series_to_csv(
                nwbfile=nwb,
                output_dir=out_dir,
                nwb_name=f"{nwb_name}__task-{task_slug}",
                extra_cols={"source_file": fname, "task": task_name},
                name_filter=None  # ou "speed" / "position"
            )

            # Epochs
            save_epochs_to_csv(
                nwbfile=nwb,
                output_dir=out_dir,
                nwb_name=f"{nwb_name}__task-{task_slug}",
                extra_cols={"source_file": fname, "task": task_name},
            )

        except Exception as e:
            print(f"ERRO em {fname} ({task_name}): {e}")
        finally:
            io.close()

print("\n✔️ Loop por task concluído. Saídas em:", OUTPUT_ROOT.resolve())



=== Task: Foraging task | Arquivo: 26863_2020-Nov-04_12-46-06.nwb ===
   Spikes salvos: 1,072 -> 26863_2020-Nov-04_12-46-06__task-Foraging_task_spike_times.csv
   Séries de comportamento salvas: 4 arquivo(s)
   Epochs salvos: 1 -> 26863_2020-Nov-04_12-46-06__task-Foraging_task__epochs.csv

=== Task: Foraging task | Arquivo: 26863_2020-Nov-05_10-26-23.nwb ===
   Spikes salvos: 721 -> 26863_2020-Nov-05_10-26-23__task-Foraging_task_spike_times.csv
   Séries de comportamento salvas: 4 arquivo(s)
   Epochs salvos: 1 -> 26863_2020-Nov-05_10-26-23__task-Foraging_task__epochs.csv

=== Task: Foraging task | Arquivo: 26965_2020-Dec-07_15-52-32.nwb ===
   Spikes salvos: 393 -> 26965_2020-Dec-07_15-52-32__task-Foraging_task_spike_times.csv
   Séries de comportamento salvas: 4 arquivo(s)
   Epochs salvos: 1 -> 26965_2020-Dec-07_15-52-32__task-Foraging_task__epochs.csv

=== Task: Foraging task | Arquivo: 26965_2020-Dec-08_11-29-48.nwb ===
   Spikes salvos: 240 -> 26965_2020-Dec-08_11-29-48__task-Fo

In [80]:
INPUT_ROOT = OUTPUT_ROOT
CONSOLIDATED_DIR = Path("nwb_consolidated_csv_by_task")
CONSOLIDATED_DIR.mkdir(parents=True, exist_ok=True)


groups = {
    "meta":        "__meta.csv",
    "units":       "__units.csv",
    "spike_times": "_spike_times.csv",
    "epochs":      "__epochs.csv",
}

for group_name, pattern in groups.items():
    paths = sorted(INPUT_ROOT.rglob(f"*{pattern}*"))
    if not paths:
        print(f"[{group_name}] nada encontrado (*{pattern}*) em {INPUT_ROOT}")
        continue

    parts = []
    for p in paths:
        try:
            df = pd.read_csv(p, low_memory=False)
            if "task" not in df.columns:
                df.insert(0, "task", p.parent.name)
            if "source_file" not in df.columns:
                base = p.name.split("__task-")[0]
                if not base.endswith(".nwb"):
                    base += ".nwb"
                df.insert(0, "source_file", base)
            parts.append(df)
        except Exception as e:
            print(f"[{group_name}] Falha lendo {p.name}: {e}")

    if not parts:
        print(f"[{group_name}] nenhum arquivo válido para consolidar.")
        continue

    final = pd.concat(parts, ignore_index=True)
    cols = final.columns.tolist()
    for first in ["source_file", "task"]:
        if first in cols:
            cols = [first] + [c for c in cols if c != first]
    final = final[cols]

    out = CONSOLIDATED_DIR / f"consolidated__{group_name}.csv"
    final.to_csv(out, index=False)
    print(f"[{group_name}] SALVO: {out} ({len(final):,} linhas)")


# comportamento
# 2) Consolida BEHAVIOR: um arquivo por série (nome completo "behavior_*")
beh_paths = sorted(INPUT_ROOT.rglob("*__beh_*.csv"))
if not beh_paths:
    print("[beh] nada encontrado (*__beh_*.csv)")
else:
    by_series = defaultdict(list)
    for p in beh_paths:
        # exemplo: <nwb>__task-<slug>__beh_animal_position.csv
        tail = p.name.split("__beh_")[-1]
        base = f"behavior_{tail.replace('.csv','')}"   # <- agora sempre com prefixo behavior_
        by_series[base].append(p)

    for base, paths in by_series.items():
        parts = []
        for p in paths:
            try:
                df = pd.read_csv(p, low_memory=False)
                if "task" not in df.columns:
                    df.insert(0, "task", p.parent.name)
                if "source_file" not in df.columns:
                    src = p.name.split("__task-")[0]
                    if not src.endswith(".nwb"):
                        src += ".nwb"
                    df.insert(0, "source_file", src)
                parts.append(df)
            except Exception as e:
                print(f"[{base}] Falha lendo {p.name}: {e}")

        if not parts:
            print(f"[{base}] nenhum arquivo válido para consolidar.")
            continue

        final = pd.concat(parts, ignore_index=True)
        cols = final.columns.tolist()
        for first in ["source_file", "task"]:
            if first in cols:
                cols = [first] + [c for c in cols if c != first]
        final = final[cols]

        out = CONSOLIDATED_DIR / f"consolidated__{base}.csv"
        final.to_csv(out, index=False)
        print(f"[{base}] SALVO: {out} ({len(final):,} linhas)")


[meta] SALVO: nwb_consolidated_csv_by_task\consolidated__meta.csv (57 linhas)
[units] SALVO: nwb_consolidated_csv_by_task\consolidated__units.csv (136,848 linhas)
[spike_times] SALVO: nwb_consolidated_csv_by_task\consolidated__spike_times.csv (136,848 linhas)
[epochs] SALVO: nwb_consolidated_csv_by_task\consolidated__epochs.csv (125 linhas)
[behavior_animal_position] SALVO: nwb_consolidated_csv_by_task\consolidated__behavior_animal_position.csv (16,403,263 linhas)
[behavior_head_direction] SALVO: nwb_consolidated_csv_by_task\consolidated__behavior_head_direction.csv (16,403,263 linhas)
[behavior_head_yaw_speed] SALVO: nwb_consolidated_csv_by_task\consolidated__behavior_head_yaw_speed.csv (16,403,263 linhas)
[behavior_movement_speed] SALVO: nwb_consolidated_csv_by_task\consolidated__behavior_movement_speed.csv (16,403,263 linhas)
