<a href="https://colab.research.google.com/github/Log-Yair/Endymion/blob/data/data_handler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
from __future__ import annotations

import os
import re
import json
import math
import hashlib
import urllib.request
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, Optional, Tuple, List, Any

import numpy as np

In [30]:

'''
-------------------
Data structures
-------------------
'''
@dataclass(frozen=True)
class LOLATileSpec:
    """
    Identifies a LOLA polar float_img tile by its base filename (without extension).
    Example: 'ldem_85s_20m_float'
    """
    tile_id: str
    img_url: str
    lbl_url: str

    def filenames(self) -> Tuple[str, str]:
        return f"{self.tile_id}.img", f"{self.tile_id}.lbl"


@dataclass
class PDSImageMeta:
    """
    Minimal metadata parsed from a PDS3 .LBL label file.
    We keep it flexible; add fields as you discover them in your labels.
    """
    record_bytes: Optional[int] = None
    file_records: Optional[int] = None

    # Common IMAGE keys (names vary across PDS labels)
    lines: Optional[int] = None
    line_samples: Optional[int] = None
    sample_bits: Optional[int] = None
    sample_type: Optional[str] = None
    scaling_factor: Optional[float] = None
    offset: Optional[float] = None
    missing_constant: Optional[float] = None

    # Extra: keep raw label for debugging
    raw: Optional[Dict[str, Any]] = None


@dataclass
class RasterTile:
    """
    A tile of raster data plus meta. For now we keep it in memory as numpy.
    In future you might store it as memmap or Cloud Optimized GeoTIFF.
    """
    tile_id: str
    data: np.ndarray  # shape (rows, cols)
    meta: PDSImageMeta


'''
----------------------------
Data handler
----------------------------
'''

class DataHandler:
    """
    Endymion - DataHandler
    Responsibility:
      - Download/caching of PDS .IMG/.LBL
      - Parse labels (PDS3-ish) to interpret .IMG binary
      - Load raster data as numpy arrays (float tiles)
      - Provide ROI extraction hooks for FeatureExtractor / Hazard pipeline

    Notes:
      - Keep this module "dumb and reliable":
        no slope/roughness here (that belongs in FeatureExtractor).
      - Focus on: ingestion, validation, standardisation, and serving.
    """
    _kv_re = re.compile(r"^\s*([A-Z0-9_\-^]+)\s*=\s*(.+?)\s*$")


    def __init__(
        self,
        cache_dir: str | Path = "/content/endymion_cache/lola",
        user_agent: str = "Endymion-DataHandler/0.1",
        timeout_sec: int = 60,
    ) -> None:
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        self.user_agent = user_agent
        self.timeout_sec = timeout_sec

        # Simple in-memory registry (you can persist this later)
        self.tiles: Dict[str, LOLATileSpec] = {}

    '''
    ----------------------------
     Registration
    ----------------------------
    '''

    def register_tile(self, spec: LOLATileSpec) -> None:
        self.tiles[spec.tile_id] = spec

    def register_tiles(self, specs: List[LOLATileSpec]) -> None:
        for s in specs:
            self.register_tile(s)

    '''
    ----------------------------
     Download & caching
    ----------------------------
    '''

    def _local_paths(self, tile_id: str) -> Tuple[Path, Path]:
        img_path = self.cache_dir / f"{tile_id}.img"
        lbl_path = self.cache_dir / f"{tile_id}.lbl"
        return img_path, lbl_path

    def _download_file(self, url: str, out_path: Path) -> None:
        out_path.parent.mkdir(parents=True, exist_ok=True)

        req = urllib.request.Request(
            url,
            headers={"User-Agent": self.user_agent},
            method="GET",
        )
        with urllib.request.urlopen(req, timeout=self.timeout_sec) as resp:
            data = resp.read()

        tmp = out_path.with_suffix(out_path.suffix + ".part")
        tmp.write_bytes(data)
        tmp.replace(out_path)

    def ensure_downloaded(self, tile_id: str, force: bool = False) -> Tuple[Path, Path]:
        if tile_id not in self.tiles:
            raise KeyError(f"Tile '{tile_id}' is not registered.")

        spec = self.tiles[tile_id]
        img_path, lbl_path = self._local_paths(tile_id)

        if force or not lbl_path.exists():
            self._download_file(spec.lbl_url, lbl_path)

        if force or not img_path.exists():
            self._download_file(spec.img_url, img_path)

        return img_path, lbl_path
    '''
    ----------------------------
     Label parsing (PDS3-ish)
    ----------------------------
    '''

    # The _kv_re regex is already defined at the class level
    # _kv_re = re.compile(r"^\s*([A-Z0-9_\-]+)\s*=\s*(.+?)\s*$")

    def parse_lbl(self, lbl_path: str | Path) -> PDSImageMeta:
        """
        Parses a basic PDS3 label into a dict and extracts common keys.
        The LOLA LDEM polar float labels are usually straightforward.
        """
        lbl_path = Path(lbl_path)
        lines = lbl_path.read_text(encoding="utf-8", errors="ignore").splitlines()

        raw: Dict[str, Any] = {}
        current_object_stack: List[str] = []

        def set_key(key: str, value: str) -> None:
            # Store using a simple "OBJECT.SUBKEY" namespace when inside OBJECT blocks
            if current_object_stack:
                ns_key = ".".join(current_object_stack + [key])
                raw[ns_key] = value
            else:
                raw[key] = value

        for line in lines:
            line = line.strip()
            if not line or line.startswith("/*"):
                continue

            # OBJECT / END_OBJECT handling
            if line.startswith("OBJECT"):
                m = self._kv_re.match(line)
                if m:
                    obj_name = m.group(2).strip().strip('"')
                    current_object_stack.append(obj_name)
                continue

            if line.startswith("END_OBJECT"):
                if current_object_stack:
                    current_object_stack.pop()
                continue

            m = self._kv_re.match(line)
            if m:
                k, v = m.group(1), m.group(2)
                set_key(k, v)

        # Helper to read with fallbacks
        def pick(*keys: str) -> Optional[str]:
            for k in keys:
                if k in raw:
                    return str(raw[k])
            return None

        def to_int(x: Optional[str]) -> Optional[int]:
            if x is None:
                return None
            x = x.strip().strip('"')
            try:
                return int(x)
            except ValueError:
                return None

        def to_float(x: Optional[str]) -> Optional[float]:
            if x is None:
                return None
            x = x.strip().strip('"')
            try:
                return float(x)
            except ValueError:
                return None

        def pick_any(*keys: str) -> Optional[str]:
            # 1) exact key match
            for k in keys:
                if k in raw:
                    return str(raw[k])

            # 2) suffix match (handles UNCOMPRESSED_FILE.IMAGE.LINES etc.)
            suffixes = [f".{k.split('.')[-1]}" for k in keys]
            for suf in suffixes:
                # prefer keys containing ".IMAGE."
                for kk, vv in raw.items():
                    if kk.endswith(suf) and ".IMAGE." in f".{kk}.":
                        return str(vv)
                # otherwise accept any suffix match
                for kk, vv in raw.items():
                    if kk.endswith(suf):
                        return str(vv)

            return None


        meta = PDSImageMeta(
            record_bytes=to_int(pick("RECORD_BYTES")),
            file_records=to_int(pick("FILE_RECORDS")),
            # Many PDS labels nest these under the IMAGE object (varies)
            lines=to_int(pick_any("IMAGE.LINES", "LINES")),
            line_samples=to_int(pick_any("IMAGE.LINE_SAMPLES", "LINE_SAMPLES")),
            sample_type=(lambda s: s.strip().strip('"') if s else None)(pick_any("IMAGE.SAMPLE_TYPE", "SAMPLE_TYPE")),
            sample_bits=to_int(pick_any("IMAGE.SAMPLE_BITS", "SAMPLE_BITS")),
            scaling_factor=to_float(pick_any("IMAGE.SCALING_FACTOR", "SCALING_FACTOR")),
            offset=to_float(pick_any("IMAGE.OFFSET", "OFFSET")),
            missing_constant=to_float(pick_any("IMAGE.MISSING_CONSTANT", "MISSING_CONSTANT")),
            raw=raw,
        )

        return meta

    '''
    ----------------------------
    IMG loading (float)
    ----------------------------
    '''
    def _numpy_dtype_from_sample_type(self, sample_type: str, sample_bits: int) -> np.dtype:
        """
        Maps PDS SAMPLE_TYPE/SAMPLE_BITS to numpy dtype.
        For LOLA float_img tiles, this is commonly 32-bit float (endianness matters).
        """
        st = (sample_type or "").upper()

        # Typical PDS3 values: IEEE_REAL, PC_REAL, MSB_INTEGER, LSB_INTEGER, etc.
        # We'll handle the common float cases.
        if "REAL" in st or "FLOAT" in st:
            if sample_bits == 32:
                # Endianness: PC_REAL usually little-endian; IEEE_REAL may be big-endian in some datasets.
                # We'll default to little-endian if 'PC' is present; otherwise native.
                if "PC" in st or "LSB" in st:
                    return np.dtype("<f4")
                if "MSB" in st:
                    return np.dtype(">f4")
                return np.dtype("f4")
            if sample_bits == 64:
                if "PC" in st or "LSB" in st:
                    return np.dtype("<f8")
                if "MSB" in st:
                    return np.dtype(">f8")
                return np.dtype("f8")

        raise ValueError(f"Unsupported SAMPLE_TYPE/SAMPLE_BITS: {sample_type}/{sample_bits}")

    def load_tile(self, tile_id: str, force_download: bool = False) -> RasterTile:
        """
        Ensures files are present, parses label, reads IMG into numpy, returns RasterTile.
        """
        img_path, lbl_path = self.ensure_downloaded(tile_id, force=force_download)
        meta = self.parse_lbl(lbl_path)

        if meta.lines is None or meta.line_samples is None:
            raise ValueError(f"LBL missing LINES/LINE_SAMPLES for tile '{tile_id}'.")

        print(meta.lines, meta.line_samples, meta.sample_type, meta.sample_bits)
        print([k for k in meta.raw.keys() if k.endswith(".LINES") or k.endswith(".LINE_SAMPLES")])

        # Determine dtype
        if meta.sample_type is None or meta.sample_bits is None:
            raise ValueError(f"LBL missing SAMPLE_TYPE/SAMPLE_BITS for tile '{tile_id}'.")

        dtype = self._numpy_dtype_from_sample_type(meta.sample_type, meta.sample_bits)

        # Read binary
        expected = meta.lines * meta.line_samples
        arr = np.fromfile(img_path, dtype=dtype, count=expected)

        if arr.size != expected:
            raise IOError(
                f"Read {arr.size} samples, expected {expected} for tile '{tile_id}'. "
                f"File may be truncated or label mismatch."
            )

        arr = arr.reshape((meta.lines, meta.line_samples))

        # Apply missing constant if present (turn into NaN)
        if meta.missing_constant is not None:
            arr = arr.astype(np.float32, copy=False)
            arr[arr == meta.missing_constant] = np.nan

        # Apply scaling/offset if present (many DEM products are already in meters as float, but keep hook)
        if meta.scaling_factor is not None:
            arr = arr * float(meta.scaling_factor)
        if meta.offset is not None:
            arr = arr + float(meta.offset)

        # Convert km -> m if UNIT is kilometer
        unit = (meta.raw or {}).get("UNCOMPRESSED_FILE.IMAGE.UNIT") or (meta.raw or {}).get("IMAGE.UNIT") or (meta.raw or {}).get("UNIT")
        if unit and "KILOMETER" in str(unit).upper():
            arr = arr * 1000.0
            if meta.offset is not None:
                meta.offset = meta.offset * 1000.0


        return RasterTile(tile_id=tile_id, data=arr, meta=meta)

    '''
    --------------------------
     ROI hooks (to implement next)
    -----------------------------
    '''
    def extract_roi_pixels(
        self,
        tile: RasterTile,
        row_min: int,
        row_max: int,
        col_min: int,
        col_max: int,
    ) -> np.ndarray:
        """
        Simple pixel ROI extraction. This is guaranteed safe and fast.
        Geospatial ROI (lat/lon bounding box) can be added later once we pin the projection details.
        """
        return tile.data[row_min:row_max, col_min:col_max]

    '''
    --------------
    Debug / provenance helpers
    --------------
    '''

    def tile_provenance(self, tile_id: str) -> Dict[str, Any]:
        """
        Returns URLs + local paths so you can log them for reproducibility.
        """
        if tile_id not in self.tiles:
            raise KeyError(f"Tile '{tile_id}' is not registered.")
        img_path, lbl_path = self._local_paths(tile_id)
        spec = self.tiles[tile_id]
        return {
            "tile_id": tile_id,
            "img_url": spec.img_url,
            "lbl_url": spec.lbl_url,
            "local_img": str(img_path),
            "local_lbl": str(lbl_path),
        }





In [31]:
BASE = "https://pds-geosciences.wustl.edu/lro/lro-l-lola-3-rdr-v1/lrolol_1xxx/data/lola_gdr/polar/float_img"

tiles = [
    LOLATileSpec(
        tile_id="ldem_85n_20m_float",
        img_url=f"{BASE}/ldem_85n_20m_float.img",
        lbl_url=f"{BASE}/ldem_85n_20m_float.lbl",
    ),
    LOLATileSpec(
        tile_id="ldem_85s_20m_float",
        img_url=f"{BASE}/ldem_85s_20m_float.img",
        lbl_url=f"{BASE}/ldem_85s_20m_float.lbl",
    ),
    LOLATileSpec(
        tile_id="ldem_875n_20m_float",
        img_url=f"{BASE}/ldem_875n_20m_float.img",
        lbl_url=f"{BASE}/ldem_875n_20m_float.lbl",
    ),
    LOLATileSpec(
        tile_id="ldem_875s_20m_float",
        img_url=f"{BASE}/ldem_875s_20m_float.img",
        lbl_url=f"{BASE}/ldem_875s_20m_float.lbl",
    ),
]

dh = DataHandler(cache_dir="/content/endymion_cache/lola")
dh.register_tiles(tiles)


tile = dh.load_tile("ldem_85s_20m_float")
print(tile.tile_id, tile.data.shape, tile.meta.sample_type, tile.meta.sample_bits)


15168 15168 PC_REAL 32
['UNCOMPRESSED_FILE.IMAGE.LINES', 'UNCOMPRESSED_FILE.IMAGE.LINE_SAMPLES']
ldem_85s_20m_float (15168, 15168) PC_REAL 32


##debugging

In [32]:
# --- Debugging block to inspect the problematic label file ---
tile_id_to_debug = "ldem_85s_20m_float"
img_path, lbl_path = dh.ensure_downloaded(tile_id_to_debug, force=True)
print(f"\n--- Content of {lbl_path} ---")
print(lbl_path.read_text(encoding="utf-8", errors="ignore"))
print(f"--- End of {lbl_path} content ---\n")
# --- End Debugging block ---

KeyboardInterrupt: 

In [33]:
print(DataHandler)
print("Has load_tile:", hasattr(DataHandler, "load_tile"))
print("Methods:", [m for m in dir(DataHandler) if "tile" in m.lower()])


<class '__main__.DataHandler'>
Has load_tile: True
Methods: ['load_tile', 'register_tile', 'register_tiles', 'tile_provenance']
