In [1]:
from dataclasses import dataclass
from typing import List, Optional, Dict, Any, Tuple

@dataclass
class GeoAsset:
    """Provider-neutral description of one geospatial dataset.

    NOTE(review): a class with the same name and fields is defined again in a
    later cell of this notebook; whichever cell runs last wins.
    """
    id: str  # identifier of the dataset
    title: str  # human-readable title
    abstract: Optional[str]  # free-text description, if any
    keywords: List[str]  # keyword/tag strings
    bbox: Optional[Tuple[float, float, float, float]]  # minx, miny, maxx, maxy (lon/lat)
    datetime: Optional[Tuple[Optional[str], Optional[str]]]  # (start, end) ISO8601
    license: Optional[str]  # license name or identifier, if known
    links: Dict[str, str]  # {"landing": url, "api": url, "download": url, ...}
    source: str           # "stac", "ogc-records", "ckan", "cmr"
    provider: Optional[str]  # publishing organisation, if known


In [8]:
# =========================
# Normalized schema & utils
# =========================
from dataclasses import dataclass
from typing import List, Optional, Dict, Any, Tuple
import json, re, requests
from requests.adapters import HTTPAdapter, Retry

@dataclass
class GeoAsset:
    """Normalized record that every ``search_*`` function in this notebook returns.

    Each catalog-specific search maps its own response format onto these
    fields so results from different sources can be merged and ranked together.
    """
    id: str  # identifier taken from the source record
    title: str  # human-readable title
    abstract: Optional[str]  # free-text description, if any
    keywords: List[str]  # keyword/tag strings
    bbox: Optional[Tuple[float, float, float, float]]  # (minx, miny, maxx, maxy) lon/lat
    datetime: Optional[Tuple[Optional[str], Optional[str]]]  # (start, end) ISO8601
    license: Optional[str]  # license name or identifier, if known
    links: Dict[str, str]  # {"landing": url, "api": url, "download": url, ...}
    source: str            # "stac" | "ogc-records" | "ckan" | "cmr"
    provider: Optional[str]  # publishing organisation, if known

def _session(timeout=12) -> requests.Session:
    """Build a ``requests.Session`` with retries and a default timeout.

    Args:
        timeout: Timeout (seconds) applied to any request that does not pass
            its own ``timeout`` keyword.

    Returns:
        A session whose ``http://`` and ``https://`` adapters retry up to three
        times on 429/5xx responses, and whose ``request`` method is wrapped to
        inject the default timeout.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=0.4,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS", "POST"],
    )

    session = requests.Session()
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))

    # Keep a handle on the bound method before it is replaced below.
    send = session.request

    def _request_with_timeout(method, url, **kwargs):
        if "timeout" not in kwargs:
            kwargs["timeout"] = timeout
        return send(method, url, **kwargs)

    session.request = _request_with_timeout  # type: ignore
    return session

def _norm_bbox(bbox) -> Optional[Tuple[float,float,float,float]]:
    if not bbox: return None
    if isinstance(bbox, dict) and "bbox" in bbox: bbox = bbox["bbox"]
    if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
        return (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]))
    # try GeoJSON Polygon bbox
    if isinstance(bbox, dict) and bbox.get("type") == "Polygon":
        coords = bbox["coordinates"][0]
        xs = [c[0] for c in coords]; ys = [c[1] for c in coords]
        return (min(xs), min(ys), max(xs), max(ys))
    return None

def _dt_range_from_props(props: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    start = props.get("start_datetime") or props.get("datetime")
    end   = props.get("end_datetime") or start
    return (start, end)

def _parse_bbox_from_ckan_spatial(spatial: Optional[str]) -> Optional[Tuple[float,float,float,float]]:
    # Handles GeoJSON string and WKT POLYGON((...))
    if not spatial: return None
    try:
        obj = json.loads(spatial)
        if isinstance(obj, dict):
            if "bbox" in obj: return _norm_bbox(obj["bbox"])
            if obj.get("type") in ("Polygon","MultiPolygon"):
                return _norm_bbox(obj)
    except Exception:
        pass
    # crude WKT parser (POLYGON((x y, ...)))
    m = re.search(r"POLYGON\s*\(\((.*?)\)\)", spatial, re.I)
    if m:
        pts = [p.strip() for p in m.group(1).split(",")]
        xs, ys = [], []
        for p in pts:
            parts = p.split()
            if len(parts) >= 2:
                xs.append(float(parts[0])); ys.append(float(parts[1]))
        if xs and ys:
            return (min(xs), min(ys), max(xs), max(ys))
    return None

# =================
# STAC (pystac-client)
# =================
def search_stac(endpoint: str,
                q: Optional[str] = None,
                bbox: Optional[Tuple[float,float,float,float]] = None,
                time_range: Optional[Tuple[Optional[str],Optional[str]]] = None,
                collections: Optional[List[str]] = None,
                limit: int = 10) -> List[GeoAsset]:
    """Search a STAC API for items and map them to ``GeoAsset`` records.

    Args:
        endpoint: Root URL of the STAC API.
        q: Optional free-text term; only used when the server advertises
            CQL2 filter conformance.
        bbox: Optional ``(minx, miny, maxx, maxy)`` spatial filter.
        time_range: Optional ``(start, end)``; a missing side is sent as ``..``.
        collections: Optional list of collection ids to restrict the search.
        limit: Maximum number of items to return.

    Returns:
        One ``GeoAsset`` per item returned by the search.

    Raises:
        Whatever ``pystac_client`` raises when the catalog cannot be opened or
        the (unfiltered) search fails.
    """
    from pystac_client import Client  # pip install pystac-client
    client = Client.open(endpoint)

    kw: Dict[str, Any] = {"max_items": limit}
    if bbox: kw["bbox"] = list(bbox)
    if time_range:
        start, end = time_range
        kw["datetime"] = f"{start or '..'}/{end or '..'}"
    if collections: kw["collections"] = collections

    # Check CQL2 support properly (conformance URLs)
    supports_filter = False
    if hasattr(client, "conforms_to"):
        supports_filter = (
            client.conforms_to("https://api.stacspec.org/v1.0.0/item-search#filter")
            or client.conforms_to("http://www.opengis.net/spec/cql2/1.0/conf/cql2-text")
            or client.conforms_to("http://www.opengis.net/spec/cql2/1.0/conf/cql2-json")
        )

    # If supported, add a conservative text filter
    if q and supports_filter:
        # Double any single quote so user text cannot terminate the CQL2
        # string literal and corrupt (or inject into) the filter expression.
        safe_q = q.replace("'", "''")
        kw["filter_lang"] = "cql2-text"
        kw["filter"] = f"(title ILIKE '%{safe_q}%') OR (description ILIKE '%{safe_q}%')"

    # The search object is lazy: the HTTP request is only issued while items
    # are iterated, so the items must be materialized inside the try block for
    # the filter fallback to have any effect.
    try:
        items = list(client.search(**kw).items())
    except Exception:
        if "filter" not in kw:
            raise
        kw.pop("filter", None); kw.pop("filter_lang", None)
        items = list(client.search(**kw).items())

    assets: List[GeoAsset] = []
    coll_cache: Dict[Any, Any] = {}  # collection id -> resolved collection (or None)
    for it in items:
        # safer link extraction
        landing = getattr(it, "get_self_href", lambda: None)() or ""
        props = it.properties or {}

        # Resolve the parent collection once per collection id instead of once
        # per item.
        coll_id = getattr(it, "collection_id", None)
        if coll_id in coll_cache:
            coll = coll_cache[coll_id]
        else:
            try:
                coll = it.get_collection()
            except Exception:
                coll = None
            coll_cache[coll_id] = coll

        # collect keywords from item + (optional) collection, de-duplicated
        # while keeping first-seen order
        kwds = list(props.get("keywords") or [])
        if coll:
            try:
                kwds.extend(coll.keywords or [])
            except Exception:
                pass
        kwds = list(dict.fromkeys(kwds))

        # license from item, else collection
        license_ = props.get("license")
        if not license_ and coll:
            try: license_ = coll.license
            except Exception: pass

        assets.append(GeoAsset(
            id=it.id,
            title=props.get("title") or it.id,
            abstract=props.get("description"),
            keywords=kwds,
            bbox=_norm_bbox(getattr(it, "bbox", None)),
            datetime=_dt_range_from_props(props),
            license=license_,
            links={"landing": landing, "api": endpoint},
            source="stac",
            provider=(getattr(coll.providers[0], "name", None) if coll and getattr(coll, "providers", None) else None)
        ))
    return assets

# =======================
# OGC API – Records (Core)
# =======================
def search_ogc_records(base: str,
                       q: Optional[str] = None,
                       bbox: Optional[Tuple[float,float,float,float]] = None,
                       time_range: Optional[Tuple[Optional[str],Optional[str]]] = None,
                       limit: int = 10) -> List[GeoAsset]:
    """Query an OGC API - Records endpoint and map features to ``GeoAsset``.

    First tries ``POST {base}/search`` with a JSON body. If that does not yield
    a usable JSON response, falls back to ``GET /collections/{id}/items`` on
    the first two collections, forwarding the same ``q``/``bbox``/``datetime``
    filters as query parameters.

    Args:
        base: Root URL of the Records API.
        q: Optional free-text query.
        bbox: Optional ``(minx, miny, maxx, maxy)`` filter.
        time_range: Optional ``(start, end)``; a missing side is sent as ``..``.
        limit: Maximum number of records to return.

    Returns:
        At most ``limit`` ``GeoAsset`` records (empty if nothing could be fetched).
    """
    s = _session()
    root = base.rstrip("/")
    out: List[GeoAsset] = []

    datetime_param: Optional[str] = None
    if time_range:
        start, end = time_range
        datetime_param = f"{start or '..'}/{end or '..'}"

    def _features_of(resp) -> Optional[List[Dict[str, Any]]]:
        """Return the feature list of a response, or None if it is unusable."""
        if not resp.ok:
            return None
        try:
            payload = resp.json()
        except ValueError:  # body is not JSON (e.g. an HTML error page)
            return None
        if not isinstance(payload, dict):
            return None
        feats = payload.get("features", [])
        return feats if isinstance(feats, list) else []

    # Try /search (JSON body)
    body: Dict[str, Any] = {"limit": limit}
    if q: body["q"] = q
    if bbox: body["bbox"] = list(bbox)
    if datetime_param: body["datetime"] = datetime_param

    r = s.post(f"{root}/search", json=body, headers={"accept":"application/geo+json"})
    features = _features_of(r)
    if features is None:
        # Fallback: probe a couple of collections' items, keeping the user's
        # filters so the results stay relevant to the request.
        params: Dict[str, Any] = {"limit": limit}
        if q: params["q"] = q
        if bbox: params["bbox"] = ",".join(str(v) for v in bbox)
        if datetime_param: params["datetime"] = datetime_param

        features = []
        rc = s.get(f"{root}/collections", headers={"accept":"application/json"})
        colls: List[Any] = []
        if rc.ok:
            try:
                colls = (rc.json().get("collections") or [])[:2]
            except (ValueError, AttributeError):
                colls = []
        for c in colls:
            cid = c.get("id") if isinstance(c, dict) else None
            if not cid:
                continue
            p = s.get(f"{root}/collections/{cid}/items",
                      params=params, headers={"accept":"application/geo+json"})
            features += _features_of(p) or []

    for f in features[:limit]:
        props = f.get("properties", {}) or {}
        exbbox = f.get("bbox") or props.get("bbox")
        start, end = props.get("datetime"), props.get("end_datetime")
        links = props.get("links") or f.get("links") or []
        # Use the first link that actually carries an href.
        landing = ""
        if isinstance(links, list):
            for lk in links:
                if isinstance(lk, dict) and lk.get("href"):
                    landing = lk["href"]
                    break
        ident = f.get("id") or props.get("identifier") or props.get("id")
        ident_str = str(ident) if ident is not None else ""  # avoid the literal "None"
        out.append(GeoAsset(
            id=ident_str,
            title=props.get("title") or props.get("name") or ident_str,
            abstract=props.get("description"),
            keywords=props.get("keywords") or [],
            bbox=_norm_bbox(exbbox),
            datetime=(start, end),
            license=props.get("license"),
            links={"landing": landing, "api": base},
            source="ogc-records",
            provider=props.get("publisher") or props.get("provider")
        ))
    return out

# =================
# CKAN (e.g., Data.gov)
# =================
def search_ckan(base: str,
                api_key: Optional[str] = None,
                q: str = "",
                limit: int = 10) -> List[GeoAsset]:
    """Search a CKAN Action API (``package_search``) and map datasets to ``GeoAsset``.

    Args:
        base: URL of the CKAN action endpoint (``.../action``).
        api_key: Optional key, sent as the ``X-Api-Key`` header.
        q: Search text passed through as CKAN's ``q`` parameter.
        limit: Maximum number of datasets (CKAN ``rows``).

    Returns:
        One ``GeoAsset`` per dataset in the response.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    s = _session()
    root = base.rstrip("/")
    params = {"q": q, "rows": limit}
    headers = {"X-Api-Key": api_key} if api_key else {}
    r = s.get(f"{root}/package_search", params=params, headers=headers)
    r.raise_for_status()
    res = (r.json().get("result") or {}).get("results") or []
    out: List[GeoAsset] = []
    for pkg in res:
        # Portals using the spatial extension keep the geometry in the
        # dataset's "extras" key/value list rather than at the top level.
        extras: Dict[str, Any] = {}
        for item in pkg.get("extras") or []:
            if isinstance(item, dict) and item.get("key"):
                extras[item["key"]] = item.get("value")
        spatial = pkg.get("spatial") or extras.get("spatial")

        # A single malformed geometry must not abort the whole search.
        try:
            bbox = _parse_bbox_from_ckan_spatial(spatial) if spatial else None
        except (TypeError, ValueError):
            bbox = None

        # pick best landing link: package URL or first resource
        resources = pkg.get("resources") or []
        first_res = resources[0] if resources and isinstance(resources[0], dict) else {}
        landing = pkg.get("url") or first_res.get("url", "")

        # CKAN tags are list of dicts in many portals
        tags = [t.get("name", "") if isinstance(t, dict) else t
                for t in (pkg.get("tags") or [])]

        out.append(GeoAsset(
            id=pkg["id"],
            # title is declared as str, so never hand back None
            title=pkg.get("title") or pkg.get("name") or pkg["id"],
            abstract=pkg.get("notes"),
            keywords=[t for t in tags if t],
            bbox=bbox,
            datetime=(pkg.get("temporal_start"), pkg.get("temporal_end")),
            license=pkg.get("license_title") or pkg.get("license_id"),
            links={"landing": landing,
                   "api": f"{root}/package_show?id={pkg['id']}"},
            source="ckan",
            provider=(pkg.get("organization", {}) or {}).get("title")
        ))
    return out

# =========================
# NASA CMR (Collections API)
# =========================
def search_cmr_collections(q: Optional[str] = None,
                           bbox: Optional[Tuple[float,float,float,float]] = None,
                           time_range: Optional[Tuple[Optional[str],Optional[str]]] = None,
                           limit: int = 10) -> List[GeoAsset]:
    """Search NASA CMR collections and map entries to ``GeoAsset`` records.

    Args:
        q: Optional keyword query.
        bbox: Optional ``(minx, miny, maxx, maxy)``; sent as ``bounding_box``.
        time_range: Optional ``(start, end)``; sent as ``temporal`` when at
            least one side is set.
        limit: Page size requested from CMR.

    Returns:
        One ``GeoAsset`` per collection entry in the response feed.

    Raises:
        requests.HTTPError: If CMR returns an error status.
    """
    s = _session()
    params: Dict[str, Any] = {"page_size": limit, "include_has_granules": "true"}
    if q: params["keyword"] = q
    if bbox: params["bounding_box"] = ",".join(map(str, bbox))  # minx,miny,maxx,maxy
    if time_range:
        start, end = time_range
        if start or end:
            params["temporal"] = f"{start or ''},{end or ''}"
    r = s.get("https://cmr.earthdata.nasa.gov/search/collections.json", params=params)
    r.raise_for_status()
    cols = r.json().get("feed", {}).get("entry", []) or []
    out: List[GeoAsset] = []
    for c in cols:
        box = None
        if c.get("boxes"):  # "minlat minlon maxlat maxlon"
            try:
                minlat, minlon, maxlat, maxlon = map(float, c["boxes"][0].split())
                box = (minlon, minlat, maxlon, maxlat)
            except Exception:
                pass

        # Choose a landing href: prefer the first data/documentation link and
        # only fall back to "any link with an href" when neither exists.
        # (Previously the href test was OR-ed into the same condition, which
        # made the rel checks dead code.)
        links = [lk for lk in (c.get("links") or [])
                 if isinstance(lk, dict) and lk.get("href")]
        preferred = [lk for lk in links
                     if str(lk.get("rel", "")).endswith(("/data#", "/documentation#"))]
        landing = (preferred or links or [{"href": ""}])[0]["href"]

        out.append(GeoAsset(
            id=c["id"],
            title=c.get("dataset_id") or c.get("short_name") or c["id"],
            abstract=c.get("summary"),
            # str() each value so non-string entries cannot break the join
            keywords=[", ".join(str(v) for v in k.values()) if isinstance(k, dict) else str(k)
                      for k in (c.get("science_keywords") or [])],
            bbox=box,
            datetime=(c.get("time_start"), c.get("time_end")),
            license=None,
            links={"landing": landing, "api": "https://cmr.earthdata.nasa.gov/search/"},
            source="cmr",
            provider=(c.get("archive_center") or c.get("data_center"))
        ))
    return out

# =========
# Router
# =========
def discover(query: str = "",
             bbox: Optional[Tuple[float,float,float,float]] = None,
             time_range: Optional[Tuple[Optional[str],Optional[str]]] = None,
             limit: int = 6,
             providers: Optional[Dict[str, Any]] = None) -> List[GeoAsset]:
    """Fan a query out to every configured catalog and concatenate the results.

    Args:
        query: Free-text search terms.
        bbox: Optional ``(minx, miny, maxx, maxy)`` filter.
        time_range: Optional ``(start, end)`` filter.
        limit: Maximum number of results requested from each endpoint.
        providers: Mapping with optional keys ``"stac"`` (list of URLs),
            ``"records"`` (list of URLs), ``"ckan"`` (list of
            ``(base_url, api_key)`` pairs) and ``"cmr"`` (truthy to enable).
            A falsy value selects the built-in defaults.

    Returns:
        Results in provider order (STAC, Records, CKAN, CMR), capped at
        ``limit`` times the number of configured endpoints.

    Notes:
        A failing provider does not abort discovery, but it is reported with a
        ``RuntimeWarning`` so that an empty result list can be diagnosed.
    """
    import warnings

    providers = providers or dict(
        stac=["https://planetarycomputer.microsoft.com/api/stac/v1"],
        records=[],  # add your OGC API–Records endpoints here
        ckan=[("https://api.gsa.gov/technology/datagov/v3/action", None)],  # requires API key for higher limits
        cmr=True
    )
    stac_eps = list(providers.get("stac") or [])
    record_eps = list(providers.get("records") or [])
    ckan_eps = list(providers.get("ckan") or [])
    use_cmr = bool(providers.get("cmr"))

    results: List[GeoAsset] = []

    def _attempt(label, call):
        """Run one provider search; warn and return [] if it fails."""
        try:
            return call()
        except Exception as exc:
            warnings.warn(f"discover: {label} search failed: {exc!r}",
                          RuntimeWarning, stacklevel=3)
            return []

    for ep in stac_eps:
        results += _attempt(f"STAC {ep}", lambda: search_stac(
            ep, q=query, bbox=bbox, time_range=time_range, limit=limit))

    for ep in record_eps:
        results += _attempt(f"OGC Records {ep}", lambda: search_ogc_records(
            ep, q=query, bbox=bbox, time_range=time_range, limit=limit))

    for base, key in ckan_eps:
        results += _attempt(f"CKAN {base}", lambda: search_ckan(
            base, api_key=key, q=query, limit=limit))

    if use_cmr:
        results += _attempt("CMR", lambda: search_cmr_collections(
            query, bbox=bbox, time_range=time_range, limit=limit))

    # return at most limit * number_of_endpoints results (rough cap); every
    # STAC endpoint counts, not just "STAC is configured".
    max_res = limit * (len(stac_eps) + len(record_eps) + len(ckan_eps) + int(use_cmr))
    return results[:max(1, max_res)]

# =========
# Scorer
# =========
def score(asset: GeoAsset,
          query_terms: List[str],
          bbox: Optional[Tuple[float,float,float,float]] = None,
          time_range: Optional[Tuple[Optional[str],Optional[str]]] = None) -> float:
    """Heuristic relevance score for one asset (higher is better).

    The score is the sum of:
      * 1.0 for every non-empty query term found (case-insensitively, as a
        substring) in the title, abstract or keywords;
      * the intersection-over-union of the asset bbox and the query bbox;
      * 0.2 if a time range was requested and the asset has any time bound;
      * 0.2 if the license text contains "by".
    """
    haystack = " ".join((
        asset.title or "",
        asset.abstract or "",
        " ".join(asset.keywords or []),
    )).lower()

    text_score = 0.0
    for term in query_terms:
        if term and term.lower() in haystack:
            text_score += 1.0

    st_score = 0.0
    if bbox and asset.bbox:
        ax1, ay1, ax2, ay2 = asset.bbox
        bx1, by1, bx2, by2 = bbox
        overlap_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
        overlap_h = max(0.0, min(ay2, by2) - max(ay1, by1))
        overlap = overlap_w * overlap_h
        if overlap > 0:
            union = (ax2-ax1)*(ay2-ay1) + (bx2-bx1)*(by2-by1) - overlap
            if union > 0:
                st_score += overlap / union

    # Flat bump: the asset only needs *some* temporal bound, overlap with the
    # requested window is not checked.
    if time_range:
        bounds = asset.datetime
        if bounds and (bounds[0] or bounds[1]):
            st_score += 0.2

    if asset.license and "by" in asset.license.lower():
        license_bonus = 0.2
    else:
        license_bonus = 0.0
    return text_score + st_score + license_bonus


In [3]:
def discover(query:str="", bbox=None, time_range=None, limit=6, providers=None):
    """Fan a query out to every configured catalog and concatenate the results.

    NOTE(review): this cell redefines ``discover``; an earlier cell in this
    notebook defines a function with the same name, and whichever cell runs
    last wins.

    Args:
        query: Free-text search terms.
        bbox: Optional ``(minx, miny, maxx, maxy)`` filter.
        time_range: Optional ``(start, end)`` filter.
        limit: Maximum number of results requested from each endpoint.
        providers: Mapping with optional keys ``"stac"`` (list of URLs),
            ``"records"`` (list of URLs), ``"ckan"`` (list of
            ``(base_url, api_key)`` pairs) and ``"cmr"`` (truthy to enable).
            ``None`` selects the built-in defaults.

    Returns:
        Results in provider order (STAC, Records, CKAN, CMR). A provider that
        raises is skipped with a ``RuntimeWarning`` instead of aborting the
        whole discovery.
    """
    import warnings

    if providers is None:
        # Built per call so the default can never be mutated across calls.
        # No placeholder API key is sent; pass a real one via ``providers``.
        providers = dict(
            stac=["https://planetarycomputer.microsoft.com/api/stac/v1"],
            records=[],  # add your Records endpoints here
            ckan=[("https://api.gsa.gov/technology/datagov/v3/action", None)],
            cmr=True,
        )
    stac_eps = list(providers.get("stac") or [])
    record_eps = list(providers.get("records") or [])
    ckan_eps = list(providers.get("ckan") or [])
    use_cmr = bool(providers.get("cmr"))

    def _attempt(label, call):
        """Run one provider search; warn and return [] if it fails."""
        try:
            return call()
        except Exception as exc:
            warnings.warn(f"discover: {label} search failed: {exc!r}",
                          RuntimeWarning, stacklevel=3)
            return []

    results = []
    # STAC
    for ep in stac_eps:
        results += _attempt(f"STAC {ep}", lambda: search_stac(
            ep, q=query, bbox=bbox, time_range=time_range, limit=limit))
    # OGC Records
    for ep in record_eps:
        results += _attempt(f"OGC Records {ep}", lambda: search_ogc_records(
            ep, q=query, bbox=bbox, time_range=time_range, limit=limit))
    # CKAN
    for base, key in ckan_eps:
        results += _attempt(f"CKAN {base}", lambda: search_ckan(
            base, api_key=key, q=query, limit=limit))
    # CMR
    if use_cmr:
        results += _attempt("CMR", lambda: search_cmr_collections(
            query, bbox=bbox, time_range=time_range, limit=limit))

    # Keep the historical cap of limit*4, but let it grow when more than four
    # endpoints are configured so later providers are not truncated away.
    n_sources = len(stac_eps) + len(record_eps) + len(ckan_eps) + int(use_cmr)
    return results[: (limit * max(4, n_sources))]


In [4]:
def score(asset: GeoAsset, query_terms: List[str], bbox=None, time_range=None):
    """Heuristic relevance score for one asset (higher is better).

    NOTE(review): this cell redefines ``score``; an earlier cell in this
    notebook defines a function with the same name, and whichever cell runs
    last wins. The logic here is kept in line with that definition.

    The score is the sum of:
      * 1.0 for every non-empty query term found (case-insensitively, as a
        substring) in the title, abstract or keywords;
      * the intersection-over-union of the asset bbox and the query bbox;
      * 0.2 if a time range was requested and the asset has any time bound;
      * 0.2 if the license text contains "by".
    """
    # Every text field may be None/empty on records from some catalogs.
    text = " ".join([
        asset.title or "",
        asset.abstract or "",
        " ".join(asset.keywords or []),
    ]).lower()
    text_score = sum(1.0 for t in query_terms if t and t.lower() in text)

    st_score = 0.0
    if bbox and asset.bbox:
        # reward overlap (intersection over union of the two rectangles)
        ax1, ay1, ax2, ay2 = asset.bbox; bx1, by1, bx2, by2 = bbox
        inter = max(0.0, min(ax2,bx2)-max(ax1,bx1)) * max(0.0, min(ay2,by2)-max(ay1,by1))
        if inter > 0:
            area = (ax2-ax1)*(ay2-ay1) + (bx2-bx1)*(by2-by1) - inter
            if area > 0: st_score += inter/area

    # Flat bump when the asset has any temporal bound (start or end) and the
    # user asked for a time range; overlap with that range is not checked.
    if time_range and asset.datetime and (asset.datetime[0] or asset.datetime[1]):
        st_score += 0.2

    license_bonus = 0.2 if (asset.license and "by" in asset.license.lower()) else 0.0
    return text_score + st_score + license_bonus


In [22]:
def print_assets(assets):
    """Print a compact, human-readable card for each asset.

    Args:
        assets: Iterable of objects exposing ``title``, ``provider``,
            ``source``, ``bbox``, ``datetime``, ``links`` and ``abstract``.

    Missing values are shown as an em dash. The abstract is whitespace-
    normalized and truncated to 220 characters, with an ellipsis only when
    text was actually cut.
    """
    def fmt_time(dt):
        """Render a (start, end) pair as 'YYYY-MM-DD → YYYY-MM-DD'."""
        if not dt: return "—"
        s, e = dt
        s = s.split("T")[0] if s else "—"
        e = e.split("T")[0] if e else "—"
        return f"{s} → {e}"

    for i, a in enumerate(assets, 1):
        print(f"{i}) {a.title}")
        print(f"   Provider: {a.provider or '—'} · Source: {a.source}")
        print(f"   BBOX: {a.bbox or '—'}")
        print(f"   Time: {fmt_time(a.datetime)}")
        # The landing entry may be present but empty, so use `or`, not a
        # dict default.
        print(f"   Landing: {(a.links or {}).get('landing') or '—'}")
        if a.abstract:
            text = " ".join(a.abstract.split())
            if text:
                # Compare against the normalized text that is actually shown.
                print(f"   Notes: {text[:220]}{'…' if len(text) > 220 else ''}")
        print()


In [23]:
# Query: land-cover datasets over Illinois, 2020 onwards.
q = "land cover for Illinois"
bbox = (-91.6, 36.9, -87.4, 42.5)  # IL envelope (minlon, minlat, maxlon, maxlat)
timer = ("2020-01-01", "2025-11-06")  # resolved (start, end) range
cards = discover(query=q, bbox=bbox, time_range=timer)
# Rank best-first and keep the ten highest-scoring assets.
cards = sorted(
    cards,
    key=lambda a: score(a, q.lower().split(), bbox, timer),
    reverse=True,
)[:10]
print_assets(cards)

1) MODIS/Terra Near Real Time (NRT) Calibrated Radiances 5-Min L1B Swath 1km
   Provider: NASA/GSFC/EOS/ESDIS/LANCEMODIS · Source: cmr
   KEYWORDS: —
   BBOX: (-180.0, -90.0, 180.0, 90.0)
   Time: 2015-12-06 → —
   Landing: https://earthdata.nasa.gov/earth-observation-data/near-real-time/download-nrt-data/modis-nrt
   Notes: The MODIS Level 1B Near Real Time (NRT) data set contains calibrated and geolocated at-aperture radiances for 36 discrete bands located in the 0.4 to 14.4 micron region of electromagentic spectrum. These data are generat…

2) MODIS/Terra Near Real Time (NRT) Calibrated Radiances 5-Min L1B Swath 500m
   Provider: NASA/GSFC/EOS/ESDIS/LANCEMODIS · Source: cmr
   KEYWORDS: —
   BBOX: (-180.0, -90.0, 180.0, 90.0)
   Time: 2015-12-06 → —
   Landing: https://earthdata.nasa.gov/earth-observation-data/near-real-time/download-nrt-data/modis-nrt
   Notes: The 500 meter MODIS Level 1B Near Real Time (NRT) data set contains calibrated and geolocated at-aperture radiances for 7 

In [11]:
# Query: spatial computing / GeoAI applications over the continental U.S.
q = "spatial computing and geospatial artificial intelligence applications"
bbox = (-125.0, 25.0, -66.5, 49.5)  # Continental U.S.
timer = ("2020-01-01", "2025-11-06")  # (start, end)
cards = discover(query=q, bbox=bbox, time_range=timer)
# Rank best-first and keep the ten highest-scoring assets.
cards = sorted(
    cards,
    key=lambda a: score(a, q.lower().split(), bbox, timer),
    reverse=True,
)[:10]
cards

[]

In [24]:
# Query: land cover / land use change analysis over the U.S., 2010 onwards.
q = "land cover and land use change analysis"
bbox = (-125.0, 25.0, -66.5, 49.5)  # U.S. coverage
timer = ("2010-01-01", "2025-11-06")  # (start, end)
cards = discover(query=q, bbox=bbox, time_range=timer)
# Rank best-first and keep the ten highest-scoring assets.
cards = sorted(
    cards,
    key=lambda a: score(a, q.lower().split(), bbox, timer),
    reverse=True,
)[:10]
print_assets(cards)

1) VEMAP 2: Annual Ecosystem Model Responses to U.S. Climate Change, 1994-2100
   Provider: ORNL_DAAC · Source: cmr
   KEYWORDS: —
   BBOX: (-124.0, 26.0, -66.0, 50.0)
   Time: 1994-01-01 → 2100-12-31
   Landing: https://search.earthdata.nasa.gov/search?q=vemap-2_results_annual_766&ac=true
   Notes: The Vegetation-Ecosystem Modeling and Analysis Project (VEMAP) was a large, collaborative, multi-institutional, international effort whose goal was to evaluate the sensitivity of terrestrial ecosystem and vegetation proc…

2) VEMAP 2: Monthly Ecosystem Model Responses to U.S. Climate Change, 1994-2100
   Provider: ORNL_DAAC · Source: cmr
   KEYWORDS: —
   BBOX: (-124.0, 26.0, -66.0, 50.0)
   Time: 1994-01-01 → 2100-12-31
   Landing: https://search.earthdata.nasa.gov/search?q=vemap-2_results_monthly_767&ac=true
   Notes: The Vegetation-Ecosystem Modeling and Analysis Project (VEMAP) was a large, collaborative, multi-institutional, international effort whose goal was to evaluate the sensitivi

In [14]:
# Query: water sustainability and land use change in the Upper Mississippi Basin.
q = "land use change and climate variability impacts on water availability in Upper Mississippi River Basin"
bbox = (-96.5, 40.0, -89.0, 47.5)  # Upper Mississippi River Basin extent
timer = ("2000-01-01", "2025-11-06")  # (start, end)

cards = discover(query=q, bbox=bbox, time_range=timer)
# Rank best-first and keep the ten highest-scoring assets.
cards = sorted(
    cards,
    key=lambda a: score(a, q.lower().split(), bbox, timer),
    reverse=True,
)[:10]
cards

[]

In [21]:
# Query: dam impacts on river hydrology and land cover, continental U.S., 1980 onwards.
q = "dam impact on river hydrology and land cover change"
bbox = (-125.0, 25.0, -66.5, 49.5)  # continental U.S.
timer = ("1980-01-01", "2025-11-06")  # (start, end)
cards = discover(query=q, bbox=bbox, time_range=timer)
# Rank best-first and keep the ten highest-scoring assets.
cards = sorted(
    cards,
    key=lambda a: score(a, q.lower().split(), bbox, timer),
    reverse=True,
)[:10]
cards

[]

In [19]:
%pip install pystac_client

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [41]:
import os, json, re, time, requests
from typing import Optional, Tuple, List, Dict, Any

class NLQueryError(Exception):
    """Raised when a natural-language query cannot be turned into search parameters."""

def _iso_date(s: Optional[str]) -> Optional[str]:
    if not s: return None
    s = s.strip()
    # Accept YYYY-MM-DD (optionally with time); coerce to YYYY-MM-DD
    m = re.match(r"^\s*(\d{4})-(\d{2})-(\d{2})", s)
    return f"{m.group(1)}-{m.group(2)}-{m.group(3)}" if m else None

def _valid_bbox(b: Optional[List[float]]) -> Optional[Tuple[float,float,float,float]]:
    if not b or len(b) < 4: return None
    x1,y1,x2,y2 = map(float, b[:4])
    if not (-180.0 <= x1 <= 180.0 and -180.0 <= x2 <= 180.0 and -90.0 <= y1 <= 90.0 and -90.0 <= y2 <= 90.0):
        return None
    if x2 <= x1 or y2 <= y1:
        return None
    return (x1,y1,x2,y2)

def get_q_bbox_timer_openai(
    user_query: str,
    *,
    current_date: str,                         # e.g., "2025-11-06"
    api_base: str = "https://anvilgpt.rcac.purdue.edu/api/v1",
    api_key: Optional[str] = None,             # falls back to the OPENAI_API_KEY env var
    model: str = "gpt-oss:120b",
    timeout: int = 20,
    default_bbox: Optional[Tuple[float,float,float,float]] = None,   # e.g., continental US
    default_timer: Optional[Tuple[Optional[str],Optional[str]]] = None, # e.g., ("2000-01-01", current_date)
    max_retries: int = 2
) -> Tuple[str, Optional[Tuple[float,float,float,float]], Optional[Tuple[Optional[str],Optional[str]]]]:
    """
    Convert a natural-language query into (q, bbox, timer) using an OpenAI-compatible chat endpoint.

    Args:
        user_query:    the free-text question to normalize
        current_date:  "today" as YYYY-MM-DD, used to resolve relative phrases like "last 3 years"
        api_base:      base URL of the chat-completions API
        api_key:       bearer token; when omitted or empty, OPENAI_API_KEY is read from the environment
        model:         model name sent to the endpoint
        timeout:       per-request timeout in seconds
        default_bbox:  bbox used when the model returns none (or an invalid one)
        default_timer: (start, end) used when the model returns no dates
        max_retries:   number of additional attempts after the first one

    Returns:
        q:      str  — refined text query for metadata search
        bbox:   (minlon, minlat, maxlon, maxlat) or None
        timer:  (start_iso, end_iso) or None (each may be None)

    Raises:
        NLQueryError: if no API key is available, or if every attempt failed
            (HTTP error, unparseable reply, or empty 'q').

    Notes:
      - The first attempt requests strict JSON-schema output; later attempts use a
        plain prompt-only fallback for servers that reject `response_format`.
      - Replies wrapped in extra text (e.g. a Markdown code fence) are tolerated:
        the outermost {...} span is parsed.
    """
    api_key = api_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise NLQueryError("Missing API key (set OPENAI_API_KEY or pass api_key=).")

    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    system_prompt = f"""
You are a geospatial query normalizer.

Given a user’s natural-language question, produce:
- q: a concise keyword query (~3–12 words) that captures the theme/topic only (no place names, no dates).
- bbox: [minlon, minlat, maxlon, maxlat] in EPSG:4326 if a place/region is implied; else null.
- timer: [start_iso, end_iso] where dates are YYYY-MM-DD. Resolve relative dates using today = {current_date}. 
  If no time window is implied, return [null, null].

Extraction rules:
- Split theme vs. location: Remove toponyms (countries, states, provinces, cities, rivers, basins, parks, etc.) 
  from q and reflect them only in bbox.
  Example: "dams in illinois" → q: "dams" and bbox: <Illinois bbox>.
- Split theme vs. time: Remove temporal phrases from q and reflect them only in timer.
  Example: "since 2010", "2018–2020", "last 3 years", "recent".
- Normalize the theme:
  • Keep core nouns/verbs (e.g., "dam safety", "land cover change", "streamflow trends").
  • Prefer base forms; avoid overly broad filler words.
  • Keep domain modifiers that are not locations or dates (e.g., "aging", "sedimentation", "hydropower").
- Bounding box:
  • Prefer the smallest reasonable bbox that contains the named place (city/county/state/basin if specified; 
    country/continent if broad).
  • If multiple places are mentioned and a single bbox is required, return the bbox of their combined extent 
    (min enclosing rectangle). If dominance is clear (e.g., "X near Y"), use the dominant place.
  • Never hallucinate: only output a bbox if you are reasonably certain. Otherwise, return null.
- Time normalization:
  • "recent" ⇒ last 5 years from today.
  • "last N years" ⇒ [today − N years + 1 day, today].
  • Single year "YYYY" ⇒ ["YYYY-01-01","YYYY-12-31"].
  • Range "YYYY–YYYY" ⇒ ["start-01-01","end-12-31"].
- Output must be valid JSON exactly matching the schema (no extra keys, no commentary).

Output schema (must match exactly):
{{
  "q": "string",
  "bbox": [minlon, minlat, maxlon, maxlat] | null,
  "timer": [start_iso_or_null, end_iso_or_null]
}}

Mini examples (for behavior only; do not copy text):
- "dams in illinois" → q: "dams", bbox: <Illinois bbox>, timer: [null,null]
- "wildfire risk near Sacramento 2018–2020" → q: "wildfire risk", bbox: <Sacramento metro bbox>, timer: ["2018-01-01","2020-12-31"]
- "recent land cover change in Kenya" → q: "land cover change", bbox: <Kenya bbox>, timer: [today−5y+1d, today]
- "streamflow trends" (no place, no time) → q: "streamflow trends", bbox: null, timer: [null,null]
"""


    # JSON schema for strict mode
    json_schema = {
        "name": "GeoQuery",
        "schema": {
            "type": "object",
            "properties": {
                "q": {"type": "string"},
                "bbox": {
                    "type": ["array", "null"],
                    "items": {"type": "number"},
                    "minItems": 4, "maxItems": 4
                },
                "timer": {
                    "type": ["array", "null"],
                    "items": {"type": ["string", "null"]},
                    "minItems": 2, "maxItems": 2
                }
            },
            "required": ["q", "bbox", "timer"],
            "additionalProperties": False
        }
    }

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query}
    ]

    body_strict = {
        "model": model,
        "messages": messages,
        "temperature": 0,
        "response_format": {"type": "json_schema", "json_schema": json_schema}
    }

    body_fallback = {
        "model": model,
        "messages": messages + [
            {"role":"system","content":"Respond ONLY with JSON like {\"q\":\"...\",\"bbox\":[minlon,minlat,maxlon,maxlat|null],\"timer\":[start_iso,end_iso]}."}
        ],
        "temperature": 0
    }

    last_err = None
    for attempt in range(max_retries+1):
        try:
            use_body = body_strict if attempt == 0 else body_fallback
            resp = requests.post(f"{api_base.rstrip('/')}/chat/completions",
                                 headers=headers, json=use_body, timeout=timeout)
            resp.raise_for_status()
            content = resp.json()["choices"][0]["message"]["content"]

            # Parse the reply; if the model wrapped the JSON in extra text
            # (commonly a ```json fence), retry on the outermost {...} span.
            text = (content or "").strip()
            try:
                data = json.loads(text)
            except ValueError:
                lo, hi = text.find("{"), text.rfind("}")
                if lo == -1 or hi <= lo:
                    raise
                data = json.loads(text[lo:hi+1])
            if not isinstance(data, dict):
                raise NLQueryError("Model reply is not a JSON object.")

            # Validate & normalize
            q = str(data.get("q") or "").strip()
            if not q:
                raise NLQueryError("Model returned empty 'q'.")

            # A malformed bbox is treated as "no bbox" rather than as a
            # reason to repeat the whole model call.
            try:
                bbox = _valid_bbox(data.get("bbox"))
            except (TypeError, ValueError):
                bbox = None

            timer_raw = data.get("timer")
            timer: Optional[Tuple[Optional[str],Optional[str]]] = None
            if isinstance(timer_raw, (list, tuple)) and len(timer_raw) >= 2:
                s = _iso_date(timer_raw[0]) if isinstance(timer_raw[0], str) else None
                e = _iso_date(timer_raw[1]) if isinstance(timer_raw[1], str) else None
                timer = (s, e)

            # Apply defaults if missing
            if bbox is None:
                bbox = default_bbox
            if (timer is None or (timer[0] is None and timer[1] is None)) and default_timer:
                timer = default_timer

            return q, bbox, timer

        except Exception as e:
            last_err = e
            # brief backoff before the next attempt (pointless after the last one)
            if attempt < max_retries:
                time.sleep(0.4)

    raise NLQueryError(f"Failed to parse query after retries: {last_err}")



In [28]:
# Example: let the model split a free-text request into (q, bbox, timer).
import os

q, bbox, timer = get_q_bbox_timer_openai(
    "dam impact on downstream flow regime in the Upper Colorado River since 2000",
    current_date="2025-11-06",
    api_base="https://anvilgpt.rcac.purdue.edu/api/v1",          # or your OpenAI-compatible base
    api_key=os.environ.get("OPENAI_API_KEY"),      # read the key from the environment; never hard-code it
    model="gpt-oss:120b",
    default_bbox=(-125.0, 25.0, -66.5, 49.5),      # fallback: CONUS
    default_timer=("2000-01-01", "2025-11-06")
)

# Then plug into your discovery:
cards  = discover(q, bbox, timer)
cards  = sorted(cards, key=lambda a: -score(a, q.lower().split(), bbox, timer))[:10]


In [42]:

# Example: let the model split a free-text request into (q, bbox, timer).
import os

q, bbox, timer = get_q_bbox_timer_openai(
    "dams in Illinois",
    current_date="2025-11-06",
    api_base="https://anvilgpt.rcac.purdue.edu/api/v1",          # or your OpenAI-compatible base
    api_key=os.environ.get("OPENAI_API_KEY"),      # read the key from the environment; never hard-code it
    model="gpt-oss:120b",
    default_bbox=(-125.0, 25.0, -66.5, 49.5),      # fallback: CONUS
    default_timer=("2000-01-01", "2025-11-06")
)

# Then plug into your discovery:
cards  = discover(q, bbox, timer)
cards  = sorted(cards, key=lambda a: -score(a, q.lower().split(), bbox, timer))[:10]
print(q, bbox, timer)
print_assets(cards)

dams (-91.5131, 36.9703, -87.4948, 42.5083) ('2000-01-01', '2025-11-06')
1) Global Reservoir and Dam Database, Version 1 (GRanDv1): Dams, Revision 01
   Provider: ESDIS · Source: cmr
   KEYWORDS: —
   BBOX: (-153.03, -45.88, 176.82, 70.4)
   Time: 2011-01-01 → 2011-12-31
   Landing: https://sedac.ciesin.columbia.edu/data/set/grand-v1-dams-rev01/maps/services
   Notes: The Global Reservoir and Dam Database, Version 1, Revision 01 (v1.01) contains 6,862 records of reservoirs and their associated dams with a cumulative storage capacity of 6,197 cubic km. The dams were geospatially refere…

2) Global Reservoir and Dam Database, Version 1 (GRanDv1): Reservoirs, Revision 01
   Provider: ESDIS · Source: cmr
   KEYWORDS: —
   BBOX: (-153.04, -45.88, 176.83, 70.4)
   Time: 2011-01-01 → 2011-12-31
   Landing: https://sedac.ciesin.columbia.edu/data/set/grand-v1-reservoirs-rev01/maps/services
   Notes: Global Reservoir and Dam Database, Version 1, Revision 01 (v1.01) contains 6,862 records of rese

In [34]:
# Show the current values of the query text, bounding box and time window.
print(q, bbox, timer)

land cover land use change (-125.0, 25.0, -66.5, 49.5) ('2000-01-01', '2025-11-06')
