# Libraries

In [None]:
from __future__ import annotations

import json

from typing import Tuple
from datetime import datetime, timezone, timedelta

from urllib.parse import urlencode
from urllib.request import urlopen

---
# General Tools

## Global Variables

In [None]:
def global_variables():
    """
    Build the configuration dictionary shared by the rest of the notebook.

    Returns:
        dict: Configuration with three keys:
            - "containers": maps each storage tier ("bronze", "silver", "gold")
              to the ABFSS root URI of its container.
            - "GFZ_BASE_URL": base URL used for GFZ index requests.
            - "NMDB_BASE_URL": base URL used for NMDB NEST requests.
    """
    # One container per tier, all in the same storage account.
    container_roots = {}
    for tier_name in ("bronze", "silver", "gold"):
        container_roots[tier_name] = f"abfss://{tier_name}@alexccrv0dcn.dfs.core.windows.net/"

    return {
        "containers": container_roots,
        "GFZ_BASE_URL": "https://kp.gfz-potsdam.de/app/json/",
        "NMDB_BASE_URL": "https://www.nmdb.eu/nest/draw_graph.php",
    }


# Module-level configuration consumed by the functions below.
VAR = global_variables()

## Support Functions 

In [None]:
def parse_utc(ts: str, is_start: bool, *, end_seconds: int = 59) -> datetime:
    """
    Convert a UTC timestamp string into a timezone-aware `datetime`.

    Two input shapes are supported:
      - date-only (`YYYY-MM-DD`): any 10-character input is parsed as a date and
        expanded to the start or the end of that day depending on `is_start`;
      - full Zulu timestamp (`YYYY-MM-DDTHH:MM:SSZ`): parsed as-is.

    Surrounding whitespace is stripped before parsing.

    Args:
        ts (str): Timestamp in `YYYY-MM-DD` or `YYYY-MM-DDTHH:MM:SSZ` format.
        is_start (bool): For date-only input, True yields 00:00:00 and False
            yields 23:59:`end_seconds`. Ignored for full timestamps.
        end_seconds (int): Seconds value of the end-of-day boundary for
            date-only input. Defaults to 59.

    Returns:
        datetime: The parsed moment with `tzinfo` set to UTC.

    Raises:
        ValueError: If `end_seconds` is outside 0..59 (checked for every call,
            whatever the input shape).
        ValueError: If `ts` matches neither supported format.
    """
    # --- SETUP AND VALIDATION ---
    cleaned = ts.strip()

    if not 0 <= end_seconds <= 59:
        raise ValueError(f"end_seconds must be in 0..59. Got: {end_seconds!r}")

    # --- LOGIC ---
    # Date-only input: pick the day boundary requested by the caller.
    if len(cleaned) == 10:
        day = datetime.strptime(cleaned, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        hour, minute, second = (0, 0, 0) if is_start else (23, 59, end_seconds)
        return day.replace(hour=hour, minute=minute, second=second)

    # Anything else must be a full Zulu timestamp.
    if not cleaned.endswith("Z"):
        raise ValueError(
            "Invalid timestamp format. Use 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SSZ'. "
            f"Got: {cleaned!r}"
        )

    # --- RETURN ---
    parsed = datetime.strptime(cleaned, "%Y-%m-%dT%H:%M:%SZ")
    return parsed.replace(tzinfo=timezone.utc)

def normalize_utc(ts: str, *, is_start: bool) -> str:
    """
    Return `ts` rewritten as a UTC Zulu timestamp string ('YYYY-MM-DDTHH:MM:SSZ').

    Parsing is delegated to `parse_utc`, so the accepted inputs are:
      - 'YYYY-MM-DD'
      - 'YYYY-MM-DDTHH:MM:SSZ'

    Rules:
      - Date-only input is expanded to a day boundary:
          is_start=True  -> 00:00:00Z
          is_start=False -> 23:59:59Z
      - A full Zulu timestamp is parsed and formatted back unchanged in value.

    Args:
      ts (str): Timestamp string in one of the accepted formats.
      is_start (bool): Which day boundary to use for date-only input.

    Returns:
      str: UTC Zulu timestamp 'YYYY-MM-DDTHH:MM:SSZ'
    """
    zulu_format = "%Y-%m-%dT%H:%M:%SZ"
    parsed = parse_utc(ts, is_start=is_start)
    return parsed.strftime(zulu_format)


def write_single_file(dir_path: str, filename: str, text: str) -> str:
    """
    Write `text` as a *single* file at `dir_path/filename` using Spark and dbutils.

    Spark writes to a directory (part-files), so we:
      1) write a one-row DataFrame to a temp dir with coalesce(1)
      2) move the single part-* file to the final target filename
      3) delete the temp dir

    An existing target is only removed once the new content has been written,
    so a failed write does not destroy the previous file. If any step fails,
    the temp dir is removed (best effort) before the error is re-raised.

    Relies on the notebook-provided globals `dbutils` and `spark`.

    Args:
        dir_path (str): Destination directory; created if missing.
        filename (str): Name of the final file inside `dir_path`.
        text (str): Content to write.

    Returns:
        str: Full path of the written file.

    Raises:
        RuntimeError: If no part-* file is found in the temp dir after the write.
    """
    base_dir = dir_path.rstrip("/")
    target = f"{base_dir}/{filename}"

    dbutils.fs.mkdirs(dir_path)

    # Unique temp dir (avoid collisions); microsecond resolution so two writes
    # of the same filename within one second do not share a temp dir.
    tmp_tag = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S%fZ")
    tmp_dir = f"{base_dir}/_tmp_{filename}_{tmp_tag}"

    try:
        # Spark write (single partition -> single part-* file)
        df = spark.createDataFrame([(text,)], ["value"])
        df.coalesce(1).write.mode("overwrite").text(tmp_dir)

        # Find the part file produced by the write
        part_files = [x.path for x in dbutils.fs.ls(tmp_dir) if x.name.startswith("part-")]
        if not part_files:
            raise RuntimeError(f"No part-* file found in {tmp_dir}")

        # Best-effort remove existing target, only now that the replacement exists
        try:
            dbutils.fs.rm(target)
        except Exception:
            pass

        dbutils.fs.mv(part_files[0], target, True)
    except Exception:
        # Do not leave the temp dir behind on failure; the original error wins.
        try:
            dbutils.fs.rm(tmp_dir, True)
        except Exception:
            pass
        raise

    dbutils.fs.rm(tmp_dir, True)

    return target

### Geomagnetic Disturbance (GFZ)

In [None]:
def getKpindex(
    starttime: str,
    endtime: str,
    index: str,
    status: str = "all",
) -> Tuple[tuple, tuple, tuple]:
    """
    Fetch one index series from the GFZ Potsdam JSON service (kp.gfz-potsdam.de).

    Args:
        starttime: 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SSZ' (UTC). A date-only value
            starts at 00:00:00Z.
        endtime:   'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SSZ' (UTC). A date-only value
            ends at 23:59:59Z.
        index: One of: 'Kp', 'ap', 'Ap', 'Cp', 'C9', 'Hp30', 'Hp60',
               'ap30', 'ap60', 'SN', 'Fobs', 'Fadj'.
        status: 'all' or 'def'. 'def' is only forwarded to the service for the
            indices that carry a status (Kp, ap, Ap, Cp, C9, SN).

    Returns:
        (datetime_values, index_values, status_values)
        - datetime_values: tuple built from the "datetime" field of the response
        - index_values: tuple built from the field named after `index`
        - status_values: tuple built from the "status" field, or () for the
          indices without a status
        A field missing from the response yields an empty tuple.

    Raises:
        ValueError: on an unknown `index`/`status`, a bad timestamp format, or
            a start time later than the end time.
        RuntimeError: when the HTTP request or the JSON decoding fails.
    """
    # --- SETUP AND VALIDATION ---
    indices_without_status = {"Hp30", "Hp60", "ap30", "ap60", "Fobs", "Fadj"}
    indices_with_status = {"Kp", "ap", "Ap", "Cp", "C9", "SN"}
    known_indices = indices_with_status | indices_without_status

    if index not in known_indices:
        raise ValueError(
            "Wrong index parameter. Allowed: "
            "'Kp','ap','Ap','Cp','C9','Hp30','Hp60','ap30','ap60','SN','Fobs','Fadj'. "
            f"Got: {index!r}"
        )

    if status not in {"all", "def"}:
        raise ValueError(f"Wrong status parameter. Allowed: 'all', 'def'. Got: {status!r}")

    # Date-only bounds are widened so the window covers whole days.
    window_start = parse_utc(starttime, is_start=True)
    window_end = parse_utc(endtime, is_start=False, end_seconds=59)

    if window_start > window_end:
        raise ValueError(
            f"Start time must be <= end time. Got: {window_start.isoformat()} > {window_end.isoformat()}"
        )

    # --- LOGIC ---
    zulu_format = "%Y-%m-%dT%H:%M:%SZ"
    query_parts = [
        f"start={window_start.strftime(zulu_format)}",
        f"end={window_end.strftime(zulu_format)}",
        f"index={index}",
    ]

    # The status filter is only sent for indices that actually have a status.
    has_status = index not in indices_without_status
    if has_status and status == "def":
        query_parts.append("status=def")

    url = f"{VAR['GFZ_BASE_URL']}?" + "&".join(query_parts)

    try:
        with urlopen(url, timeout=30) as resp:
            raw_body = resp.read()
        data = json.loads(raw_body.decode("utf-8"))
    except Exception as exc:
        raise RuntimeError(f"Failed to fetch/parse GFZ response. URL={url!r}. Error: {exc}") from exc

    datetime_values = tuple(data.get("datetime", ()))
    index_values = tuple(data.get(index, ()))
    status_values = tuple(data.get("status", ())) if has_status else ()

    # --- RETURN ---
    return datetime_values, index_values, status_values


### Cosmic-Ray Variability (NMDB)

In [None]:
def getNMDBnest(starttime: str, endtime: str, stations:list) -> str:
    """
    Fetch NMDB NEST neutron monitor data as raw ASCII text for a given UTC window.

    Accepts timestamps in either:
      - 'YYYY-MM-DD'
      - 'YYYY-MM-DDTHH:MM:SSZ'

    If a date-only value is provided:
      - starttime uses 00:00:00Z
      - endtime uses 23:59:59Z

    Only year, month, day, hour and minute are sent to NEST; the seconds
    component of the window is not part of the request.

    Args:
        starttime (str): Start time in 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SSZ' (UTC).
        endtime (str): End time in 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SSZ' (UTC).
        stations (list): Station codes to request, sent as repeated
            `stations[]` query parameters. A single code may also be given as
            a plain string.

    Returns:
        str: The response body decoded as UTF-8 (undecodable bytes replaced).

    Raises:
        ValueError: If `stations` is empty or None, a timestamp format is
            invalid, or endtime < starttime.
        urllib.error.URLError: If the HTTP request fails (network/DNS/etc.).
        urllib.error.HTTPError: If NMDB returns an HTTP error status.
    """
    # --- SETUP AND VALIDATION ---
    # Work on a private list so the caller's object is never touched, and
    # reject an empty selection instead of sending a request with no station.
    if stations is None:
        station_codes = []
    elif isinstance(stations, str):
        station_codes = [stations]
    else:
        station_codes = list(stations)
    if not station_codes:
        raise ValueError("stations must contain at least one station code.")

    start_dt = parse_utc(starttime, is_start=True)
    end_dt = parse_utc(endtime, is_start=False)
    if end_dt < start_dt:
        raise ValueError("endtime must be >= starttime.")

    params = [
        ("formchk", "1"),
        ("stations[]", station_codes),
        ("tabchoice", "revori"),
        ("dtype", "corr_for_pressure"),
        ("tresolution", "30"),
        ("date_choice", "bydate"),

        ("start_year", f"{start_dt.year:04d}"),
        ("start_month", f"{start_dt.month:02d}"),
        ("start_day", f"{start_dt.day:02d}"),
        ("start_hour", f"{start_dt.hour:d}"),
        ("start_min", f"{start_dt.minute:d}"),

        ("end_year", f"{end_dt.year:04d}"),
        ("end_month", f"{end_dt.month:02d}"),
        ("end_day", f"{end_dt.day:02d}"),
        ("end_hour", f"{end_dt.hour:d}"),
        ("end_min", f"{end_dt.minute:d}"),

        ("output", "ascii"),
        ("yunits", "0"),
        ("anomalous", "1"),
        ("display_null", "1"),
    ]

    # --- LOGIC ---
    # doseq=True expands the station list into one `stations[]` entry per code.
    url = VAR["NMDB_BASE_URL"] + "?" + urlencode(params, doseq=True)

    with urlopen(url, timeout=30) as resp:
        text = resp.read().decode("utf-8", errors="replace")

    # --- RETURN ---
    return text

## Saving data

In [None]:
def save_gfz(startdate: str, enddate: str, idx:str = "Hp30") -> str:
    """
    Download one GFZ index series and store it as a single CSV file in Bronze.

    The series is requested through `getKpindex` with status "def". The CSV has
    a `datetime,<idx>` header followed by one `datetime,value` row per sample.

    Inputs:
      startdate, enddate: 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SSZ'
        - If date-only: start is treated as 00:00:00Z, end as 23:59:59Z
      idx: name of the GFZ index to download (defaults to "Hp30")
    Output:
      Path to the written file, located under `<bronze>/space_weather/gfz`.
      The filename carries the index name, the start/end dates and a UTC run tag.
    """
    out_dir = VAR["containers"]["bronze"].rstrip("/") + "/space_weather/gfz"

    # Normalized bounds: used for the request and (date part only) the filename.
    start_iso = normalize_utc(startdate, is_start=True)
    end_iso = normalize_utc(enddate, is_start=False)

    # Run tag is taken before the download, so it marks when the run started.
    run_tag = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    timestamps, values, _ = getKpindex(start_iso, end_iso, idx, "def")

    header = f"datetime,{idx}"
    rows = [f"{stamp},{value}" for stamp, value in zip(timestamps, values)]
    csv_text = "\n".join([header, *rows])

    filename = f"gfz_index-{idx}_start-{start_iso[:10]}_end-{end_iso[:10]}_tag-{run_tag}.csv"
    return write_single_file(out_dir, filename, csv_text)


def save_nmdb(startdate: str, enddate: str, stations: list | None = None) -> str:
    """
    Save NMDB NEST raw ASCII for the given stations to a single TXT file in Bronze.

    Inputs:
      startdate, enddate: 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SSZ'
        - If date-only: start is treated as 00:00:00Z, end as 23:59:59Z
      stations: station codes to request. Defaults to JUNG1, OULU and ROME when
        omitted (None). A single code may be given as a plain string. The
        caller's list is never modified.
    Output:
      Path to the written file, located under `<bronze>/space_weather/nmdb`.
      The filename carries the alphabetically sorted station codes, the
      start/end dates and a UTC run tag.

    Raises:
      ValueError: If `stations` is an empty collection.
    """
    # Resolve the station selection into a private list: avoids a shared
    # mutable default and keeps the caller's list untouched.
    if stations is None:
        station_codes = ["JUNG1", "OULU", "ROME"]
    elif isinstance(stations, str):
        station_codes = [stations]
    else:
        station_codes = list(stations)
    if not station_codes:
        raise ValueError("stations must contain at least one station code.")

    bronze_root = VAR["containers"]["bronze"].rstrip("/")
    out_dir = f"{bronze_root}/space_weather/nmdb"

    start_iso = normalize_utc(startdate, is_start=True)
    end_iso   = normalize_utc(enddate, is_start=False)

    start_day = start_iso[:10]
    end_day   = end_iso[:10]

    run_tag = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Stations are requested in the order given by the caller.
    text = getNMDBnest(start_iso, end_iso, station_codes)

    # sorted() returns a new list; list.sort() returns None and would make
    # the join below fail with TypeError.
    stations_names = "-".join(sorted(station_codes))
    filename = f"nmdb_stations-{stations_names}_start-{start_day}_end-{end_day}_tag-{run_tag}.txt"
    return write_single_file(out_dir, filename, text)


In [None]:
from datetime import datetime, timezone, timedelta

def bootstrap_2025tonow():
    """
    One-off backfill of Plan A data (GFZ via `save_gfz`, NMDB via `save_nmdb`)
    into Bronze, covering 2025-01-01 up to yesterday (UTC).

    The backfill only runs when the `space_weather` folder of the Bronze
    container is empty; a folder that is missing or cannot be listed counts as
    empty. Both savers are called with their default index/stations.

    Returns:
        dict: Written paths keyed by "gfz_hp30" and "nmdb", or an empty dict
        when the bootstrap was skipped.

    Raises:
        ValueError: If the computed end of the window is before 2025-01-01.
    """
    space_weather_root = VAR["containers"]["bronze"].rstrip("/") + "/space_weather"

    # --- emptiness check on the target area only (not the whole container) ---
    def _is_empty(path: str) -> bool:
        try:
            entry_count = len(dbutils.fs.ls(path))
        except Exception:
            # missing/unlistable -> treat as empty
            return True
        return entry_count == 0

    target_has_data = not _is_empty(space_weather_root)
    if target_has_data:
        print(f"space_weather is not empty -> skipping bootstrap. Root: {space_weather_root}")
        return {}

    # --- window: 2025-01-01 .. yesterday (UTC) ---
    yesterday_utc = datetime.now(timezone.utc) - timedelta(days=1)
    window_start = normalize_utc("2025-01-01", is_start=True)
    window_end = normalize_utc(yesterday_utc.date().isoformat(), is_start=False)

    # Zulu strings of identical layout compare chronologically as plain text.
    if window_end < "2025-01-01T00:00:00Z":
        raise ValueError(f"Computed end={window_end} is before 2025-01-01; refusing to bootstrap.")

    # GFZ is written first, then NMDB.
    written = {
        "gfz_hp30": save_gfz(window_start, window_end),
        "nmdb": save_nmdb(window_start, window_end),
    }

    print("Bootstrap complete. Written paths:")
    for label, path in written.items():
        print(f"- {label}: {path}")

    return written


----------

**Reminder note:**

Upgrade from **Plan A → Plan B** when you need **ionosphere / GNSS disruption visibility** (not just geomagnetic indices + neutron counts).
Plan A (GFZ + NMDB) covers:

* **Geomagnetic disturbance level** (Kp/ap/Hp30/Hp60) → grid/infrastructure risk context
* **Cosmic-ray background proxy** (neutron monitor counts) → soft-error / quantum correlated-error context

Plan B adds:

* **ESA Space Weather (SWE) API products** to capture **ionospheric scintillation / TEC-like disturbance signals** and **event/alarm-style space-weather products**, improving relevance for **telecom timing, GNSS reliability, and operational impact analysis**.


In [10]:
# Manual smoke test: fetch Hp30 for 2025-11-01 .. 2025-12-01 and print the raw tuples.
# Date-only bounds are expanded by getKpindex (start 00:00:00Z, end 23:59:59Z).
dt_vals, idx_vals, st_vals = getKpindex("2025-11-01", "2025-12-01", "Hp30", "def")

# st_vals is the empty tuple here: getKpindex returns no status for Hp30,
# and the "def" status filter is not sent for that index.
print(dt_vals)
print(idx_vals)
print(st_vals)


('2025-11-01T00:00:00Z', '2025-11-01T00:30:00Z', '2025-11-01T01:00:00Z', '2025-11-01T01:30:00Z', '2025-11-01T02:00:00Z', '2025-11-01T02:30:00Z', '2025-11-01T03:00:00Z', '2025-11-01T03:30:00Z', '2025-11-01T04:00:00Z', '2025-11-01T04:30:00Z', '2025-11-01T05:00:00Z', '2025-11-01T05:30:00Z', '2025-11-01T06:00:00Z', '2025-11-01T06:30:00Z', '2025-11-01T07:00:00Z', '2025-11-01T07:30:00Z', '2025-11-01T08:00:00Z', '2025-11-01T08:30:00Z', '2025-11-01T09:00:00Z', '2025-11-01T09:30:00Z', '2025-11-01T10:00:00Z', '2025-11-01T10:30:00Z', '2025-11-01T11:00:00Z', '2025-11-01T11:30:00Z', '2025-11-01T12:00:00Z', '2025-11-01T12:30:00Z', '2025-11-01T13:00:00Z', '2025-11-01T13:30:00Z', '2025-11-01T14:00:00Z', '2025-11-01T14:30:00Z', '2025-11-01T15:00:00Z', '2025-11-01T15:30:00Z', '2025-11-01T16:00:00Z', '2025-11-01T16:30:00Z', '2025-11-01T17:00:00Z', '2025-11-01T17:30:00Z', '2025-11-01T18:00:00Z', '2025-11-01T18:30:00Z', '2025-11-01T19:00:00Z', '2025-11-01T19:30:00Z', '2025-11-01T20:00:00Z', '2025-11-01T20: