# Mirror Staged Landsat L1C Demo Collections


**Purpose:** This notebook provides a way to mirror hand-curated demo collections and items from the staging catalog to the production catalog. The original curation work is captured in the veda-data repository at [veda-data/transformation-scripts/landsat-lakes-discovery](https://github.com/NASA-IMPACT/veda-data/tree/main/transformation-scripts/landsat-lakes-discovery), but because these collections have been corrected over time, this notebook uses the staging STAC catalog as the source of truth.

> Assertions
> (1) The STAC Item assets are hosted externally so cannot be ingested via VEDA's discovery pipeline AND
> (2) The staging (or source) catalog has extensive hand-curated metadata that we want to mirror exactly rather than re-create.
> (3) We are making some one-off corrections to some one-off hand-curated collections; read the code comments before attempting to generalize this notebook for wider usage.

In [None]:
import json
import os
from getpass import getpass
from pathlib import Path

import requests
from pystac import Collection, Item
from pystac_client import Client

# Source (staging) STAC API that is mirrored, and the target STAC / ingest APIs that receive the copy
SRC_STAC_API_URL = "https://staging-stac.delta-backend.com"
TARGET_STAC_API_URL = "https://test.openveda.cloud/api/stac"
TARGET_INGEST_API_URL = "https://test.openveda.cloud/api/ingest"

# Client for the source catalog; the mirroring cells below read collections and items through it
catalog = Client.open(SRC_STAC_API_URL)

# Do not paste a bearer token into the notebook (it would be saved with the file):
# read it from the VEDA_INGEST_TOKEN environment variable, or prompt for it without echoing.
TOKEN = os.environ.get("VEDA_INGEST_TOKEN") or getpass(
    "Bearer token for the target ingest API: "
)
authorization_header = f"Bearer {TOKEN}"
headers = {
    "Authorization": authorization_header,
    "content-type": "application/json",
    "accept": "application/json",
}

# Call the ingest API's auth/me endpoint with these headers; the displayed reason
# shows whether the authenticated request was accepted before any writes are attempted
authme_url = f"{TARGET_INGEST_API_URL}/auth/me"
response = requests.get(authme_url, headers=headers)
response.reason

In [None]:
# IDs of the hand-curated Landsat collections to copy from the source catalog
src_collection_ids = [
    "landsat-c2l2-sr-antarctic-glaciers-pine-island",
    "landsat-c2l2-sr-antarctic-glaciers-thwaites",
    "landsat-c2l2-sr-lakes-aral-sea",
    "landsat-c2l2-sr-lakes-lake-balaton",
    "landsat-c2l2-sr-lakes-lake-biwa",
    "landsat-c2l2-sr-lakes-tonle-sap",
    "landsat-c2l2-sr-lakes-vanern",
]

## Part 1 Mirror the collection metadata

1. Strip hierarchical and self links to the src catalog
2. Strip `cog_default` item_asset (a bug for these landsat collections)
3. Save file for veda-data/ingestion_inputs/production/collections
4. Validate and write collection to target catalog via the authenticated `{TARGET_INGEST_API_URL}/collections` endpoint

In [None]:
# When True nothing is sent to the target API; the POST that would be made is only printed.
# The corrected collection JSON files are written to disk in either mode.
dry_run = True

# The ingest endpoint is the same for every collection, so build it once
publish_url = f"{TARGET_INGEST_API_URL}/collections"

for collection_id in src_collection_ids:

    # We will also archive the corrected collection to the veda-data repo
    outfile = f"{collection_id}.json"

    src_collection = catalog.get_collection(collection_id)

    # Strip the catalog links that are dynamically rendered by the source STAC API
    src_collection.remove_hierarchical_links()

    # Start the new collection we will publish
    collection_dict = src_collection.to_dict(include_self_link=False)

    # Special case for these landsat collections: cog_default assets were mistakenly added to item_assets so fix it.
    # Guarded so that a collection without an item_assets object is left as-is instead of raising AttributeError.
    item_assets = collection_dict.get("item_assets")
    if item_assets is not None:
        item_assets.pop("cog_default", None)

    # Make sure the summaries object from the staged collection is included in the veda-data record (TODO check why this isn't passed through pystac to_dict)
    collection_dict["summaries"] = src_collection.summaries._summaries

    # Validate the corrected dict by round-tripping it through pystac
    collection = Collection.from_dict(collection_dict)
    collection.validate()

    # Save to file (written to the current working directory)
    Path(outfile).write_text(
        json.dumps(collection_dict, indent=4) + '\n'
    )

    # Publish to target STAC catalog
    if not dry_run:
        publish_response = requests.post(
            publish_url,
            headers=headers,
            json=collection_dict
        )
        print(f"{collection_id=} {publish_response.reason=}")
    else:
        print(f"POST {publish_url} {collection_id=} {dry_run=}")

## Part 2 Mirror the item metadata

> Instead of having a super long and unreadable loop, iterate over the source collections one more time to get items to mirror

1. Strip hierarchical and self links to the src catalog
2. Validate and write item to target catalog via the authenticated `{TARGET_INGEST_API_URL}/ingestions` endpoint (ingestor lambda validates items before loading)

In [None]:
# NOTE(review): this cell is live by default (dry_run = False) and will POST every item to the
# target ingest API, whereas the Part 1 cell defaults to dry_run = True. Presumably the target
# collections need to exist before their items are ingested - confirm before running.
dry_run = False

# The ingest endpoint is the same for every item, so build it once
publish_url = f"{TARGET_INGEST_API_URL}/ingestions"

for collection_id in src_collection_ids:

    # Fetch every item of this collection from the source catalog
    search = catalog.search(collections=[collection_id])
    src_item_collection = search.item_collection()
    print(f"Found {len(src_item_collection)} items for {collection_id=}")

    for src_item in src_item_collection.items:

        # Strip the catalog links that are dynamically rendered by the source STAC API
        src_item.remove_hierarchical_links()

        # Start the new item we will publish
        item_dict = src_item.to_dict(include_self_link=False)

        # Add collection link (the href is the bare collection ID, not a URL)
        item_dict["links"].append({
            "rel": "collection",
            "href": collection_id,
            "type": "application/json"
        })

        # Special case for these landsat collections: a cog_default asset was mistakenly added to the
        # item assets, so drop it. This is done before validation so that the document that is
        # validated is exactly the document that gets published.
        item_dict["assets"].pop("cog_default", None)

        # Validate
        item = Item.from_dict(item_dict)
        item.validate()

        # Publish to target STAC catalog
        if not dry_run:
            publish_response = requests.post(
                publish_url,
                headers=headers,
                json=item_dict
            )
            print(f"POST {publish_url} {collection_id=}\n{item_dict['id']=} {publish_response.reason=}")
        else:
            print(f"POST {publish_url} {collection_id=}\n{item_dict['id']=} {dry_run=}")

## Part 3 Check the target STAC catalog

In [None]:
# Open the source and the target catalog so the same collection can be looked up on both sides
src_catalog = Client.open(SRC_STAC_API_URL)
target_catalog = Client.open(TARGET_STAC_API_URL)

for collection_id in src_collection_ids:

    # Look the collection up by ID in each catalog
    src_collection = src_catalog.get_collection(collection_id)
    target_collection = target_catalog.get_collection(collection_id)

    # Placeholders: the actual source/target comparison is not implemented yet
    src_matched = "TODO"
    target_matched = "TODO"

    print(f"\n{collection_id} {src_matched=} {target_matched=}")

    # Only the first collection is processed while the check is unfinished
    break