# Notebook to Publish Special Collections and Start Discovery Workflow

This notebook publishes the following collections in `/ingestion-data/collections`:
- 'hls-l30-002-ej-reprocessed'
- 'hls-s30-002-ej-reprocessed'
- 'ls8-covid-19-example-data'
- 'landsat-c2l2-sr-antarctic-glaciers-pine-island'
- 'landsat-c2l2-sr-lakes-aral-sea'
- 'landsat-c2l2-sr-lakes-tonle-sap'
- 'landsat-c2l2-sr-lakes-lake-balaton'
- 'landsat-c2l2-sr-lakes-vanern'
- 'landsat-c2l2-sr-antarctic-glaciers-thwaites'
- 'landsat-c2l2-sr-lakes-lake-biwa'
- 'combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO'
- 'nceo_africa_2017'

In [1]:
import glob
import os
import json
import requests

from cognito_client import CognitoClient

The following cell builds the paths of the special collection JSON files in `/ingestion-data/collections/`, reads each file, and saves every file path together with its collection `id` to a list.

In [9]:
# Ids of the special collections this notebook publishes; each id is also the
# stem of its JSON file under ../ingestion-data/collections/.
special_collections = [
    "hls-l30-002-ej-reprocessed",
    "hls-s30-002-ej-reprocessed",
    "ls8-covid-19-example-data",
    "landsat-c2l2-sr-antarctic-glaciers-pine-island",
    "landsat-c2l2-sr-lakes-aral-sea",
    "landsat-c2l2-sr-lakes-tonle-sap",
    "landsat-c2l2-sr-lakes-lake-balaton",
    "landsat-c2l2-sr-lakes-vanern",
    "landsat-c2l2-sr-antarctic-glaciers-thwaites",
    "landsat-c2l2-sr-lakes-lake-biwa",
    "combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO",
    "nceo_africa_2017",
]

collection_file_paths = [
    f"../ingestion-data/collections/{collection}.json"
    for collection in special_collections
]
print(collection_file_paths)

# Pair every collection file with the "id" it declares; files without an "id"
# are skipped. Each file is opened in a `with` block so its handle is closed
# as soon as the JSON has been parsed.
file_paths_and_collection_ids = []
for file_path in collection_file_paths:
    with open(file_path, "r", encoding="utf-8") as collection_file:
        collection_json = json.load(collection_file)
    if "id" in collection_json:
        file_paths_and_collection_ids.append(
            {"filePath": file_path, "collectionId": collection_json["id"]}
        )

['../ingestion-data/collections/hls-l30-002-ej-reprocessed.json', '../ingestion-data/collections/hls-s30-002-ej-reprocessed.json', '../ingestion-data/collections/ls8-covid-19-example-data.json', '../ingestion-data/collections/landsat-c2l2-sr-antarctic-glaciers-pine-island.json', '../ingestion-data/collections/landsat-c2l2-sr-lakes-aral-sea.json', '../ingestion-data/collections/landsat-c2l2-sr-lakes-tonle-sap.json', '../ingestion-data/collections/landsat-c2l2-sr-lakes-lake-balaton.json', '../ingestion-data/collections/landsat-c2l2-sr-lakes-vanern.json', '../ingestion-data/collections/landsat-c2l2-sr-antarctic-glaciers-thwaites.json', '../ingestion-data/collections/landsat-c2l2-sr-lakes-lake-biwa.json', '../ingestion-data/collections/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO.json', '../ingestion-data/collections/nceo_africa_2017.json']


Set the testing mode to `True` when testing and `False` otherwise. When the testing mode is `True`, the notebook runs against the `test` endpoints; when it is `False`, it runs against the production endpoints.

In [None]:
# True selects the test endpoints, False the production endpoints (read by the
# endpoint-selection cell further down).
testing_mode = True

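The following cell compares the collection files in `/ingestion-data/collections` with the discovery items in `/ingestion-data/production/discovery-items` (point the glob at `/ingestion-data/staging/discovery-items` to use staging instead) and returns a list of all the discovery-items that have a corresponding collection. It also collects the discovery-items whose bucket is not `veda-data-store`.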

In [10]:
# Discovery-item file paths whose (first) entry names a bucket other than
# "veda-data-store". Filled in place by find_matching_file_names().
items_in_external_buckets = []


def _first_discovery_entry(discovery_json):
    """Return the dict that describes a discovery file, or None if it has none.

    A discovery file holds either a single object or a list of objects; for a
    list only the first element is inspected. Empty lists and non-dict
    payloads yield None.
    """
    if isinstance(discovery_json, list):
        discovery_json = discovery_json[0] if discovery_json else None
    return discovery_json if isinstance(discovery_json, dict) else None


def find_matching_file_names(collections_list, discovery_items_list):
    """Find the discovery-item file that belongs to each collection file.

    Args:
        collections_list: paths of collection JSON files; the top-level "id"
            of each file is the collection id.
        discovery_items_list: paths of discovery-item JSON files; the
            "collection" of the (first) entry is compared with those ids.

    Returns:
        A list with, for each collection file that declares an "id" (in
        input order), the first discovery-item path whose "collection" equals
        that id. Collections without a match contribute nothing.

    Side effects:
        Every discovery file whose (first) entry has a "bucket" other than
        "veda-data-store" is appended once to the module-level
        ``items_in_external_buckets`` list.
    """
    # Read each discovery file once up front instead of once per collection.
    discovery_entries = []
    for discovery_items_filename in discovery_items_list:
        entry = _first_discovery_entry(load_json_file(discovery_items_filename))
        if entry is None:
            # Empty list / unexpected payload: nothing to match or flag.
            continue
        if (
            "bucket" in entry
            and entry.get("bucket") != "veda-data-store"
            and discovery_items_filename not in items_in_external_buckets
        ):
            items_in_external_buckets.append(discovery_items_filename)
        discovery_entries.append((discovery_items_filename, entry.get("collection")))

    matching_file_names = []
    for collection_filename in collections_list:
        collection_id = load_json_file(collection_filename).get("id")
        if collection_id is None:
            continue
        for discovery_items_filename, item_collection_id in discovery_entries:
            if item_collection_id == collection_id:
                # First match wins; move on to the next collection.
                matching_file_names.append(discovery_items_filename)
                break
    return matching_file_names


def load_json_file(file_path):
    """Parse the JSON document stored at ``file_path`` and return it."""
    with open(file_path, "r") as file:
        return json.load(file)


# glob.glob() returns paths in an arbitrary, filesystem-dependent order.
# Sorting keeps the first-match-wins pairing below reproducible across runs
# and machines.
discovery_items_json_file_paths = sorted(
    glob.glob("../ingestion-data/production/discovery-items//*.json")
)

# Find matching file names
matching_file_names = find_matching_file_names(
    collection_file_paths, discovery_items_json_file_paths
)

# Discovery-item files to ingest, followed by the ones flagged as living in a
# bucket other than "veda-data-store".
special_items_to_process = matching_file_names
print(special_items_to_process)
print(items_in_external_buckets)

['../ingestion-data/production/discovery-items/hls-l30-002-ej-reprocessed.json', '../ingestion-data/production/discovery-items/hls-s30-002-ej-reprocessed.json', '../ingestion-data/production/discovery-items/ls8-covid-19-example-data.json', '../ingestion-data/production/discovery-items/landsat-c2l2-sr-antarctic-glaciers-pine-island.json', '../ingestion-data/production/discovery-items/landsat-c2l2-sr-lakes-aral-sea.json', '../ingestion-data/production/discovery-items/landsat-c2l2-sr-lakes-tonle-sap.json', '../ingestion-data/production/discovery-items/landsat-c2l2-sr-lakes-lake-balaton.json', '../ingestion-data/production/discovery-items/landsat-c2l2-sr-lakes-vanern.json', '../ingestion-data/production/discovery-items/landsat-c2l2-sr-antarctic-glaciers-thwaites.json', '../ingestion-data/production/discovery-items/landsat-c2l2-sr-lakes-lake-biwa.json', '../ingestion-data/production/discovery-items/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO.json', '../ingestion-data/production/disco

In [3]:
# NOTE(review): repeats the `testing_mode` assignment from the earlier cell; the
# value set here is the one the endpoint selection in the next cell reads.
testing_mode = True

Have your Cognito `username` and `password` ready to set up Cognito Client to retrieve a token that will be used to access the STAC Ingestor API.

In [4]:
# Per-environment settings. Replace every "CHANGE ME" with the Cognito values
# of the corresponding environment before running.
test_endpoint = "https://test.openveda.cloud"
test_client_id = "CHANGE ME"
test_user_pool_id = "CHANGE ME"
test_identity_pool_id = "CHANGE ME"

mcp_prod_endpoint = "https://openveda.cloud"
mcp_prod_client_id = "CHANGE ME"
mcp_prod_user_pool_id = "CHANGE ME"
mcp_prod_identity_pool_id = "CHANGE ME"

# Staging is only read from (get_item_to_ingest uses staging_endpoint); the
# staging Cognito ids are kept for reference and are not used for login.
staging_endpoint = "https://staging-stac.delta-backend.com/"
staging_client_id = "CHANGE ME"
staging_user_pool_id = "CHANGE ME"
staging_identity_pool_id = "CHANGE ME"

# Select the API endpoints AND the Cognito app client of the same environment,
# so the token is issued for the environment that will receive the requests.
# NOTE(review): assumes each environment has its own Cognito pool — confirm.
if testing_mode:
    STAC_INGESTOR_API = f"{test_endpoint}/api/ingest/"
    VEDA_STAC_API = f"{test_endpoint}/api/stac/"
    WORKFLOWS_API = "https://4hrks0hk0b.execute-api.us-west-2.amazonaws.com/"
    cognito_client_id = test_client_id
    cognito_user_pool_id = test_user_pool_id
    cognito_identity_pool_id = test_identity_pool_id
else:
    STAC_INGESTOR_API = f"{mcp_prod_endpoint}/api/ingest/"
    VEDA_STAC_API = f"{mcp_prod_endpoint}/api/stac/"
    WORKFLOWS_API = "https://bct2n8in53.execute-api.us-west-2.amazonaws.com/"
    cognito_client_id = mcp_prod_client_id
    cognito_user_pool_id = mcp_prod_user_pool_id
    cognito_identity_pool_id = mcp_prod_identity_pool_id

client = CognitoClient(
    client_id=cognito_client_id,
    user_pool_id=cognito_user_pool_id,
    identity_pool_id=cognito_identity_pool_id,
)
# login() is called for its effect on `client`; its return value is discarded.
_ = client.login()

The following cell sets up headers for requests.

In [5]:
# Headers shared by the requests below: the Cognito access token as a bearer
# credential, plus JSON content negotiation.
TOKEN = client.access_token
authorization_header = "Bearer {}".format(TOKEN)
headers = dict(
    [
        ("Authorization", authorization_header),
        ("content-type", "application/json"),
        ("accept", "application/json"),
    ]
)

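The following cell defines helpers that fetch an existing item from the staging STAC API and strip its `links` and its `rendered_preview` asset before ingestion. The cell after it defines the functions that post a collection and ingest a discovery item.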

In [15]:
def remove_links(item):
    """Empty the item's ``links`` in place, echo the item, and return it.

    Args:
        item: mutable mapping describing a STAC item.

    Returns:
        The same ``item`` object, with ``item["links"]`` set to ``[]``.
    """
    item["links"] = list()
    print("ITEM {}".format(item))
    return item


def remove_rendered_preview(item):
    """Drop the ``rendered_preview`` asset from an item, if it has one.

    The item is modified in place. Items without a ``rendered_preview`` asset,
    or without an ``assets`` mapping at all, are returned unchanged instead of
    raising ``KeyError``.

    Args:
        item: dict describing a STAC item.

    Returns:
        The same ``item`` object.
    """
    assets = item.get("assets")
    if isinstance(assets, dict):
        # pop() with a default is a no-op when the key is absent.
        assets.pop("rendered_preview", None)
    return item


# def add_null_datetime(item):
#     item["properties"]["datetime"] = None


def get_item_to_ingest(collection_id):
    """Return the ``self`` href of one item of a collection on the staging API.

    Requests ``{staging_endpoint}/collections/{collection_id}/items`` and scans
    the returned features in order for the first link whose ``rel`` is
    ``"self"`` and whose ``href`` is non-empty.

    Args:
        collection_id: id of the collection to look up.

    Returns:
        The href string, or ``None`` when the response has no features or no
        feature carries a usable ``self`` link.

    Raises:
        requests.HTTPError: if the items request returns an error status.
    """
    # staging_endpoint may end with "/"; strip it so the URL has no "//".
    url = f"{staging_endpoint.rstrip('/')}/collections/{collection_id}/items"
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    # A missing or null "features" member is treated as "no items".
    features = response.json().get("features") or []
    for feature in features:
        for link in feature.get("links", []):
            if link.get("rel") == "self" and link.get("href"):
                return link["href"]
    return None


def modify_item_before_ingest(item_href):
    """Download an item and strip the parts that must not be re-ingested.

    The JSON document at ``item_href`` is fetched, passed through
    ``remove_links`` and then ``remove_rendered_preview``, and returned. The
    raw and final documents are printed along the way.

    Args:
        item_href: URL of the item JSON.

    Returns:
        The cleaned item, or ``None`` when the request fails (the error is
        printed rather than raised).
    """
    try:
        reply = requests.get(item_href)
        reply.raise_for_status()  # surface HTTP error statuses
        fetched_item = reply.json()
        print("JSON content:", fetched_item)
        cleaned_item = remove_rendered_preview(remove_links(fetched_item))
        # json_content = add_null_datetime(json_content)
        print(f"FINAL {cleaned_item}")
        return cleaned_item
    except requests.exceptions.RequestException as request_error:
        print("Error fetching JSON content:", request_error)
        return None


# TESTING THINGS
# Ad-hoc check of the helpers above: look up one "nceo_africa_2017" item on the
# staging API, clean it, and print the result. This performs live HTTP requests
# every time the cell runs.
item_to_ingest = get_item_to_ingest("nceo_africa_2017")
finalized_item = modify_item_before_ingest(item_to_ingest)
print(finalized_item)

JSON content: {'id': 'AGB_map_2017v0m_COG', 'bbox': [-18.273529509559307, -35.054059016911935, 51.86423292864056, 37.73103856358817], 'type': 'Feature', 'links': [{'rel': 'collection', 'type': 'application/json', 'href': 'https://staging-stac.delta-backend.com/collections/nceo_africa_2017'}, {'rel': 'parent', 'type': 'application/json', 'href': 'https://staging-stac.delta-backend.com/collections/nceo_africa_2017'}, {'rel': 'root', 'type': 'application/json', 'href': 'https://staging-stac.delta-backend.com/'}, {'rel': 'self', 'type': 'application/geo+json', 'href': 'https://staging-stac.delta-backend.com/collections/nceo_africa_2017/items/AGB_map_2017v0m_COG'}, {'title': 'Map of Item', 'href': 'https://3hwvk17uek.execute-api.us-west-2.amazonaws.com/stac/map?collection=nceo_africa_2017&item=AGB_map_2017v0m_COG&assets=cog_default&rescale=0%2C400&colormap_name=gist_earth_r', 'rel': 'preview', 'type': 'text/html'}], 'assets': {'cog_default': {'href': 's3://nasa-maap-data-store/file-staging/

In [6]:
def post_collection(collection, collection_id):
    """POST a collection document to the ingestor and report the outcome.

    Nothing is raised or returned: a 201 response prints a success message
    with the collection's STAC URL, any other outcome prints an ERROR line.

    Args:
        collection: JSON-serialisable collection document.
        collection_id: id of the collection, used only in the messages.
    """
    published_at = f"{VEDA_STAC_API}collections/{collection_id}"
    endpoint = f"{STAC_INGESTOR_API}collections"

    try:
        reply = requests.post(endpoint, json=collection, headers=headers)
        reply.raise_for_status()
        # Only "201 Created" counts as success; other non-error codes are reported.
        created = reply.status_code == 201
        print(
            f"Request was successful. Find the updated collection at {published_at}"
            if created
            else f"ERROR: Updating {collection_id} failed. Request failed with status code: {reply.status_code}"
        )
    except requests.RequestException as request_error:
        print(
            f"ERROR: Updating {collection_id} failed. An error occurred during the request: {request_error}"
        )
    except Exception as unexpected_error:
        print(
            f"ERROR: An unexpected error occurred while trying to update {collection_id}: {unexpected_error}"
        )


# Paths of discovery-item files for which an ingest request failed; one entry
# is appended per failed request.
failed_ingest_items = []


def ingest_external_item(external_item, external_item_path):
    """POST one discovery item to the ingestor and record failures.

    A 201 response prints a success message. Any other status, request error,
    or unexpected exception prints an ERROR line and appends
    ``external_item_path`` to ``failed_ingest_items``. Nothing is raised.

    Args:
        external_item: JSON-serialisable discovery item to submit.
        external_item_path: path of the file the item came from.
    """

    def note_failure(message):
        # Report the problem and remember which file it came from.
        print(message)
        failed_ingest_items.append(external_item_path)

    endpoint = f"{STAC_INGESTOR_API}ingestion"
    print(endpoint)
    try:
        reply = requests.post(endpoint, json=external_item, headers=headers)
        reply.raise_for_status()
        if reply.status_code != 201:
            note_failure(
                f"ERROR: Ingesting item for {external_item} failed. Request failed with status code: {reply.status_code}"
            )
        else:
            print(f"Request was successful. {reply}")
    except requests.RequestException as request_error:
        note_failure(
            f"ERROR: Ingesting item for {external_item} failed. An error occurred during the request: {request_error}"
        )
    except Exception as unexpected_error:
        note_failure(
            f"ERROR: An unexpected error occurred while trying to ingest item for {external_item} failed: {unexpected_error}"
        )

NameError: name 'data' is not defined

The following cell publishes the collection to the target ingestion `api/collections` endpoint.

In [None]:
# Publish each collection file gathered earlier: load its JSON and hand it to
# post_collection(). A failure for one collection is printed and the loop
# continues with the next one.
for entry in file_paths_and_collection_ids:
    collection_id, file_path = entry["collectionId"], entry["filePath"]

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            collection_json = json.load(file)

        # Publish the updated collection to the target ingestion `api/collections` endpoint
        post_collection(collection_json, collection_id)

    except requests.RequestException as request_error:
        print(f"An error occurred for collectionId {collection_id}: {request_error}")
    except Exception as unexpected_error:
        print(
            f"An unexpected error occurred for collectionId {collection_id}: {unexpected_error}"
        )

The following cell ingests the collection items:

In [None]:
# Submit every matched discovery-item file. A file may hold one item or a list
# of items; both shapes are handled by treating a single item as a list of one.
for special_item_path in special_items_to_process:
    try:
        with open(special_item_path, "r", encoding="utf-8") as file:
            discovery_item_json = json.load(file)
            print(discovery_item_json)

        discovery_items = (
            discovery_item_json
            if isinstance(discovery_item_json, list)
            else [discovery_item_json]
        )
        for single_discovery_item_json in discovery_items:
            ingest_external_item(single_discovery_item_json, special_item_path)

    except requests.RequestException as request_error:
        print(f"An error occurred for item {special_item_path}: {request_error}")
    except Exception as unexpected_error:
        print(
            f"An unexpected error occurred for item {special_item_path}: {unexpected_error}"
        )

In [None]:
# Discovery-item paths recorded by ingest_external_item() for failed requests.
print(failed_ingest_items)