# Notebook to Publish Collections and Start Discovery Workflow

This notebook publishes the collections in `/ingestion-data/collections` excluding:
- 'hls-l30-002-ej-reprocessed'
- 'hls-s30-002-ej-reprocessed'
- 'ls8-covid-19-example-data'
- 'landsat-c2l2-sr-antarctic-glaciers-pine-island'
- 'landsat-c2l2-sr-lakes-aral-sea'
- 'landsat-c2l2-sr-lakes-tonle-sap'
- 'landsat-c2l2-sr-lakes-lake-balaton'
- 'landsat-c2l2-sr-lakes-vanern'
- 'landsat-c2l2-sr-antarctic-glaciers-thwaites'
- 'landsat-c2l2-sr-lakes-lake-biwa'
- 'combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO'

In [None]:
import glob
import os
import json
import requests

The following cell retrieves the collection JSON files from `/ingestion-data/collections/` and saves each file path together with its collection ID to a list.

In [None]:
# Collection IDs that must not be published by this notebook.
excluded_collections = [
    "hls-l30-002-ej-reprocessed",
    "hls-s30-002-ej-reprocessed",
    "ls8-covid-19-example-data",
    "landsat-c2l2-sr-antarctic-glaciers-pine-island",
    "landsat-c2l2-sr-lakes-aral-sea",
    "landsat-c2l2-sr-lakes-tonle-sap",
    "landsat-c2l2-sr-lakes-lake-balaton",
    "landsat-c2l2-sr-lakes-vanern",
    "landsat-c2l2-sr-antarctic-glaciers-thwaites",
    "landsat-c2l2-sr-lakes-lake-biwa",
    "combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO",
]

collection_json_file_paths = glob.glob("../ingestion-data/collections/*.json")

# Keep only the paths that contain none of the excluded collection IDs
# (substring match against the whole path).
filtered_collection_file_paths_list = [
    path
    for path in collection_json_file_paths
    if not any(excluded_id in path for excluded_id in excluded_collections)
]
print(filtered_collection_file_paths_list)

# Pair each remaining file with the collection ID stored under its "id" key;
# files without an "id" are skipped.
file_paths_and_collection_ids = []
for file_path in filtered_collection_file_paths_list:
    # The context manager closes the file as soon as it has been parsed.
    with open(file_path, "r", encoding="utf-8") as collection_file:
        data = json.load(collection_file)
    if "id" in data:
        file_paths_and_collection_ids.append(
            {"filePath": file_path, "collectionId": data["id"]}
        )

Set the testing mode to `True` when testing and `False` otherwise. When the testing mode is `True`, the notebook runs against the `test` endpoints; otherwise it runs against the production endpoints.

In [None]:
# True selects the test endpoints; False selects the production (MCP) endpoints.
testing_mode = True

The following cell compares the collection files in `/ingestion-data/collections` with the discovery-item files in `/ingestion-data/production/discovery-items` (or `/ingestion-data/staging/discovery-items`) and returns a list of the discovery items that have a corresponding collection.

In [None]:
def _discovery_item_collection(item_json):
    """Return the "collection" value of a parsed discovery-items document.

    The document is either a single object or a list of objects; for a list
    only the first entry is consulted. An empty list yields ``None``.
    """
    if isinstance(item_json, list):
        return item_json[0].get("collection") if item_json else None
    return item_json.get("collection")


def find_matching_file_names(collections_list, discovery_items_list):
    """Return the discovery-item files that belong to the given collections.

    Args:
        collections_list: Paths to collection JSON files. Each file is read
            and its top-level "id" is used as the collection ID; files
            without an "id" are skipped.
        discovery_items_list: Paths to discovery-item JSON files, each
            holding one object or a list of objects.

    Returns:
        A list holding, for every collection that has a match, the first
        discovery-item path (in the order given) whose "collection" equals
        the collection ID.
    """
    # Discovery-items path -> collection it references. Filled lazily so each
    # file is parsed at most once, and only if the search actually reaches it.
    collection_by_path = {}

    matching_file_names = []
    for collection_filename in collections_list:
        collection_id = load_json_file(collection_filename).get("id")
        if collection_id is None:
            continue

        for discovery_items_filename in discovery_items_list:
            if discovery_items_filename not in collection_by_path:
                collection_by_path[discovery_items_filename] = (
                    _discovery_item_collection(
                        load_json_file(discovery_items_filename)
                    )
                )
            # A file with no collection (e.g. an empty list) maps to None and
            # can never equal a non-None collection ID.
            if collection_by_path[discovery_items_filename] == collection_id:
                matching_file_names.append(discovery_items_filename)
                break
    return matching_file_names


def load_json_file(file_path):
    """Parse the JSON file at ``file_path`` and return the resulting object."""
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)


# Discovery-item definitions to compare against the filtered collections.
discovery_items_json_file_paths = glob.glob(
    "../ingestion-data/production/discovery-items/*.json"
)

# Keep only the discovery-item files that reference one of the collections.
matching_file_names = find_matching_file_names(
    filtered_collection_file_paths_list, discovery_items_json_file_paths
)

discovery_items_to_process = matching_file_names
print(discovery_items_to_process)

Have your Cognito `username` and `password` ready to set up Cognito Client to retrieve a token that will be used to access the STAC Ingestor API.

In [None]:
# Settings for the test environment; replace every "CHANGE ME" placeholder.
test_endpoint = "https://test.openveda.cloud"
test_client_id = "CHANGE ME"
test_user_pool_id = "CHANGE ME"
test_identity_pool_id = "CHANGE ME"

# Settings for the MCP production environment.
mcp_prod_endpoint = "https://openveda.cloud"
mcp_prod_client_id = "CHANGE ME"
mcp_prod_user_pool_id = "CHANGE ME"
mcp_prod_identity_pool_id = "CHANGE ME"

print(f"TESTING MODE? {testing_mode}")

# Choose the base URL and workflows API for the selected environment, then
# derive the ingestor and STAC API roots from the base URL.
if testing_mode:
    _base_url = test_endpoint
    WORKFLOWS_API = "https://4hrks0hk0b.execute-api.us-west-2.amazonaws.com/"
else:
    _base_url = mcp_prod_endpoint
    WORKFLOWS_API = "https://bct2n8in53.execute-api.us-west-2.amazonaws.com/"

STAC_INGESTOR_API = f"{_base_url}/api/ingest/"
VEDA_STAC_API = f"{_base_url}/api/stac/"

The following cell sets up headers for requests.

In [None]:
# Bearer token sent with every request; replace the placeholder before running.
TOKEN = "REPLACE ME"

authorization_header = f"Bearer {TOKEN}"

# Requests carry the bearer token and both send and accept JSON.
headers = {
    "Authorization": authorization_header,
    **dict.fromkeys(("content-type", "accept"), "application/json"),
}

In [None]:
# Paths of discovery-item files for which a discovery request failed;
# ingest_discovery_item appends to this list.
failed_discovery_items = []

The following cell defines the functions that post a collection and kick off the discovery workflow for a discovery item.

In [None]:
def post_collection(collection, collection_id):
    """Publish one collection document through the STAC ingestor API.

    POSTs ``collection`` as JSON to ``{STAC_INGESTOR_API}collections`` with the
    module-level ``headers`` and prints the outcome. Nothing is returned;
    errors raised while sending the request are caught and printed.

    Args:
        collection: Collection document sent as the JSON request body.
        collection_id: Collection ID, used in the printed messages and in the
            STAC API URL reported on success.
    """
    published_url = f"{VEDA_STAC_API}collections/{collection_id}"
    endpoint = f"{STAC_INGESTOR_API}collections"

    try:
        reply = requests.post(endpoint, json=collection, headers=headers)
        reply.raise_for_status()
        # Only 201 is reported as success; any other status that did not
        # raise above is still reported as a failure.
        if reply.status_code != 201:
            outcome = f"ERROR: Updating {collection_id} failed. Request failed with status code: {reply.status_code}"
        else:
            outcome = f"Request was successful. Find the updated collection at {published_url}"
        print(outcome)
    except requests.RequestException as err:
        print(
            f"ERROR: Updating {collection_id} failed. An error occurred during the request: {err}"
        )
    except Exception as err:
        print(
            f"ERROR: An unexpected error occurred while trying to update {collection_id}: {err}"
        )


def ingest_discovery_item(discovery_item, discovery_item_path):
    """Kick off the discovery workflow for one discovery item.

    POSTs ``discovery_item`` as JSON to ``{WORKFLOWS_API}discovery`` with the
    module-level ``headers`` and prints the outcome. On any failure the
    item's file path is appended to the module-level
    ``failed_discovery_items`` list. Nothing is returned.

    Args:
        discovery_item: Discovery item sent as the JSON request body.
        discovery_item_path: Path of the file the item was read from; this is
            what gets recorded when the request fails.
    """
    discovery_url = f"{WORKFLOWS_API}discovery"
    print(discovery_url)

    # Stays None on success; otherwise holds the message describing the failure.
    failure = None
    try:
        response = requests.post(discovery_url, json=discovery_item, headers=headers)
        response.raise_for_status()
        # Only 201 is treated as success; any other status that did not raise
        # above is still recorded as a failure.
        if response.status_code != 201:
            failure = f"ERROR: Kicking off discovery for {discovery_item} failed. Request failed with status code: {response.status_code}"
    except requests.RequestException as e:
        failure = f"ERROR: Kicking off discovery for {discovery_item} failed. An error occurred during the request: {e}"
    except Exception as e:
        failure = f"ERROR: An unexpected error occurred while trying to kick off discovery for {discovery_item}: {e}"

    if failure is None:
        print(f"Request was successful. {response}")
        return

    # Single place where failures are reported and recorded.
    print(failure)
    failed_discovery_items.append(discovery_item_path)

If super_testing_mode is enabled, use a test list against a single collection:

In [None]:
# When True, the next cell narrows the run to the first collection and the
# discovery-items file named after its ID.
super_testing_mode = False

In [None]:
# Single-collection test inputs: the first collection and the discovery-items
# file named after its ID. Slicing instead of indexing keeps this cell from
# raising IndexError when no collections were found.
test_file_paths_and_collection_ids = file_paths_and_collection_ids[:1]
# Single quotes inside the replacement field keep this f-string valid on
# Python versions older than 3.12.
test_discovery_item = [
    f"../ingestion-data/production/discovery-items/{entry.get('collectionId')}.json"
    for entry in test_file_paths_and_collection_ids
]

print(test_discovery_item)
print(test_file_paths_and_collection_ids)
print(VEDA_STAC_API)

# Swap in the single-collection inputs only when super testing mode is on;
# otherwise both lists are left as they were.
if super_testing_mode:
    file_paths_and_collection_ids = test_file_paths_and_collection_ids
    discovery_items_to_process = test_discovery_item

print(file_paths_and_collection_ids)
print(discovery_items_to_process)

The following cell publishes the collection to the target ingestion `api/collections` endpoint.

In [None]:
# Read each selected collection file and publish its contents.
for entry in file_paths_and_collection_ids:
    collection_id, file_path = entry["collectionId"], entry["filePath"]

    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            collection = json.load(handle)

        # Publish the updated collection to the target ingestion `api/collections` endpoint
        post_collection(collection, collection_id)

    except requests.RequestException as err:
        print(f"An error occurred for collectionId {collection_id}: {err}")
    except Exception as err:
        # Errors from reading or parsing the file are reported here, and the
        # loop moves on to the next collection.
        print(f"An unexpected error occurred for collectionId {collection_id}: {err}")

The following cell kicks off a `/discovery` workflow for each of the discovery items.

In [None]:
# Read each discovery-items file and submit every item it contains.
for discovery_item_path in discovery_items_to_process:
    try:
        with open(discovery_item_path, "r", encoding="utf-8") as handle:
            discovery_item_json = json.load(handle)
        print(discovery_item_json)

        # A file holds either one discovery item or a list of them; wrap the
        # single-item case so both are submitted by the same loop.
        pending_items = (
            discovery_item_json
            if isinstance(discovery_item_json, list)
            else [discovery_item_json]
        )
        for single_discovery_item_json in pending_items:
            ingest_discovery_item(single_discovery_item_json, discovery_item_path)

    except requests.RequestException as err:
        print(f"An error occurred for discovery item {discovery_item_path}: {err}")
    except Exception as err:
        print(
            f"An unexpected error occurred for discovery item {discovery_item_path}: {err}"
        )

In [None]:
# Show the discovery-item file paths that were recorded as failed.
print(failed_discovery_items)