# Notebook to Reconcile Collection Metadata

This notebook reconciles the collection definitions in `/ingestion-data/collections`. For each collection it retrieves the `summaries` (and any external `links`) from the STAC API, merges them into the existing collection file in `veda-data`, and posts the updated collection to the ingestion API.

In [162]:
import glob
import json

import requests
from cognito_client import CognitoClient

Retrieve the collection files from `/ingestion-data/collections/` and save each file path together with its collection ID to a list

In [163]:
# Discover every collection definition that lives in the ingestion-data folder.
json_file_paths = glob.glob("../ingestion-data/collections/*.json")

# Pair each file path with the collection id stored inside the file.
# Files whose JSON has no "id" key are skipped.
file_paths_and_collection_ids = []
for file_path in json_file_paths:
    # A context manager closes each handle right away; the previous
    # `json.load(open(...))` form left one open file per collection.
    with open(file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    if "id" in data:
        file_paths_and_collection_ids.append(
            {"filePath": file_path, "collectionId": data["id"]}
        )
print(file_paths_and_collection_ids)

[{'filePath': '../ingestion-data/collections/campfire-lst-night-diff.json', 'collectionId': 'campfire-lst-night-diff'}, {'filePath': '../ingestion-data/collections/OMI_trno2-COG.json', 'collectionId': 'OMI_trno2-COG'}, {'filePath': '../ingestion-data/collections/lis-global-da-tws.json', 'collectionId': 'lis-global-da-tws'}, {'filePath': '../ingestion-data/collections/EPA-annual-emissions_1B2b_Natural_Gas_Processing.json', 'collectionId': 'EPA-annual-emissions_1B2b_Natural_Gas_Processing'}, {'filePath': '../ingestion-data/collections/grdi-shdi-raster.json', 'collectionId': 'grdi-shdi-raster'}, {'filePath': '../ingestion-data/collections/conus-reach.json', 'collectionId': 'conus-reach'}, {'filePath': '../ingestion-data/collections/EPA-annual-emissions_1B1a_Coal_Mining_Underground.json', 'collectionId': 'EPA-annual-emissions_1B1a_Coal_Mining_Underground'}, {'filePath': '../ingestion-data/collections/ndvi_diff_Ian_2022-09-30_2022-09-05.json', 'collectionId': 'ndvi_diff_Ian_2022-09-30_2022-

Set up Cognito Client to retrieve a token that will be used to access the STAC Ingestor API

In [199]:
# Dev environment: STAC API base URL and Cognito identifiers.
# The "CHANGE ME" values are placeholders and must be replaced before running.
dev_endpoint = "https://dev.delta-backend.com/"
dev_client_id = "CHANGE ME"
dev_user_pool_id = "CHANGE ME"
dev_identity_pool_id = "CHANGE ME"

# Staging environment: STAC API base URL and Cognito identifiers (placeholders).
staging_endpoint = "https://staging-stac.delta-backend.com/"
staging_client_id = "CHANGE ME"
staging_user_pool_id = "CHANGE ME"
staging_identity_pool_id = "CHANGE ME"

# STAC API that the reconciliation loop below reads collections from.
VEDA_STAC_API = staging_endpoint
# The client is built from the staging_* identifiers, so switching
# VEDA_STAC_API to dev_endpoint also requires switching these arguments.
# NOTE(review): the recorded output of this cell shows CognitoClient rejecting
# an empty `username` — presumably it is supplied outside this cell; confirm.
client = CognitoClient(
    client_id=staging_client_id,
    user_pool_id=staging_user_pool_id,
    identity_pool_id=staging_identity_pool_id,
)

# Log in; the return value is not needed here (the token is read later from
# `client.access_token`).
_ = client.login()

ValidationError: 1 validation error for CognitoClient
username
  ensure this value has at least 1 characters (type=value_error.any_str.min_length; limit_value=1)

In [None]:
# NOTE(review): this cell repeats the collection-discovery cell near the top of
# the notebook; it is kept so the list can be refreshed after logging in.
json_file_paths = glob.glob("../ingestion-data/collections/*.json")

# Pair each file path with the collection id stored inside the file.
# Files whose JSON has no "id" key are skipped.
file_paths_and_collection_ids = []
for file_path in json_file_paths:
    # A context manager closes each handle right away; the previous
    # `json.load(open(...))` form left one open file per collection.
    with open(file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    if "id" in data:
        file_paths_and_collection_ids.append(
            {"filePath": file_path, "collectionId": data["id"]}
        )
print(file_paths_and_collection_ids)

[{'filePath': '../ingestion-data/collections/campfire-lst-night-diff.json', 'collectionId': 'campfire-lst-night-diff'}, {'filePath': '../ingestion-data/collections/OMI_trno2-COG.json', 'collectionId': 'OMI_trno2-COG'}, {'filePath': '../ingestion-data/collections/lis-global-da-tws.json', 'collectionId': 'lis-global-da-tws'}, {'filePath': '../ingestion-data/collections/EPA-annual-emissions_1B2b_Natural_Gas_Processing.json', 'collectionId': 'EPA-annual-emissions_1B2b_Natural_Gas_Processing'}, {'filePath': '../ingestion-data/collections/grdi-shdi-raster.json', 'collectionId': 'grdi-shdi-raster'}, {'filePath': '../ingestion-data/collections/conus-reach.json', 'collectionId': 'conus-reach'}, {'filePath': '../ingestion-data/collections/EPA-annual-emissions_1B1a_Coal_Mining_Underground.json', 'collectionId': 'EPA-annual-emissions_1B1a_Coal_Mining_Underground'}, {'filePath': '../ingestion-data/collections/ndvi_diff_Ian_2022-09-30_2022-09-05.json', 'collectionId': 'ndvi_diff_Ian_2022-09-30_2022-

In [192]:
# Read the access token from the Cognito client and format it as a bearer
# credential.
TOKEN = client.access_token
authorization_header = f"Bearer {TOKEN}"

# Headers passed to the HTTP requests below: the bearer credential first,
# followed by the JSON content-negotiation headers.
headers = {"Authorization": authorization_header}
headers.update(
    {
        "content-type": "application/json",
        "accept": "application/json",
    }
)

Get the `summaries` (and any external `links`) for each existing collection from the STAC API and merge them back into the collection file

In [198]:
def merge_summaries(existing_summaries, retrieved_summaries):
    """Combine local and API summaries, giving precedence to the local values.

    Args:
        existing_summaries: Mapping of summaries already present in the
            collection file. May be ``None`` or empty (for example when the
            file stores ``"summaries": null``).
        retrieved_summaries: Mapping of summaries returned by the API. May be
            ``None`` or empty.

    Returns:
        A new dict holding every existing entry plus each retrieved entry whose
        key is not already present. Neither input is modified; the copy is
        shallow, so nested values are shared with the inputs.
    """
    # `or {}` treats a missing/None value the same as "no summaries" instead of
    # failing on `None.copy()`.
    merged_summaries_dict = dict(existing_summaries or {})

    for key, value in (retrieved_summaries or {}).items():
        # setdefault only fills gaps: an existing key is never overwritten.
        merged_summaries_dict.setdefault(key, value)

    return merged_summaries_dict


def retain_external_links(existing_links, retrieved_links):
    """Append API links with ``rel == "external"`` that the file does not have.

    Args:
        existing_links: Links already present in the collection file. May be
            ``None`` or an empty container when the file has no links.
        retrieved_links: Links returned by the API. May be ``None`` or empty.

    Returns:
        A new list made of the existing links followed by every retrieved link
        whose ``rel`` is ``"external"`` and whose ``href`` is not already used
        by an existing link. The inputs are not modified.
    """
    # Normalise to lists. Callers may pass `{}` or None for "no links", and
    # `{} + [...]` would otherwise raise TypeError.
    current_links = list(existing_links or [])
    candidate_links = retrieved_links or []

    unique_hrefs = {link.get("href") for link in current_links}
    additional_external_links = [
        link
        for link in candidate_links
        if link.get("rel") == "external" and link.get("href") not in unique_hrefs
    ]

    return current_links + additional_external_links


# The two environments expose collections under different paths. Resolve the
# base URL once, and fail with a clear message for an unknown endpoint instead
# of hitting an unbound (or stale) `url` inside the loop.
if VEDA_STAC_API == dev_endpoint:
    collections_base_url = f"{VEDA_STAC_API}api/stac/collections"
elif VEDA_STAC_API == staging_endpoint:
    collections_base_url = f"{VEDA_STAC_API}collections"
else:
    raise ValueError(f"Unrecognised VEDA_STAC_API endpoint: {VEDA_STAC_API!r}")

# For every local collection file: fetch the published collection, merge its
# summaries and external links into the local definition, and write it back.
for item in file_paths_and_collection_ids:
    collection_id = item["collectionId"]
    file_path = item["filePath"]
    url = f"{collections_base_url}/{collection_id}"

    try:
        # A timeout keeps one unresponsive request from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        json_response = response.json()

        # `or` also covers keys that are present but null.
        retrieved_summaries = json_response.get("summaries") or {}
        retrieved_links = json_response.get("links") or []

        with open(file_path, "r", encoding="utf-8") as file:
            collection = json.load(file)

        existing_summaries = collection.get("summaries") or {}
        # Links are a list; defaulting to a dict made list concatenation fail
        # for files without a "links" key.
        existing_links = collection.get("links") or []

        collection["summaries"] = merge_summaries(
            existing_summaries, retrieved_summaries
        )
        collection["links"] = retain_external_links(existing_links, retrieved_links)

        # Write with the same encoding used for reading; ensure_ascii=False
        # emits non-ASCII characters verbatim, so the encoding must be explicit.
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(collection, file, ensure_ascii=False, indent=2)

    except requests.RequestException as e:
        # HTTP/network problems (e.g. a 404 for a collection that is not
        # published): report and continue with the next one.
        print(f"An error occurred for collectionId {collection_id}: {e}")
    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the whole run.
        print(f"An unexpected error occurred for collectionId {collection_id}: {e}")

An error occurred for collectionId ndvi_diff_Ian_2022-09-30_2022-09-05: 404 Client Error: Not Found for url: https://staging-stac.delta-backend.com/collections/ndvi_diff_Ian_2022-09-30_2022-09-05
An error occurred for collectionId entropy_difference_2022-09-05_2022-09-30: 404 Client Error: Not Found for url: https://staging-stac.delta-backend.com/collections/entropy_difference_2022-09-05_2022-09-30


Publish the updated collections to the target ingestion API's `api/ingest/collections` endpoint

In [None]:
# Base URLs of the ingestion API for each environment.
ingestor_staging_url = "https://ig9v64uky8.execute-api.us-west-2.amazonaws.com/staging/"
ingestor_dev_url = "https://dev.delta-backend.com/"
# Ingestion API that the publish cell below posts the collections to.
# NOTE(review): this selects dev, while VEDA_STAC_API and the Cognito client
# earlier in the notebook are configured for staging — confirm this mismatch
# is intentional before publishing.
STAC_INGESTOR_API = ingestor_dev_url

In [None]:
# Post every reconciled collection file to the ingestion API.
for item in file_paths_and_collection_ids:
    # Read the payload and close the file before making the network call.
    with open(item["filePath"], "r", encoding="utf-8") as json_file:
        data_to_post = json.load(json_file)

    # Location reported to the user after a successful publish.
    collection_url = f"{STAC_INGESTOR_API}api/stac/collections/{item['collectionId']}"
    ingest_url = f"{STAC_INGESTOR_API}api/ingest/collections"

    try:
        # A timeout keeps one unresponsive request from hanging the notebook.
        response = requests.post(
            ingest_url, json=data_to_post, headers=headers, timeout=30
        )
        response.raise_for_status()
        if response.status_code == 201:
            # Report this collection's URL. The previous code printed `url`, a
            # leftover variable from the reconciliation cell.
            print(
                f"Request was successful. Find the updated collection at {collection_url}"
            )
        else:
            # raise_for_status() already handled 4xx/5xx, so this branch is a
            # non-error status other than 201, not a failure.
            print(
                f"Request returned unexpected status code: {response.status_code}"
            )
    except requests.RequestException as e:
        print(f"An error occurred during the request: {e}")
    except Exception as e:
        # Deliberate best-effort: keep publishing the remaining collections.
        print(f"An unexpected error occurred: {e}")