In [1]:
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "boto3",
#     "httpx",
#     "pandas",
#     "pystac",
#     "pystac-client",
#     "stac-pydantic",
#     "tabulate",
# ]
# ///

# Set MAAP as the 'host' Provider for STAC collections

Many of the STAC collections do not have MAAP set as the 'host' in the list of Providers. This notebook assigns MAAP as the host for collections where the assets are stored in a MAAP bucket.

In [1]:
import json
import time

import boto3
import pandas as pd
import stac_pydantic
from pystac import Collection, Provider, ProviderRole
from pystac_client import Client

# Connect to the root endpoint of the MAAP STAC API.
stac_api_url = "https://stac.maap-project.org"
client = Client.open(stac_api_url)

# Materialise the collection iterator into a list so the later cells can loop
# over the same collections more than once.
collections = [fetched for fetched in client.get_all_collections()]



Create a table that shows the existing providers for each collection

In [2]:
# Build one table row per (collection, provider) pair. Collections without any
# providers still get a single placeholder row so they stay visible in the table.
table_columns = [
    "Collection ID",
    "Provider Name",
    "Provider Roles",
    "Provider Description",
    "Provider URL",
]
data_for_df = []

for collection in collections:
    providers = collection.providers or []
    if not providers:
        data_for_df.append({
            "Collection ID": collection.id,
            "Provider Name": "N/A",
            "Provider Roles": "N/A",
            "Provider Description": "N/A",
            "Provider URL": "N/A",
        })
        continue

    for provider in providers:
        # `roles` is optional on a provider; fall back to an empty list so a
        # provider without roles does not make the join raise a TypeError.
        roles_str = ", ".join(provider.roles or [])
        data_for_df.append(
            {
                "Collection ID": collection.id,
                "Provider Name": provider.name,
                "Provider Roles": roles_str,
                "Provider Description": provider.description,
                "Provider URL": provider.url,
            }
        )


# Passing the column names explicitly keeps sort_values/set_index working even
# when no rows were collected (an empty frame would otherwise have no columns).
df = (
    pd.DataFrame(data_for_df, columns=table_columns)
    .sort_values(by=["Collection ID", "Provider Name"])
    .set_index("Collection ID")
)
print(df.to_markdown())


| Collection ID                        | Provider Name                                                             | Provider Roles                | Provider Description                                                                                                                                                                                | Provider URL                                                                           |
|:-------------------------------------|:--------------------------------------------------------------------------|:------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------|
| ABoVE_UAVSAR_PALSAR                  | N/A                                                                       | N/A        

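Identify collections where the assets do not appear to be hosted by MAAP, based on the asset hrefs of the first item in each collection (any asset whose href does not start with `s3://nasa-maap-data-store` is reported).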

In [3]:
# Map of collection id -> {asset key: href} for assets stored outside the MAAP bucket.
non_maap_assets = {}
for collection in collections:
    # Only the first item of each collection is sampled; collections without
    # any items are skipped.
    first_item = next(collection.get_all_items(), None)
    if first_item is None:
        continue

    external_hrefs = {}
    for asset_key, asset in first_item.assets.items():
        if asset.href.startswith("s3://nasa-maap-data-store"):
            continue
        external_hrefs[asset_key] = asset.href

    if external_hrefs:
        non_maap_assets[collection.id] = external_hrefs

print(json.dumps(non_maap_assets, indent=2))

{
  "ABoVE_UAVSAR_PALSAR": {
    "https://datapool.asf.alaska.edu/BROWSE/UA/AKLAVI_25702_17070_003_170622_L090_CX_01.gif": "https://datapool.asf.alaska.edu/BROWSE/UA/AKLAVI_25702_17070_003_170622_L090_CX_01.gif"
  },
  "AFRISAR_DLR": {
    "dbf": "https://bmap-catalogue-data.oss.eu-west-0.prod-cloud-ocb.orange-business.com/Campaign_data/afrisar_dlr/afrisar_dlr_roi_RAB100q.dbf",
    "prj": "https://bmap-catalogue-data.oss.eu-west-0.prod-cloud-ocb.orange-business.com/Campaign_data/afrisar_dlr/afrisar_dlr_roi_RAB100q.prj",
    "shp": "https://bmap-catalogue-data.oss.eu-west-0.prod-cloud-ocb.orange-business.com/Campaign_data/afrisar_dlr/afrisar_dlr_roi_RAB100q.shp",
    "shx": "https://bmap-catalogue-data.oss.eu-west-0.prod-cloud-ocb.orange-business.com/Campaign_data/afrisar_dlr/afrisar_dlr_roi_RAB100q.shx"
  },
  "BIOSAR1": {
    "dbf": "https://bmap-catalogue-data.oss.eu-west-0.prod-cloud-ocb.orange-business.com/Campaign_data/biosar1/biosar1_roi_lidar58.dbf",
    "prj": "https://bmap-cat

Steps:
1. Remove the 'host' role from any existing providers
2. Make sure there is a MAAP provider and assign the role 'host' to it (unless the collection is in the list of collections with an alternate host, in which case the alternate provider is added as the host instead)

In [4]:
# Map of collection id -> copy of the collection with its providers rewritten
# so that exactly one provider carries the 'host' role.
updated_collections = {}

# Collections that get a provider other than MAAP as their host.
alternate_host_providers = {
    "AFRISAR_DLR": Provider(
        name="ESA/ESRIN",
        roles=[ProviderRole.HOST],
        url="https://earth.esa.int",
    ),
    "BIOSAR1": Provider(
        name="ESA/ESRIN",
        roles=[ProviderRole.HOST],
        url="https://earth.esa.int",
    ),
    "nisar-sim": Provider(
        name="ASF",
        description="The Alaska Satellite Facility is part of the Geophysical Institute, located on the University of Alaska Fairbanks campus.",
        url="https://asf.alaska.edu/",
        roles=[ProviderRole.HOST],
    )
}

for collection in collections:
    # The designated host is the alternate provider when one is configured for
    # this collection, otherwise MAAP.
    designated_host = alternate_host_providers.get(collection.id) or Provider(
        name="MAAP",
        description="The MAAP platform is designed to combine data, algorithms, and "
        "computational abilities for the processing and sharing of data related to "
        "NASA’s GEDI, ESA’s BIOMASS, and NASA/ISRO’s NISAR missions",
        url="https://maap-project.org",
        roles=[ProviderRole.HOST],
    )

    # Work on a copy so the collections fetched from the API stay untouched.
    new_collection = Collection.from_dict(collection.to_dict())

    # Roles the host entry will end up with: 'host' plus whatever roles an
    # existing entry of the same name already had. Membership is checked with
    # `in` on a list (equality based) so the order stays deterministic.
    host_roles = list(designated_host.roles or [])
    new_providers = []
    host_index = None  # position of the first existing entry for the host, if any

    for provider in new_collection.providers or []:
        existing_roles = list(provider.roles or [])

        if provider.name == designated_host.name:
            # Fold the existing entry into the canonical host entry instead of
            # keeping both; the merged entry is inserted after the loop.
            for role in existing_roles:
                if role not in host_roles:
                    host_roles.append(role)
            if host_index is None:
                host_index = len(new_providers)
            continue

        # Every other provider loses the 'host' role and is dropped entirely
        # when that leaves it without any role.
        remaining_roles = [role for role in existing_roles if role != ProviderRole.HOST]
        if remaining_roles:
            provider.roles = remaining_roles
            new_providers.append(provider)

    # Build a fresh Provider so the shared objects in alternate_host_providers
    # are never mutated.
    host_provider = Provider(
        name=designated_host.name,
        description=designated_host.description,
        url=designated_host.url,
        roles=host_roles,
    )
    if host_index is None:
        new_providers.append(host_provider)
    else:
        # Keep the host at the position its previous entry occupied.
        new_providers.insert(host_index, host_provider)

    new_collection.providers = new_providers
    updated_collections[new_collection.id] = new_collection



In [5]:
# Show the provider names and roles that each updated collection ended up with.
for collection_id, updated in updated_collections.items():
    print("collection:", collection_id)
    for entry in updated.providers:
        print("provider:", entry.name, ", roles:", entry.roles)
    # print() adds its own newline, so this leaves two blank lines between collections.
    print("\n")

collection: ABoVE_UAVSAR_PALSAR
provider: MAAP , roles: ['host']


collection: AFRISAR_DLR
provider: ESA/ESRIN , roles: ['host']


collection: AFRISAR_DLR2
provider: MAAP , roles: ['host']


collection: AfriSAR_UAVSAR_Coreg_SLC
provider: MAAP , roles: ['host']


collection: AfriSAR_UAVSAR_Geocoded_Covariance
provider: MAAP , roles: ['host']


collection: AfriSAR_UAVSAR_Geocoded_SLC
provider: MAAP , roles: ['host']


collection: AfriSAR_UAVSAR_KZ
provider: MAAP , roles: ['host']


collection: AfriSAR_UAVSAR_Normalization_Area
provider: MAAP , roles: ['host']


collection: AfriSAR_UAVSAR_Ungeocoded_Covariance
provider: MAAP , roles: ['host']


collection: BIOSAR1
provider: ESA/ESRIN , roles: ['host']


collection: ESACCI_Biomass_L4_AGB_V4_100m
provider: MAAP , roles: ['host']


collection: GEDI_CalVal_Field_Data
provider: MAAP , roles: ['host']


collection: GEDI_CalVal_Lidar_COPC
provider: MAAP , roles: ['host']


collection: GEDI_CalVal_Lidar_Data
provider: MAAP , roles: ['host']


col

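There are some invalid values in these collection records that cause stac-pydantic to reject them (pystac lets some of these through). The next cell swaps the y values of any bbox whose ymax is smaller than its ymin and, for collections with more than one bbox, inserts an overall bbox at the front of the list.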

In [6]:
# fix collection errors
# NOTE(review): assumes every bbox is 2D, i.e. [xmin, ymin, xmax, ymax] - confirm
# that no collection uses a 6-element (3D) bbox.
for collection_id, collection in updated_collections.items():
    new_bboxes = []
    for i, bbox in enumerate(collection.extent.spatial.bboxes):
        if bbox[3] < bbox[1]:
            # ymax is smaller than ymin: swap the two y values.
            print(collection_id, f" bbox {i}: fixing invalid bbox")
            bbox = [bbox[0], bbox[3], bbox[2], bbox[1]]

        new_bboxes.append(bbox)

    if len(new_bboxes) > 1:
        # Union of all bboxes. Computed with plain min/max over the values so
        # that a coordinate of 0 is handled like any other number.
        global_bbox = [
            min(bbox[0] for bbox in new_bboxes),
            min(bbox[1] for bbox in new_bboxes),
            max(bbox[2] for bbox in new_bboxes),
            max(bbox[3] for bbox in new_bboxes),
        ]

        # Only prepend the overall bbox when the list does not already start
        # with it; this keeps the cell idempotent when it is re-run.
        if list(new_bboxes[0]) != global_bbox:
            print(collection_id, ": adding global bbox to bboxes")
            new_bboxes.insert(0, global_bbox)

    collection.extent.spatial.bboxes = new_bboxes

ABoVE_UAVSAR_PALSAR  bbox 0: fixing invalid bbox
AFRISAR_DLR2  bbox 0: fixing invalid bbox
AfriSAR_UAVSAR_Geocoded_Covariance  bbox 0: fixing invalid bbox
AfriSAR_UAVSAR_Geocoded_SLC  bbox 0: fixing invalid bbox
AfriSAR_UAVSAR_KZ  bbox 0: fixing invalid bbox
AfriSAR_UAVSAR_Normalization_Area  bbox 0: fixing invalid bbox
AfriSAR_UAVSAR_Ungeocoded_Covariance  bbox 0: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 0: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 1: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 2: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 3: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 4: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 5: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 6: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 7: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 8: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 9: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 10: fixing invalid bbox
GEDI_CalVal_Field_Data  bbox 11: fixing

In [7]:
# Run both validators on every updated collection and print any failure.
# Each validator has its own try block so that a pystac failure does not hide
# a stac-pydantic failure for the same collection (and vice versa).
for collection_id, collection in updated_collections.items():
    try:
        # validate with pystac
        collection.validate()
    except Exception as e:
        print(collection_id, "[pystac]", e)

    try:
        # validate with stac_pydantic
        stac_pydantic.Collection(**collection.to_dict())
    except Exception as e:
        print(collection_id, "[stac-pydantic]", e)

Post the collections to the STAC loader SNS topic

In [8]:
stac_loader_topic_arn = "arn:aws:sns:us-west-2:916098889494:MAAP-STAC-test-pgSTAC-stacitemloaderTopicD9D06088-LutBraKgk6sT"

# An ARN has the form arn:partition:service:region:account:resource. Create the
# client in the topic's own region so publishing does not depend on whatever
# default region the local AWS configuration happens to use.
topic_region = stac_loader_topic_arn.split(":")[3]
sns_client = boto3.client("sns", region_name=topic_region)

collection_list = list(updated_collections.values())

# Number of messages sent per publish_batch request
# (SNS accepts at most 10 entries per batch request).
batch_size = 10

print(f"Total message payloads to send: {len(collection_list)}")
print(f"Messages per batch: {batch_size}")

# --- Loop to process message payloads in batches ---
for i in range(0, len(collection_list), batch_size):
    batch_collections = collection_list[i:i + batch_size]

    # One entry per collection; the Id only has to be unique within the batch
    # and is used to match entries in the response.
    batch_entries = []
    for j, collection in enumerate(batch_collections):
        unique_batch_id = f"msg-{i + j:04d}"

        json_message_string = json.dumps(collection.to_dict())

        entry = {
            "Id": unique_batch_id,
            "Message": json_message_string,
        }
        batch_entries.append(entry)

    batch_number = i // batch_size + 1
    last_index = i + len(batch_collections) - 1
    print(f"\n--- Processing batch {batch_number} (messages {i} to {last_index}) ---")

    try:
        # Publish the batch to SNS
        response = sns_client.publish_batch(
            TopicArn=stac_loader_topic_arn,
            PublishBatchRequestEntries=batch_entries
        )

        # Check the response for successful and failed messages
        successful = response.get("Successful") or []
        failed = response.get("Failed") or []

        if successful:
            print(f"  Successfully published {len(successful)} messages in this batch.")
            for success in successful:
                print(f"    - ID: {success['Id']}, SNS MessageId: {success['MessageId']}")
        if failed:
            print(f"  Failed to publish {len(failed)} messages in this batch:")
            for failure in failed:
                print(f"    - ID: {failure['Id']}, Code: {failure.get('Code', 'N/A')}, Message: {failure.get('Message', 'N/A')}")

    except Exception as e:
        # Best effort: report the failed batch and carry on with the next one.
        print(f"  An error occurred while publishing this batch: {e}")

    # Small delay to avoid hitting API rate limits
    time.sleep(0.1)

print("\n--- All batches processed ---")

Total message payloads to send: 32
Messages per batch: 10

--- Processing batch 1 (messages 0 to 9) ---
  Successfully published 10 messages in this batch.
    - ID: msg-0000, SNS MessageId: db9cb1cc-a149-5e08-95b3-d1295ff63e08
    - ID: msg-0001, SNS MessageId: f1fd587b-5c21-5fa2-ac95-389ea29e6fa2
    - ID: msg-0002, SNS MessageId: fd0ab9ab-8df6-501a-b8f2-d94e7349201a
    - ID: msg-0003, SNS MessageId: 016b9f80-273c-5053-8340-ff65d9836053
    - ID: msg-0004, SNS MessageId: 15cd9950-5401-5f38-87f3-f9b5aabebf38
    - ID: msg-0005, SNS MessageId: 2f365fc9-05e3-5af0-8d13-3f2cfb5cdaf0
    - ID: msg-0006, SNS MessageId: b94e8878-f2f9-5150-ba9a-e89d0c465150
    - ID: msg-0007, SNS MessageId: 9952e227-aba4-5391-aed4-82c2551bc391
    - ID: msg-0008, SNS MessageId: 4f081c67-4e79-5142-82dc-7c82b0c6e142
    - ID: msg-0009, SNS MessageId: c95fd592-8aa3-5e7f-ab19-b577741cae7f

--- Processing batch 2 (messages 10 to 19) ---
  Successfully published 10 messages in this batch.
    - ID: msg-0010, SNS 