# Upload GLAD GLCLU2020 STAC metadata
This notebook requires AWS credentials for the SMCE MAAP account. Generate a temporary set of keys and paste them into the `boto3.Session(...)` call in the setup cell below before proceeding. Do not commit the notebook with the keys filled in.

In [1]:
%pip install obstore stactools-glad-glclu2020 -q

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import asyncio
import json
from concurrent.futures import ThreadPoolExecutor

import boto3
import httpx
import obstore as obs
import tqdm.notebook
from obstore.store import S3Store
from stactools.glad_glclu2020 import create_collection, create_item
from stactools.glad_glclu2020.metadata import CollectionIDs

# Base URL of the STAC ingestor API
INGESTOR_URL = "https://stac-ingestor.maap-project.org"

# Secrets Manager entry that holds the MAAP STAC auth settings
STAC_AUTH_SECRET_ARN = "arn:aws:secretsmanager:us-west-2:916098889494:secret:MAAP-STAC-auth-dev/MAAP-workflows-EsykqB"

# Paste the temporary MAAP SMCE AWS credentials here.
# Never commit this notebook with real values filled in.
session = boto3.Session(
    region_name="us-west-2",
    aws_access_key_id="",
    aws_secret_access_key="",
    aws_session_token="",
)

# Read the secret and decode its JSON payload; later cells read the
# "client_id", "client_secret", "cognito_domain" and "scope" keys from it.
client = session.client("secretsmanager", region_name="us-west-2")
response = client.get_secret_value(SecretId=STAC_AUTH_SECRET_ARN)
settings = json.loads(response["SecretString"])

# Obtain a bearer token for the STAC ingestor
def get_token(
    client_id: str,
    client_secret: str,
    domain: str,
    scope: str,
) -> str:
    """Request an OAuth2 access token with the client-credentials grant.

    Args:
        client_id: Client ID, sent as the HTTP basic-auth username.
        client_secret: Client secret, sent as the HTTP basic-auth password.
        domain: Base URL of the auth server; ``/oauth2/token`` is appended to it.
        scope: Scope to request for the token.

    Returns:
        The value of the ``access_token`` field in the JSON response.

    Raises:
        httpx.HTTPStatusError: If the token endpoint answers with a 4xx/5xx status.
        KeyError: If the JSON response has no ``access_token`` field.
    """
    response = httpx.post(
        f"{domain}/oauth2/token",
        headers={
            "Content-Type": "application/x-www-form-urlencoded",
        },
        auth=(client_id, client_secret),
        data={
            "grant_type": "client_credentials",
            "scope": scope,
        },
    )
    # Let HTTP errors propagate directly; wrapping this call in a try/except
    # that only re-raises adds nothing.
    response.raise_for_status()

    return response.json()["access_token"]


# Exchange the client credentials stored in the secret for a bearer token.
token = get_token(
    client_id=settings["client_id"],
    client_secret=settings["client_secret"],
    domain=settings["cognito_domain"],
    scope=settings["scope"],
)

In [4]:
async def create_item_async(s3_key, href_format):
    """Run the synchronous ``create_item`` call in a worker thread.

    Args:
        s3_key: Passed through as the first positional argument of ``create_item``.
        href_format: Passed through as the second positional argument of
            ``create_item``.

    Returns:
        Whatever ``create_item(s3_key, href_format)`` returns.
    """
    # Inside a coroutine, get_running_loop() is the preferred way to obtain
    # the loop; get_event_loop() has more complex, policy-dependent behaviour.
    loop = asyncio.get_running_loop()
    # One dedicated worker thread per call: the caller decides how many of
    # these run at once, independent of the default executor's size.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return await loop.run_in_executor(pool, create_item, s3_key, href_format)

async def process_items(s3_keys, href_format: str, max_concurrent: int = 10):
    """Create items for every key, running at most ``max_concurrent`` at a time.

    Args:
        s3_keys: Keys to process; each one is handed to ``create_item_async``.
        href_format: Forwarded unchanged to ``create_item_async``.
        max_concurrent: Upper bound on the number of keys processed at once.

    Returns:
        A list with one result per key, in the order the work finished
        (not necessarily the order of ``s3_keys``).
    """
    limiter = asyncio.Semaphore(max_concurrent)

    async def _bounded_create(key):
        # Hold a semaphore slot for the whole duration of the item creation.
        async with limiter:
            return await create_item_async(key, href_format=href_format)

    pending = list(map(_bounded_create, s3_keys))

    # as_completed yields awaitables as they finish; tqdm draws the progress bar.
    progress = tqdm.notebook.tqdm(asyncio.as_completed(pending), total=len(pending))
    return [await finished for finished in progress]


## Find COGs in S3
The GLAD COGs are in the `nasa-maap-data-store` bucket under the `file-staging/nasa-map/glad-glclu2020` prefix.

In [5]:
s3_bucket = "nasa-maap-data-store"
s3_key_prefix = "file-staging/nasa-map/glad-glclu2020"

s3_store = S3Store.from_session(
    session,
    s3_bucket,
    config={"AWS_REGION": "us-west-2"},
)


def _list_tif_uris(year):
    """Return the s3:// URI of every ``.tif`` object under the ``v2/<year>/`` prefix."""
    uris = []
    # obs.list yields batches of object records; each record exposes its key under "path".
    for batch in obs.list(s3_store, f"{s3_key_prefix}/v2/{year}/"):
        uris.extend(
            f"s3://{s3_bucket}/{meta['path']}"
            for meta in batch
            if meta["path"].endswith(".tif")
        )
    return uris


# One list of COG URIs per dataset "year" folder, including the change layer.
s3_keys_by_year = {
    year: _list_tif_uris(year)
    for year in ("2000", "2005", "2010", "2015", "2020", "2000-2020change")
}
print(s3_keys_by_year["2020"][0])

s3://nasa-maap-data-store/file-staging/nasa-map/glad-glclu2020/v2/2020/00N_000E.tif


## Upload collections to STAC ingestor

In [6]:
# Build the annual collection metadata, using the first 2020 COG as the sample asset.
annual_collection = create_collection(
    id=CollectionIDs.GLAD_GLCLU2020,
    sample_asset_href=s3_keys_by_year["2020"][0],
)

# Publish the collection through the STAC ingestor.
post_annual = httpx.post(
    f"{INGESTOR_URL}/collections",
    json=annual_collection.to_dict(),
    headers={
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    },
)
# Stop on a 4xx/5xx response so items are never ingested into a collection
# that failed to publish (matches the status check in the ingestion loop).
post_annual.raise_for_status()
print(post_annual.json())

['Successfully published: glad-glclu2020-v2']


In [7]:
# Build the change collection metadata, using the first 2000-2020 change COG
# as the sample asset.
change_collection = create_collection(
    id=CollectionIDs.GLAD_GLCLU2020_CHANGE,
    sample_asset_href=s3_keys_by_year["2000-2020change"][0],
)

# Publish the collection through the STAC ingestor.
post_change = httpx.post(
    f"{INGESTOR_URL}/collections",
    json=change_collection.to_dict(),
    headers={
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    },
)
# Stop on a 4xx/5xx response so items are never ingested into a collection
# that failed to publish (matches the status check in the ingestion loop).
post_change.raise_for_status()
print(post_change.json())

['Successfully published: glad-glclu2020-change-v2']


## Generate item metadata and upload to ingestor

In [10]:
for year, s3_keys in s3_keys_by_year.items():
    print("processing", year)
    loop = asyncio.get_event_loop()
    items = await process_items(
        s3_keys,
        href_format="s3://nasa-maap-data-store/file-staging/nasa-map/glad-glclu2020/v2/{year}/{loc}.tif",
        max_concurrent=20 
    )
    items = list(items)
    
    for item in tqdm.notebook.tqdm(items):
        request = httpx.post(
            f"{INGESTOR_URL}/ingestions",
            json=item.to_dict(),
            headers = {
                'Authorization': f'Bearer {token}',
                'Content-Type': 'application/json',  # Assuming you are sending JSON data
            },
            timeout=60,
        )
        request.raise_for_status()

processing 2005


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/280 [00:00<?, ?it/s]

processing 2010


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/280 [00:00<?, ?it/s]

processing 2015


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/280 [00:00<?, ?it/s]

processing 2020


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/280 [00:00<?, ?it/s]

processing 2000-2020change


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/280 [00:00<?, ?it/s]