# Publish reprocessed HLS items using provider generated metadata in s3

#### Assumptions
- This notebook follows the [update-hrefs.ipynb](https://github.com/NASA-IMPACT/veda-data/blob/main/transformation-scripts/update-hrefs.ipynb) notebook which updates the provider metadata to use the s3 hrefs for the objects in veda-data-store
- Assumption: the collection metadata in ingestion-data/production/collections is stac version 1.0.0 and has already been published to the target STAC catalog

#### Update the stac version and store objects in s3
- Search for all reprocessed item metadata in `s3://veda-data-store/<collection_id>`
- Update json to stac version 1.0.0, validate, and post back to s3
- Use target VEDA instance's ingest-api/ingestions endpoint to verify hrefs and publish item metadata to STAC

In [None]:
import boto3
import json
import requests
from pystac import Item

# Test
# TARGET_STAC_API_URL = "https://test.openveda.cloud/api/stac"
# TARGET_INGEST_API_URL = "https://test.openveda.cloud/api/ingest"

# Prod
TARGET_STAC_API_URL = "https://openveda.cloud/api/stac"
TARGET_INGEST_API_URL = "https://openveda.cloud/api/ingest"

# Placeholder: replace with a valid bearer token for the target ingest API
# before running. Never commit a real token with the notebook.
TOKEN = "SECRET"
authorization_header = f"Bearer {TOKEN}"
# Headers reused for every authenticated request made from this notebook.
headers = {
    "Authorization": authorization_header,
    "content-type": "application/json",
    "accept": "application/json",
}

# Call the ingest API's auth/me endpoint with the token before doing any work.
# The timeout keeps the notebook from hanging forever if the endpoint is
# unreachable; requests applies no timeout by default.
authme_url = f"{TARGET_INGEST_API_URL}/auth/me"
response = requests.get(authme_url, headers=headers, timeout=30)
# Last expression of the cell: the HTTP reason phrase is shown as cell output.
response.reason

In [None]:
# AWS credentials that are passed explicitly to the boto3 S3 client created
# later in this notebook. Replace every "[CHANGE ME]" placeholder before
# running, and do not commit real values with the notebook.
AWS_ACCESS_KEY_ID = "[CHANGE ME]"
AWS_SECRET_ACCESS_KEY = "[CHANGE ME]"
# NOTE(review): a session token implies temporary credentials that will
# expire -- confirm with whoever issues them.
AWS_SESSION_TOKEN = "[CHANGE ME]"

In [None]:
# Build the S3 client from the explicit credentials defined in this notebook
# by collecting them into keyword arguments first.
aws_credentials = {
    "aws_access_key_id": AWS_ACCESS_KEY_ID,
    "aws_secret_access_key": AWS_SECRET_ACCESS_KEY,
    "aws_session_token": AWS_SESSION_TOKEN,
}
s3_client = boto3.client("s3", **aws_credentials)

## Update the json in s3 to stac version 1.0.0

The provided metadata are `stac_version` `1.0.0-beta.2`, but a minor modification to how the `stac_extensions` are provided brings them up to the same stac version `1.0.0` used for the rest of the collections in our STAC catalog(s).


> **WARNING:** This cell replaces existing files in s3 instead of creating new versions. We are using it for a one-time cleanup of a small, known collection of invalid metadata that needs to be corrected.

In [None]:
bucket_name = "veda-data-store"
collection_ids = ["hlsl30-002-ej-reprocessed", "hlss30-002-ej-reprocessed"]
dry_run = True
verbose = True


def _upgrade_item_dict(item_dict, collection_id):
    """Upgrade a provider item dict in place to the STAC 1.0.0 layout.

    - Replaces any existing ``collection`` link with one whose href is
      ``collection_id``.
    - Sets ``stac_version`` to ``1.0.0``.
    - Expands short extension names into full schema URLs; entries that are
      already URLs are kept untouched.
    - Rewrites asset hrefs from the old prefix (the collection id without the
      ``-002-`` data version) to ``collection_id``.

    Returns the same (mutated) dict.
    """
    # Add correct collection link
    links = [link for link in item_dict["links"] if link["rel"] != "collection"]
    links.append(
        {"rel": "collection", "href": collection_id, "type": "application/json"}
    )
    item_dict["links"] = links

    # Update the stac version for these items from "stac_version": "1.0.0-beta.2"
    item_dict["stac_version"] = "1.0.0"

    # Add full extension hrefs https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#stac_extensions
    # Anything that is already a URL (on any host) is left alone so it is not
    # wrapped into a malformed stac-extensions.github.io URL.
    item_dict["stac_extensions"] = [
        ext
        if ext.startswith(("http://", "https://"))
        else f"https://stac-extensions.github.io/{ext}/v1.0.0/schema.json"
        for ext in item_dict.get("stac_extensions", [])
    ]

    # Make sure the asset hrefs are pointed at the correct collection's prefix.
    # Previous location did not have data version number.
    old_prefix = collection_id.replace("-002-", "-")
    for asset in item_dict["assets"].values():
        asset["href"] = asset["href"].replace(old_prefix, collection_id)

    return item_dict


# A single list_objects_v2 call returns at most one page of keys, so paginate
# to make sure every metadata file under the prefix is processed.
paginator = s3_client.get_paginator("list_objects_v2")

for collection_id in collection_ids:
    s3_prefix = f"{collection_id}/"

    # Filter for the STAC metadata files. Pages for an empty prefix carry no
    # "Contents" key, hence the .get() default.
    json_keys = [
        obj["Key"]
        for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix)
        for obj in page.get("Contents", [])
        if obj["Key"].endswith("stac-ej-reprocessed.json")
    ]
    print(f"\n{collection_id=} matched metadata for {len(json_keys)} items")

    for key in json_keys:
        # Backup the original version of this metadata
        # deprecated_key = key + ".deprecated"

        # if not dry_run:
        #     s3_client.copy_object(
        #         CopySource={'Bucket': bucket_name, 'Key': key},
        #         Bucket=bucket_name,
        #         Key=deprecated_key,
        #     )
        # if verbose:
        #     print(f"Copied {key} to {deprecated_key}")

        # Get object to update the metadata
        response = s3_client.get_object(Bucket=bucket_name, Key=key)
        item_dict = json.loads(response["Body"].read().decode("utf-8"))

        _upgrade_item_dict(item_dict, collection_id)

        # Validate the updated item. An item that cannot be parsed or fails
        # validation is reported and skipped so that invalid metadata never
        # overwrites the existing object in s3.
        try:
            Item.from_dict(item_dict).validate()
        except Exception as err:
            print(
                f"Invalid {collection_id=} item.id={item_dict.get('id')!r} {key=}: {err}"
            )
            continue

        if dry_run:
            if verbose:
                print(f"Would update {key} {dry_run=}")
            continue

        # Replace the s3 object with the updated metadata for stac version 1.0.0
        s3_client.put_object(Bucket=bucket_name, Key=key, Body=json.dumps(item_dict))
        if verbose:
            print(f"Updated {key}")

## Publish item records to STAC

In [None]:
# Re-declared here so this cell does not depend on the update cell having run.
bucket_name = "veda-data-store"
collection_ids = ["hlsl30-002-ej-reprocessed", "hlss30-002-ej-reprocessed"]
dry_run = True
verbose = True

# Ingest endpoint that receives each item.
publish_url = f"{TARGET_INGEST_API_URL}/ingestions"

# A single list_objects_v2 call returns at most one page of keys, so paginate
# to make sure every metadata file under the prefix is published.
paginator = s3_client.get_paginator("list_objects_v2")

for collection_id in collection_ids:
    s3_prefix = f"{collection_id}/"

    # Filter for the STAC metadata files. Pages for an empty prefix carry no
    # "Contents" key, hence the .get() default.
    json_keys = [
        obj["Key"]
        for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix)
        for obj in page.get("Contents", [])
        if obj["Key"].endswith("stac-ej-reprocessed.json")
    ]
    print(f"\n{collection_id=} matched metadata for {len(json_keys)} items")

    for key in json_keys:
        response = s3_client.get_object(Bucket=bucket_name, Key=key)
        item_dict = json.loads(response["Body"].read().decode("utf-8"))

        # Validate the item. Items that cannot be parsed or fail validation
        # are reported and skipped instead of being sent to the ingest API.
        try:
            Item.from_dict(item_dict).validate()
        except Exception as err:
            print(
                f"invalid {collection_id=} item.id={item_dict.get('id')!r} {key=}: {err}"
            )
            continue

        if dry_run:
            if verbose:
                print(
                    f"POST {publish_url} {collection_id=}\n{item_dict['id']=} {dry_run=}"
                )
            continue

        # Publish to target STAC catalog
        publish_response = requests.post(publish_url, headers=headers, json=item_dict)
        created = publish_response.reason == "Created"

        # Report every request when verbose; always report a failure, but only
        # once, and include the response body so the cause is visible.
        if verbose or not created:
            print(
                f"POST {publish_url} {collection_id=}\n{item_dict['id']=} {publish_response.reason=}"
            )
        if not created:
            print(publish_response.text)