In [4]:
import boto3
from pathlib import Path


In [9]:
# !pip install -U pystac-client
from pystac_client import Client

## Connecting to the MAAP STAC catalog using pystac-client

In [46]:
# Root URL of the MAAP STAC endpoint.
URL = 'https://stac.maap-project.org/'
# Open a pystac-client `Client` on that URL; `cat` is the handle the collection-listing cell iterates over.
cat = Client.open(URL)

## Listing the IDs of all STAC collections

In [47]:

# Gather the ID of every collection the catalog reports.
stac_collection = [collection.id for collection in cat.get_all_collections()]

# Echo the IDs one per line, then the total count.
for collection_id in stac_collection:
    print(collection_id)
print(f"Number of collections in STAC :{len(stac_collection)}")

ESACCI_Biomass_L4_AGB_V3_100m_2018
ESACCI_Biomass_L4_AGB_V3_100m_2017
ESACCI_Biomass_L4_AGB_V3_100m_2010
Landsat8_SurfaceReflectance
Global_PALSAR2_PALSAR_FNF
Global_Forest_Change_2000-2017
AFRISAR_DLR2
AfriSAR_UAVSAR_KZ
AfriSAR_UAVSAR_Ungeocoded_Covariance
AfriSAR_UAVSAR_Normalization_Area
AfriSAR_UAVSAR_Geocoded_SLC
AfriSAR_UAVSAR_Geocoded_Covariance
GlobCover_09
GlobCover_05_06
GEDI_CalVal_Field_Data
AfriSAR_UAVSAR_Coreg_SLC
GEDI_CalVal_Lidar_Data_Compressed
GEDI_CalVal_Lidar_Data
ALOS_PSR_RTC_HIGH
ABoVE_UAVSAR_PALSAR
AFRISAR_DLR
BIOSAR1
NASA_JPL_global_agb_mean_2020
icesat2-boreal
ESACCI_Biomass_L4_AGB_V4_100m_2020
ESACCI_Biomass_L4_AGB_V4_100m_2019
ESACCI_Biomass_L4_AGB_V4_100m_2018
ESACCI_Biomass_L4_AGB_V4_100m_2017
ICESat2_Boreal_AGB_tindex_average
Number of collections in STAC :29


## Listing all the Collections under the prefix 'file-staging/nasa-map/' from S3 Bucket 'nasa-maap-data-store'

In [48]:
import boto3

# Low-level S3 client used to enumerate the "folders" directly under the staging prefix.
s3 = boto3.client('s3')
bucket_name = 'nasa-maap-data-store'

prefix = 'file-staging/nasa-map/'

# list_objects_v2 returns at most 1000 entries per call, so walk every page
# instead of reading a single response; otherwise collections past the first
# page would be silently dropped.
paginator = s3.get_paginator('list_objects_v2')
common_prefixes = []
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/'):
    # 'CommonPrefixes' is absent when a page has no sub-prefixes; default to an
    # empty list so an empty listing yields zero collections instead of a TypeError.
    common_prefixes.extend(page.get('CommonPrefixes', []))

# Each common prefix looks like '<prefix><collection>/'; the final path component
# is the collection folder name, independent of how deep `prefix` is.
collections = [Path(common_prefix['Prefix']).name for common_prefix in common_prefixes]
print(collections)
print(f"Number of collections : {len(collections)}")

['30mATL08_boreal___0', 'ABLVIS1B___001', 'ABLVIS2___001', 'ABLVIS2___1', 'ABoVE_UAVSAR_PALSAR___1', 'AFLVIS1B___001', 'AFLVIS2___001', 'ALOS_PSR_L1.5___1', 'ALOS_PSR_RTC_HIGH___1', 'ATL03___004', 'ATL03___005', 'ATL08_001_user-added___0', 'ATL08_ARD-beta___001', 'ATL08_LAS___004', 'ATL08___004', 'ATL08___005', 'AfriSAR_AGB_Maps_1681___1', 'AfriSAR_KingAir_B200_flight_tracks_Gabon___1', 'AfriSAR_LVIS_Footprint_Cover_1591___1', 'AfriSAR_Mondah_Field_Data_1580___1', 'AfriSAR_UAVSAR_Coreg_SLC___1', 'AfriSAR_UAVSAR_Geocoded_Covariance___1', 'AfriSAR_UAVSAR_Geocoded_SLC___1', 'AfriSAR_UAVSAR_KZ___1', 'AfriSAR_UAVSAR_MLC___2', 'AfriSAR_UAVSAR_Normalization_Area___1', 'AfriSAR_UAVSAR_SLC___2', 'AfriSAR_UAVSAR_Ungeocoded_Covariance___1', 'Afrisar_LVIS_Biomass_VProfiles_1775___1', 'ESACCI_Biomass_L4_AGB_V4_100m_2017', 'ESACCI_Biomass_L4_AGB_V4_100m_2018', 'ESACCI_Biomass_L4_AGB_V4_100m_2019', 'ESACCI_Biomass_L4_AGB_V4_100m_2020', 'EnvStress___1', 'GEDI01_B___001', 'GEDI01_B___002', 'GEDI02_A___

## Finding collections present in S3 but not in STAC

In [49]:
# Lower-cased STAC collection IDs, kept as a set so membership is an exact,
# case-insensitive ID match. (Testing `in` against a comma-joined string is a
# substring test: an S3 name that is merely a fragment of some STAC ID would be
# wrongly reported as present.)
all_stac_collection = [i.lower() for i in stac_collection]
stac_collection_ids = set(all_stac_collection)

s3_stac_missing = []
for collection in collections:
    # S3 folder names look like '<collection>___<version>'; keep everything
    # before the first '__'. Names without that separator are used unchanged.
    collection_name = collection.partition('__')[0]
    if collection_name.lower() not in stac_collection_ids:
        s3_stac_missing.append(collection)
s3_stac_missing

['30mATL08_boreal___0',
 'ABLVIS1B___001',
 'ABLVIS2___001',
 'ABLVIS2___1',
 'AFLVIS1B___001',
 'AFLVIS2___001',
 'ALOS_PSR_L1.5___1',
 'ATL03___004',
 'ATL03___005',
 'ATL08_001_user-added___0',
 'ATL08_ARD-beta___001',
 'ATL08_LAS___004',
 'ATL08___004',
 'ATL08___005',
 'AfriSAR_AGB_Maps_1681___1',
 'AfriSAR_KingAir_B200_flight_tracks_Gabon___1',
 'AfriSAR_LVIS_Footprint_Cover_1591___1',
 'AfriSAR_Mondah_Field_Data_1580___1',
 'AfriSAR_UAVSAR_MLC___2',
 'AfriSAR_UAVSAR_SLC___2',
 'Afrisar_LVIS_Biomass_VProfiles_1775___1',
 'EnvStress___1',
 'GEDI01_B___001',
 'GEDI01_B___002',
 'GEDI02_A___001',
 'GEDI02_A___002',
 'GEDI02_B___002',
 'GEDI_CalVal_Field_USA_Sonoma___1',
 'GEDI_CalVal_Lidar_USA_Sonoma___1',
 'Global_PALSAR2_PALSAR_Mosiac___1',
 'LVISF1B___001',
 'LVISF1B___1',
 'LVISF2___001',
 'LVISF2___1',
 'Landsat7_SurfaceReflectance___1',
 'NISAR_RSLC___001',
 'PolInSAR_Canopy_Height_1589___1',
 'Polarimetric_CT_1601___1',
 'Polarimetric_CT_user-added___0',
 'Polarimetric_height

## Listing collections missing from STAC (present in S3) and their size in GB

In [54]:
# Total stored bytes for each collection that is in S3 but missing from STAC,
# keyed by the S3 folder name.
size_map = {}
# NOTE: this rebinds `s3` to a boto3 *resource* (it is also used as the name of a client in the S3 listing cell).
s3 = boto3.resource('s3')
data_bucket = s3.Bucket("nasa-maap-data-store")
for collection in s3_stac_missing:
    print(f"Processing collection: {collection}")
    # The trailing '/' limits the listing to this collection's own folder.
    # Without it, any sibling folder whose name merely starts with this name
    # (e.g. 'X___1' vs 'X___10') would be counted as well.
    collection_objects = data_bucket.objects.filter(
        Prefix=f"file-staging/nasa-map/{collection}/"
    )
    collection_size = sum(obj.size for obj in collection_objects)
    # Reported in decimal gigabytes (1 GB = 10^9 bytes).
    print(f"Collection size: {collection_size/1000000000} GB")
    size_map[collection] = collection_size

Processing collection: 30mATL08_boreal___0
Collection size: 0.008224004 GB
Processing collection: ABLVIS1B___001
Collection size: 1038.502559429 GB
Processing collection: ABLVIS2___001
Collection size: 111.568167566 GB
Processing collection: ABLVIS2___1
Collection size: 111.562109695 GB
Processing collection: AFLVIS1B___001
Collection size: 110.156656369 GB
Processing collection: AFLVIS2___001
Collection size: 16.356217403 GB
Processing collection: ALOS_PSR_L1.5___1
Collection size: 382.603999518 GB
Processing collection: ATL03___004
Collection size: 261251.882255514 GB
Processing collection: ATL03___005
Collection size: 5.567143698 GB
Processing collection: ATL08_001_user-added___0
Collection size: 0.070835492 GB
Processing collection: ATL08_ARD-beta___001
Collection size: 3588.897073543 GB
Processing collection: ATL08_LAS___004
Collection size: 0.000193003 GB
Processing collection: ATL08___004
Collection size: 14601.77024958 GB
Processing collection: ATL08___005
Collection size: 1641