In [None]:
# Installing necessary libraries
%pip install geopandas
%pip install tqdm

In [None]:
# Importing packages
import requests
import boto3
import geopandas as gpd
from tqdm import tqdm
import re

In [None]:
# Hostnames of the two CMR deployments queried in this notebook;
# get_size_estimates picks one of them based on its `host` argument.
NASA_HOST = "cmr.earthdata.nasa.gov"
MAAP_HOST = "cmr.maap-project.org"

# Tile indices for the Copernicus DEM (`dem`) and the boreal grid (`boreal`).
# Both paths are relative, so the files are looked up from the current working directory.
dem = gpd.read_file("dem30m_tiles.geojson")
boreal = gpd.read_file("boreal_grid_albers90k_gpkg.gpkg")

In [None]:
def get_size_from_results(response, format, seen):
    '''
        Sums the granule sizes reported in one page of a CMR granule search.

        response: decoded JSON body of the search response (a dict).
        format: "umm_json" reads granules from response["items"]; any other
            value reads them from response["feed"]["entry"].
        seen: list of GranuleUR values counted by earlier calls. Only used for
            "umm_json": granules whose GranuleUR is already listed are skipped,
            and each newly counted GranuleUR is appended to the list in place.

        Returns a (total_size, num_granules) tuple covering only the granules
        counted from this page. For "umm_json" the size comes from the
        "SizeInBytes" field; otherwise from "granule_size".
        NOTE(review): "granule_size" may not be expressed in bytes -- confirm
        the unit before comparing totals across formats.

        Raises KeyError / IndexError when a granule lacks the expected fields.
    '''
    if format == "umm_json":
        # A set gives constant-time membership tests; scanning the `seen` list
        # for every granule becomes very slow once it holds many entries.
        known = set(seen)
        fresh = []
        total_size = 0
        for granule in response["items"]:
            umm = granule["umm"]
            granule_ur = umm["GranuleUR"]
            if granule_ur in known:
                continue
            # Registering immediately also de-duplicates within this page.
            known.add(granule_ur)
            fresh.append(granule_ur)
            total_size += float(
                umm["DataGranule"]["ArchiveAndDistributionInformation"][0]["SizeInBytes"]
            )
        # Only new identifiers are recorded, so `seen` never accumulates repeats.
        seen.extend(fresh)
        return total_size, len(fresh)

    # Feed layout: every entry is counted; `seen` is not consulted here.
    entries = response["feed"]["entry"]
    total_size = sum(float(entry["granule_size"]) for entry in entries)
    return total_size, len(entries)

In [None]:
def get_size_estimates(query_dict, host="maap", format="json", seen=None):
    '''
        Pages through a CMR granule search and accumulates size estimates.

        query_dict: search parameters, sent as the form body of each POST.
        host: "maap" queries MAAP_HOST; any other value queries NASA_HOST.
        format: response format, used both as the URL suffix
            (/search/granules.<format>) and to pick the parsing branch in
            get_size_from_results.
        seen: optional list used for de-duplication across calls; it is handed
            to get_size_from_results, which may append to it. When omitted, a
            fresh list is created for this call only.

        Returns {"total_size": ..., "num_granules": ...}. If a request comes
        back with a non-200 status, the status and body are printed and the
        totals accumulated so far are returned.
    '''
    # Use `is None` rather than truthiness: a caller-supplied empty list must be
    # kept (and mutated) so that de-duplication state survives between calls,
    # while the default must not be one list shared by every call.
    if seen is None:
        seen = []

    host = MAAP_HOST if host == "maap" else NASA_HOST
    base_url = f"https://{host}/search/granules.{format}"

    headers = {}

    total_size = 0
    num_granules = 0

    while True:
        if num_granules and (num_granules % 1000 == 0):
            print(f"{num_granules} granules processed")

        response = requests.post(base_url, data=query_dict, headers=headers)
        if response.status_code == 200:
            size, num_grans = get_size_from_results(response.json(), format, seen)
            total_size += size
            num_granules += num_grans

            # The server's CMR-Search-After value is echoed back on the next
            # request to fetch the following page; its absence ends the loop.
            if search_after := response.headers.get("CMR-Search-After"):
                headers = {"CMR-Search-After": search_after}
            else:
                print("No more granules")
                break
        else:
            print("Response status code:", response.status_code)
            print(response.text)
            break

    return {"total_size": total_size, "num_granules": num_granules}

### Sentinel 1 estimates

In [None]:
# Size estimate for the Sentinel collection over calendar year 2019, queried on the NASA host
sentinel_dict = dict(
    collection_concept_id="C1214470488-ASF",
    pageSize=2000,
    temporal="2019-01-01T00:00:00Z,2019-12-31T23:59:59Z",
)

print(get_size_estimates(sentinel_dict, host="nasa"))

### ATL08 v5 estimates

In [None]:
# Size estimate for ATL08 v5: 1 June to 30 September 2019 within the bounding box
# "-180,50,180,75", queried on the default (MAAP) host
atl08_dict = dict(
    collection_concept_id="C1201746153-NASA_MAAP",
    pageSize=2000,
    temporal="2019-06-01T00:00:00Z,2019-09-30T23:59:59Z",
    bounding_box="-180,50,180,75",
    provider="NASA_MAAP",
)
print(get_size_estimates(atl08_dict))

### ATL03 v4 estimates

In [None]:
# Size estimate for ATL03 v4: 1 June to 30 September 2019 within the bounding box
# "-180,50,180,75", queried on the default (MAAP) host
atl03_dict = dict(
    collection_concept_id="C1201300747-NASA_MAAP",
    pageSize=2000,
    temporal="2019-06-01T00:00:00Z,2019-09-30T23:59:59Z",
    bounding_box="-180,50,180,75",
    provider="NASA_MAAP",
)
print(get_size_estimates(atl03_dict))

### Copernicus DEM estimates

In [None]:
def get_copernicus_s3_list():
    '''
        Returns, as a list, the "s3" column of every row in the dem index whose
        geometry intersects the union of the boreal index geometries.
        The boreal index is reprojected to EPSG:4326 before the union is taken.
    '''
    boreal_footprint = boreal.to_crs("EPSG:4326").unary_union
    overlaps_boreal = dem.intersects(boreal_footprint)
    return dem[overlaps_boreal]["s3"].to_list()

In [None]:
def get_size_aws(list_of_urls):
    '''
        Adds up the ContentLength of every object named in list_of_urls.

        list_of_urls: iterable of "s3://<bucket>/<key>" strings.
        Returns a (total_size, count) tuple: the summed ContentLength values
        and the number of urls processed. One head_object call is made per url.
    '''
    client = boto3.client("s3")
    byte_total = 0
    object_count = 0
    for s3_url in tqdm(list_of_urls):
        # Everything before the first "/" is the bucket, the remainder is the key.
        bucket, _, key = s3_url.replace("s3://", "").partition("/")
        head = client.head_object(Bucket=bucket, Key=key)
        byte_total += head["ContentLength"]
        object_count += 1
    return byte_total, object_count

In [None]:
# Total size and object count of the Copernicus DEM files that intersect the boreal index
print(get_size_aws(get_copernicus_s3_list()))

### HLS v2 estimates

In [None]:
def _reverse_polygon(polygon):
    '''
        CMR supports counter-clockwise polygons as query params, this function converts the clock-wise polygons that geopandas gives to counterclockwise polygons that cmr accepts
    '''
    reversed = []
    for index in range(len(polygon)-1, -1, -2):
        reversed.append(polygon[index-1])
        reversed.append(polygon[index])
    return ",".join(reversed)

In [None]:
def get_boreal_polygons():
    '''
        Builds one cmr polygon string per geometry of the boreal index.

        Each geometry of boreal (reprojected to EPSG:4326) is rendered with
        str(), every number in that text is extracted in order of appearance,
        and the flat list of number strings is passed to _reverse_polygon.

        Returns a list holding the _reverse_polygon result for each geometry.
    '''
    # Accepts whole numbers, decimals and exponent notation. Requiring a
    # decimal point would silently drop a coordinate printed as e.g. "60" and
    # shift every following longitude/latitude pair out of alignment.
    number_regex = re.compile(
        r"[+-]?(?:[0-9]+[.]?[0-9]*|[.][0-9]+)(?:[eE][+-]?[0-9]+)?"
    )
    return [
        _reverse_polygon(number_regex.findall(str(geometry)))
        for geometry in boreal.to_crs("EPSG:4326")["geometry"]
    ]

In [None]:
# HLS v2 estimates: one query per boreal polygon and per year (2019 through 2021),
# each restricted to 1 June - 15 September of that year.
polygons = get_boreal_polygons()
hls_dict = {
    "collection_concept_id": "C2021957295-LPCLOUD",
    "pageSize": 2000
}

# Shared by every query so that a granule returned for more than one polygon
# is only counted the first time it appears.
hls_seen_granules = []
total_size = 0
num_granules = 0

for year in range(2019, 2022):
    # Built once per year as a plain string (a stray trailing comma previously
    # turned this value into a one-element tuple).
    temporal = f"{year}-06-01T00:00:00Z,{year}-09-15T23:59:59Z"
    for polygon in tqdm(polygons):
        hls_dict["polygon"] = polygon
        hls_dict["temporal"] = temporal
        result = get_size_estimates(hls_dict, host="nasa", format="umm_json", seen=hls_seen_granules)

        total_size += result["total_size"]
        num_granules += result["num_granules"]

print({
    "total_size": total_size,
    "num_granules": num_granules
})