In [1]:
# Installing necessary libraries
%pip install geopandas
%pip install tqdm

Collecting geopandas
  Using cached geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
Collecting pyproj>=2.2.0
  Using cached pyproj-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl (7.7 MB)
Collecting shapely>=1.6
  Using cached Shapely-1.8.1.post1-cp39-cp39-macosx_10_9_x86_64.whl (1.2 MB)
Collecting fiona>=1.8
  Using cached Fiona-1.8.21-cp39-cp39-macosx_10_10_x86_64.whl (18.5 MB)
Collecting munch
  Using cached munch-2.5.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: munch, shapely, pyproj, fiona, geopandas
Successfully installed fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.3.0 shapely-1.8.1.post1
You should consider upgrading via the '/Users/sadhikar/repo/MAAP/playground/env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/sadhikar/repo/MAAP/playground/env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use 

In [2]:
# Importing packages
import requests
import boto3
import geopandas as gpd
from tqdm import tqdm
import re



In [3]:
# Setting up cmr hosts
NASA_HOST = "cmr.earthdata.nasa.gov"
MAAP_HOST = "cmr.maap-project.org"

# Reading the indices for boreal and copernicus
dem = gpd.read_file("dem30m_tiles.geojson")
boreal = gpd.read_file("boreal_grid_albers90k_gpkg.gpkg")

In [4]:
def get_size_from_results(response, format, seen):
    '''
        Based on the response from the post call, format and the seen list,
        gives the total size and num granules from the response

        :param response: response from the post call to the cmr
        :param format: format of the data (json or umm_json)
        :param seen: list of seen granules (to avoid duplicates)

        :return: total size in bytes and number of granules
    '''

    # HLS requires umm_json because the default json doesn't have the size of the granule
    if format == "umm_json":
        results = response["items"]
        # Only keep the granules whose `GranuleUR` is not in the seen list
        unique = [result for result in results if result["umm"]["GranuleUR"] not in seen]
        total_size = sum(
            [
                float(
                    granule["umm"]["DataGranule"]["ArchiveAndDistributionInformation"][0]["SizeInBytes"]
                )
                for granule in unique
            ]
        )
        num_granules = len(unique)
        # Add the granuleUR to the seen list
        seen.extend([result["umm"]["GranuleUR"] for result in unique])
    else:
        # For the non-HLS case, the size is in the json response
        # We repeat the same process as above, but with the json response
        results = response["feed"]["entry"]
        unique = [result for result in results if result["producer_granule_id"] not in seen]

        total_size = sum([float(result["granule_size"]) for result in unique])
        num_granules = len(unique)

        seen.extend([result["producer_granule_id"] for result in unique])

    return total_size, num_granules

In [5]:
def get_size_estimates(query_dict, host="maap", format="json", seen=[]):
    '''
        Based on the query params (query_dict) to cmr (host: maap or nasa), format and seen list (for de-duplication),
        gives size estimates

        :param query_dict: dict of query params that the cmr accepts
        :param host: cmr host (maap or nasa)
        :param format: format of the data (json or umm_json)
        :param seen: list of seen granuleids (to avoid duplicates)

        :return: dict of total_size (in bytes) and num_granules
    '''
    # Setting the host and url based on the inputs
    host = MAAP_HOST if host == "maap" else NASA_HOST
    base_url = f"https://{host}/search/granules.{format}"

    headers = {}

    # Initialize the size and count
    total_size = 0
    num_granules = 0

    # The cmr results are paginated, so we need to loop through the pages until all the granules are accounted for
    while True:
        if num_granules and (num_granules % 1000 == 0):
            print(f"{num_granules} granules processed")

        # Making the post call to cmr
        response = requests.post(base_url, data=query_dict, headers=headers)
        if response.status_code == 200:
            # Calculate size and granule count and add them up
            size, num_grans = get_size_from_results(response.json(), format, seen)
            total_size += size
            num_granules += num_grans
        
        # If there's more granules in the next page, update header and run again, else break
            if search_after := response.headers.get("CMR-Search-After"):
                headers = {"CMR-Search-After": search_after}
            else:
                print("No more granules")
                break
        # If the response is not 200, print the error and break
        else:
            print("Response status code:", response.status_code)
            print(response.text)
            break

    return {"total_size": total_size, "count": num_granules}

### Sentinel 1 estimates

In [6]:
# Build the query params for Sentinel 1 to send to cmr, with the given temporal range
sentinel_dict = {
    "collection_concept_id": "C1214470488-ASF",
    "pageSize": 2000,
    "temporal": "2019-01-01T00:00:00Z,2019-12-31T23:59:59Z",
}

# Call the get_size_estimates function to get the size estimates
# The data is hosted in the nasa cmr (thus, host="nasa")
sentinel_estimates = get_size_estimates(sentinel_dict, host="nasa")
print(sentinel_estimates)

2000 granules processed
4000 granules processed
6000 granules processed
8000 granules processed
10000 granules processed
12000 granules processed
14000 granules processed
16000 granules processed
18000 granules processed
20000 granules processed
22000 granules processed
24000 granules processed
26000 granules processed
28000 granules processed
30000 granules processed
32000 granules processed
34000 granules processed
36000 granules processed
38000 granules processed
40000 granules processed
42000 granules processed
44000 granules processed
46000 granules processed
48000 granules processed
50000 granules processed
52000 granules processed
54000 granules processed
56000 granules processed
58000 granules processed
60000 granules processed
62000 granules processed
64000 granules processed
66000 granules processed
68000 granules processed
70000 granules processed
72000 granules processed
74000 granules processed
76000 granules processed
78000 granules processed
80000 granules processed
8200

### ATL08 v5 estimates

In [7]:
# Build the query params for ATL08 v5 to send to cmr, with the given temporal range and bounding box
atl08_dict = {
    'collection_concept_id': "C1201746153-NASA_MAAP",
    'pageSize': 2000,
    'temporal': '2019-06-01T00:00:00Z,2019-09-30T23:59:59Z',
    'bounding_box': '-180,50,180,75',
    'provider': 'NASA_MAAP'
}

# Call the get_size_estimates function to get the size estimates
# The data is hosted in the maap cmr (host is defaulted to "maap")
atl08_estimates = get_size_estimates(atl08_dict)
print(atl08_estimates)

2000 granules processed
4000 granules processed
No more granules
{'total_size': 382292.7138280845, 'count': 5882}


### ATL03 v4 estimates

In [8]:
# Build the query params for ATL03 v4 to send to cmr, with the given temporal range and bounding box
atl03_dict = {
    'collection_concept_id': "C1201300747-NASA_MAAP",
    'pageSize': 2000,
    'temporal': '2019-06-01T00:00:00Z,2019-09-30T23:59:59Z',
    'bounding_box': '-180,50,180,75',
    'provider': 'NASA_MAAP'
}

# Call the get_size_estimates function to get the size estimates
# The data is hosted in the maap cmr (host is defaulted to "maap")
atl03_estimates = get_size_estimates(atl03_dict)
print(atl03_estimates)

2000 granules processed
4000 granules processed
6000 granules processed
No more granules
{'total_size': 11310057.160719872, 'count': 6593}


### Copernicus DEM estimates

In [9]:
def get_copernicus_s3_list():
    '''
        Gets all the s3 files from the dem index that intersect with the boreal index
    '''
    # Get the intersection of indices from boreal and copernicus dem
    selection = dem[dem.intersects(boreal.to_crs("EPSG:4326").unary_union)]
    # Get all the s3 file urls from the intersection
    return selection["s3"].to_list()

In [10]:
def get_size_aws(list_of_urls):
    '''
        Gets the cumulative size of all the files from the list_of_urls (s3 urls)

        :param: list_of_urls: list of s3 urls in the form s3://<bucket>/<path to file>

        :return: total size in bytes and count of files
    '''
    s3 = boto3.client("s3")
    total_size = 0
    count = 0
    for url in tqdm(list_of_urls):
        # Get bucket and key from the s3:// url
        split_url = url.replace("s3://", "").split("/")
        bucket, key = split_url[0], "/".join(split_url[1:])

        # Make a head request to get the size of the file
        response = s3.head_object(Bucket=bucket, Key=key)

        # Add to size and increment count
        total_size += response["ContentLength"]
        count += 1

    return {
        "total_size": total_size,
        "count": count
    }

In [11]:
# Calls the get_size_aws function to get the size estimates for all the s3 urls
copernicus_estimates = get_size_aws(
    # Get the list of s3 urls from the dem intersection boreal indices
    get_copernicus_s3_list()
)
print(copernicus_estimates)

100%|██████████| 5994/5994 [15:27<00:00,  6.46it/s]

{'total_size': 140900308827, 'count': 5994}





### HLS v2 estimates

In [12]:
def _reverse_polygon(polygon):
    '''
        CMR supports counter-clockwise polygons as query params, this function converts the clock-wise polygons that geopandas gives to counterclockwise polygons that cmr accepts

        :param polygon: polygon in the form [x1, y1, x2, y2, ...] in clockwise order

        :param: polygon in the form "x1,y1,x2,y2,..." in counterclockwise order (format that cmr accepts)
    '''
    reversed = []
    # Read the polygon from end to start
    for index in range(len(polygon)-1, -1, -2):
        # Add a pair of coordinates to the reversed list
        reversed.append(polygon[index-1])
        reversed.append(polygon[index])
    return ",".join(reversed)

In [13]:
def get_boreal_polygons():
    '''
        Get all the polygons bounds from boreal index, used to make calls to cmr
    '''
    polygons = []
    # Regex to find all the floating point numbers in a string
    float_regex = "[+-]?[0-9]*[.][0-9]+"

    for polygon in boreal.to_crs("EPSG:4326")["geometry"]:
        # Finds all the floating point numbers (which are coordinates) in the polygon
        long_lats = re.findall(float_regex, f"{str(polygon)}")
        # Reverses them and adds to the polygons list
        polygons.append(_reverse_polygon(long_lats))
    return polygons

In [None]:
# Get all the polygons from the boreal index
polygons = get_boreal_polygons()

# Build query params for the cmr call for HLS v2
hls_dict = {
    "collection_concept_id": "C2021957295-LPCLOUD",
    "pageSize": 2000
}

# Initialize an empty list to store the granuleids that are already accounted for (to avoid duplicates)
hls_seen_granules = []

# Initialize total size and number of granules
total_size = 0
num_granules = 0

# Iterate through years 2019 - 2021
for year in range(2019, 2022):
    # For each of the polygon, we make a cmr call to get the granules, and add them to the total
    # We use the hls_seen_granules list to avoid duplicates
    for polygon in tqdm(polygons):
        hls_dict["polygon"] = polygon
        hls_dict["temporal"] = f"{year}-06-01T00:00:00Z,{year}-09-15T23:59:59Z",
        result = get_size_estimates(hls_dict, host="nasa", format="umm_json", seen=hls_seen_granules)

        total_size += result["total_size"]
        num_granules += result["count"]

hls_estimates = {
    "total_size": total_size,
    "count": num_granules
}

#### Combining all the estimates related to Boreal to get the total size estimates

In [18]:
combined_boreal = [atl03_estimates, atl08_estimates, copernicus_estimates, hls_estimates]
combined_boreal_estimates = {
    "total_size": sum([x["total_size"] for x in combined_boreal]),
    "count": sum([x["count"] for x in combined_boreal])
}
print(combined_boreal_estimates)

{'total_size': 140912001176.87454, 'count': 18469}
