# Setup Gabon GEDI L4A Testing

1. Download the Gabon outline in a geospatial format (GeoJSON).
2. Save it to the workspace (probably not to the repo?).
3. Query CMR with the bounding box of the polygon to find out how many granules are involved.

Boundary file is available at `shared-buckets/alexdevseed/iso3/GAB-ADM0.json`

In [1]:
!pip install geopandas profilehooks

[0m

In [26]:
import json
import os
import os.path
import sys
import urllib.parse
from typing import Any, Callable, Mapping, Optional, Iterable, TypeVar

import geopandas as gpd
import h5py
import numpy as np
import requests
from maap.maap import Granule, MAAP
from profilehooks import timecall

T = TypeVar('T')

In [14]:
# Hostnames of the NASA and MAAP CMR (Common Metadata Repository) instances;
# passed as ``cmr_host`` to the search helpers below.
nasa_cmr_host = 'cmr.earthdata.nasa.gov'
maap_cmr_host = 'cmr.maap-project.org'
# Shared MAAP API client used by ``find_granules`` and the collection lookup.
maap = MAAP('api.ops.maap-project.org')

## Functions

### General Functions

In [27]:
def for_each(f: Callable[[T], None], xs: Iterable[T]) -> None:
    """Apply ``f`` to every element of ``xs``, in iteration order.

    This exists purely for side effects: whatever ``f`` returns is discarded
    and the function itself returns ``None``.
    """
    for element in xs:
        f(element)


def pprint(value: Any) -> None:
    """Print ``value`` to stdout as JSON indented by two spaces."""
    rendered = json.dumps(value, indent=2)
    print(rendered)

    
def get_geo_boundary(iso: str, level: int) -> gpd.GeoDataFrame:
    """Load an administrative boundary from geoBoundaries, caching it on disk.

    The GeoJSON is cached at
    ``/projects/my-public-bucket/iso3/{iso}-ADM{level}.json``.  When that file
    does not exist yet, the boundary is looked up through the geoBoundaries
    request endpoint, downloaded, and written to the cache path first.

    Args:
        iso: Country code sent to geoBoundaries as ``ISO`` (e.g. ``'GAB'``).
        level: Administrative level, sent as ``ADM{level}`` (e.g. ``0``).

    Returns:
        The boundary read from the cached file with ``geopandas.read_file``.

    Raises:
        requests.HTTPError: If the lookup or the download request returns an
            error status.
    """
    file_path = f'/projects/my-public-bucket/iso3/{iso}-ADM{level}.json'

    if not os.path.exists(file_path):
        r = requests.get(
            'https://www.geoboundaries.org/gbRequest.html',
            dict(ISO=iso, ADM=f'ADM{level}')
        )
        r.raise_for_status()
        dl_url = r.json()[0]['gjDownloadURL']

        dl = requests.get(dl_url)
        # Fail here rather than caching an error payload: once the file
        # exists it is never downloaded again.
        dl.raise_for_status()
        geo_boundary = dl.json()

        # The cache directory may not exist yet (e.g. in a fresh workspace).
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w') as out:
            out.write(json.dumps(geo_boundary))

    return gpd.read_file(file_path)

### UMM Functions

In [5]:
# Unit assumed by ``umm_granule_size_in_bytes`` when a granule's ``SizeUnit``
# is missing or has no entry in ``SIZE_UNIT_FACTORS``.
SIZE_UNIT_DEFAULT = 'MB'
# Bytes per size unit, using decimal (power-of-1000) multiples.  TB and PB are
# included so that sizes in those units are not silently scaled with the
# default (MB) factor.
SIZE_UNIT_FACTORS = dict(
    KB=1000,
    MB=1000 ** 2,
    GB=1000 ** 3,
    TB=1000 ** 4,
    PB=1000 ** 5,
)


def umm_find_collections(
    cmr_host: str,
    params: Mapping[str, Any],
    **kwargs: Any
) -> Mapping[str, Any]:
    """Search a CMR host for collections and return the UMM-JSON response.

    Args:
        cmr_host: Hostname of the CMR instance to query.
        params: Query parameters for the collection search.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response has a 4xx/5xx status.
    """
    url = f'https://{cmr_host}/search/collections.umm_json'
    r = requests.get(url, params, **kwargs)
    # Surface a rejected search here instead of handing an error body to
    # callers that expect search results.
    r.raise_for_status()

    return r.json()


def umm_find_collection(
    cmr_host: str,
    params: Mapping[str, Any],
    **kwargs: Any
) -> Mapping[str, Any]:
    """Return the first collection matching ``params`` on ``cmr_host``.

    The search is delegated to ``umm_find_collections`` with ``page_size``
    forced to 1 (overriding any ``page_size`` in ``params``); ``kwargs`` are
    passed through.  The first element of the response's ``items`` is
    returned; an empty or missing ``items`` is not handled here.
    """
    single_result_params = {**params, 'page_size': 1}
    response = umm_find_collections(cmr_host, single_result_params, **kwargs)
    matches = response['items']

    return matches[0]


@timecall
def umm_find_granules(
    cmr_host: str,
    params: Mapping[str, Any],
    **kwargs: Any
) -> Mapping[str, Any]:
    """Search a CMR host for granules and return the UMM-JSON response.

    The request is sent as a POST when ``data`` or ``files`` is supplied in
    ``kwargs`` and as a GET otherwise.  The call is timed with ``timecall``.

    Args:
        cmr_host: Hostname of the CMR instance to query.
        params: Query parameters for the granule search.
        **kwargs: Extra keyword arguments forwarded to ``requests.request``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response has a 4xx/5xx status.
    """
    method = 'post' if 'data' in kwargs or 'files' in kwargs else 'get'
    # Build the URL directly, as umm_find_collections does.  Joining
    # 'granules.umm_json' onto '.../search' with urljoin replaces the final
    # 'search' segment (the base has no trailing slash) and produces
    # 'https://<host>/granules.umm_json'.
    url = f'https://{cmr_host}/search/granules.umm_json'
    r = requests.request(method, url, params=params, **kwargs)
    # Surface a rejected search here instead of returning an error body.
    r.raise_for_status()

    return r.json()


def umm_granule_size_in_bytes(granule: Mapping[str, Any]) -> int:
    """Return the size in bytes recorded for a UMM granule.

    Only the first ``ArchiveAndDistributionInformation`` entry under
    ``DataGranule`` is consulted.  Its ``SizeInBytes`` is used when present;
    otherwise ``Size`` (0 if absent) is multiplied by the factor for its
    ``SizeUnit``, using the ``SIZE_UNIT_DEFAULT`` factor when the unit is
    missing or unknown.  The value is passed through ``round`` before being
    returned.
    """
    archive_info = granule['DataGranule']['ArchiveAndDistributionInformation'][0]
    unit = archive_info.get('SizeUnit', SIZE_UNIT_DEFAULT)
    default_factor = SIZE_UNIT_FACTORS[SIZE_UNIT_DEFAULT]
    bytes_per_unit = SIZE_UNIT_FACTORS.get(unit, default_factor)
    # Computed unconditionally, even when SizeInBytes is available.
    scaled_size = archive_info.get('Size', 0) * bytes_per_unit

    return round(archive_info.get('SizeInBytes', scaled_size))

### MAAP Functions

In [20]:
@timecall
def find_granules(**kwargs: Any):
    """Search for granules through the shared ``maap`` client, timing the call.

    All keyword arguments are forwarded unchanged to ``maap.searchGranule``
    and its result is returned as-is.
    """
    results = maap.searchGranule(**kwargs)
    return results


def download_granule(dest_dir: str, *, overwrite=False) -> Callable[[Granule], None]:
    """Build a timed, single-argument downloader bound to ``dest_dir``.

    ``dest_dir`` is created (including parents) when this factory is called,
    not when the first granule is downloaded.

    Args:
        dest_dir: Directory handed to ``Granule.getData`` as the destination.
        overwrite: Passed positionally to ``Granule.getData`` as its second
            argument.

    Returns:
        A function that takes one granule and calls
        ``granule.getData(dest_dir, overwrite)``; each call is timed with
        ``timecall``.
    """
    @timecall
    def do_download_granule(granule: Granule) -> None:
        """Download ``granule`` into the bound destination directory."""
        granule.getData(dest_dir, overwrite)

    os.makedirs(dest_dir, exist_ok=True)

    return do_download_granule

## Subset Gabon Granules

### Get Gabon Geo Boundary

In [7]:
# Level-0 (ADM0) boundary for Gabon; downloaded on first use, then read from
# the cached file.
gabon_gdf = get_geo_boundary('GAB', 0)
# The same geometry as a GeoJSON string and as the dict parsed from it.
gabon_geojson = gabon_gdf.geometry.to_json()
gabon_geodict = json.loads(gabon_geojson)
# Last expression of the cell: displays the GeoDataFrame as the cell output.
gabon_gdf

Unnamed: 0,shapeName,shapeISO,shapeID,shapeGroup,shapeType,geometry
0,Gabon,GAB,GAB-ADM0-3_0_0-B1,GAB,ADM0,"MULTIPOLYGON (((8.83154 -0.92271, 8.83809 -0.9..."


### Get GEDI L4A Collection

In [8]:
# DOI used to look up the GEDI L4A collection.
gedi_l4a_doi = '10.3334/ORNLDAAC/1986'
# Lookup through the MAAP client.
gedi_l4a = maap.searchCollection(doi=gedi_l4a_doi, limit=1)[0]
gedi_l4a_concept_id = gedi_l4a['concept-id']

# NOTE(review): this second lookup (NASA CMR, UMM-JSON) rebinds both names, so
# the MAAP results above are discarded and only cost a request. Confirm
# whether the first lookup is still needed.
gedi_l4a = umm_find_collection(nasa_cmr_host, {'doi': gedi_l4a_doi})
gedi_l4a_concept_id = gedi_l4a['meta']['concept-id']

### Find GEDI L4A Granules within Gabon Bounding Box

In [9]:
# Search for GEDI L4A granules within the bounding box of the Gabon boundary.
granules = find_granules(
    cmr_host=nasa_cmr_host,
    collection_concept_id=gedi_l4a_concept_id,
    # The boundary's total bounds, joined into one comma-separated string.
    bounding_box=','.join(map(str, gabon_gdf.total_bounds)),
    # NOTE(review): presumably the maximum number of granules returned -
    # confirm against maap.searchGranule.
    limit=2000
)

print(f'Found {len(granules)} granules')


  find_granules (/tmp/ipykernel_3429/421580757.py:1):
    28.350 seconds



Found 1009 granules


In [10]:
# timecall(for_each)(download_granule('/projects/my-public-bucket/gedi-l4a/gabon'), granules)

In [11]:
# Python 3.9+

# import asyncio

# async def download_all_granules(dest_dir: str, granules: Iterator[Granule]) -> None:
#     await asyncio.gather(*(asyncio.to_thread(download_granule(dest_dir), granule) for granule in granules))

# # asyncio.run(download_all_granules('/projects/my-public-bucket/gedi-l4a/gabon'))
# await download_all_granules('/projects/my-public-bucket/gedi-l4a/gabon', granules[:10])

In [28]:
import concurrent.futures

@timecall
def download_all_granules(dest_dir: str, granules: Iterable[Granule]) -> None:
    """Download ``granules`` into ``dest_dir`` using 8 worker threads.

    Every granule is attempted even if some downloads fail; failures are
    reported once all downloads have finished.  The whole call is timed with
    ``timecall``.

    Args:
        dest_dir: Destination directory, passed to ``download_granule``.
        granules: Granules to download.

    Raises:
        RuntimeError: If at least one download raised.  The first failure
            seen is attached as the cause.
    """
    download = download_granule(dest_dir)
    failures = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(download, granule) for granule in granules]

        # Inspect every future: an exception raised in a worker thread is
        # stored on its future and is lost unless it is retrieved.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                failures.append(error)

    if failures:
        raise RuntimeError(
            f'{len(failures)} of {len(futures)} granule downloads failed'
        ) from failures[0]

In [21]:
download_all_granules('/projects/my-public-bucket/gedi-l4a/gabon', granules[629:700])


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    17.929 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    39.383 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    49.568 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    32.186 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    59.905 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    23.028 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    64.466 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    67.524 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    69.360 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    69.976 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    19.300 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    40.663 seconds


  do_download_g

In [44]:
def make_concurrent_map(max_workers, *, timeout=None):
    """Create a timed, ``map``-like function backed by a thread pool.

    Args:
        max_workers: Number of threads in the pool created for each call of
            the returned function.
        timeout: Optional number of seconds, forwarded to ``Executor.map``,
            after which waiting for results raises
            ``concurrent.futures.TimeoutError``.  ``None`` waits indefinitely.

    Returns:
        ``concurrent_map(fn, iterable)``, which applies ``fn`` to every item
        of ``iterable`` on the pool and returns the results as a list in
        input order.  An exception raised by ``fn`` propagates to the caller.
        Each call is timed with ``timecall``.
    """
    @timecall
    def concurrent_map(fn, iterable) -> Iterable:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # The iterator returned by Executor.map has to be consumed:
            # worker exceptions and the timeout only take effect when results
            # are retrieved from it.
            return list(executor.map(fn, iterable, timeout=timeout))

    return concurrent_map

download_granule_to_gabon_dir = download_granule('/projects/my-public-bucket/gedi-l4a/gabon')

In [43]:
make_concurrent_map(3)(download_granule_to_gabon_dir, granules[700:750])


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    10.918 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    18.776 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    19.381 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    8.608 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    7.448 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    8.445 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    16.594 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    14.521 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    14.910 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    7.226 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    8.332 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    8.292 seconds


  do_download_granule

In [None]:
make_concurrent_map(5)(download_granule_to_gabon_dir, granules[750:800])

In [None]:
make_concurrent_map(10)(download_granule_to_gabon_dir, granules[800:850])

In [None]:
make_concurrent_map(3)(download_granule_to_gabon_dir, granules[850:900])

In [None]:
make_concurrent_map(4)(download_granule_to_gabon_dir, granules[900:950])

In [None]:
make_concurrent_map(6)(download_granule_to_gabon_dir, granules[950:1000])

In [52]:
# No elapsed time is available for this 3-worker run, so it stays disabled:
# print(f'3 workers: {9790648775 / ??? / 1000 / 1000} MB/s')

# One (workers, bytes, seconds) triple per run, in the order the runs are
# reported; presumably total bytes downloaded and elapsed wall-clock seconds.
print(*(
    f'{workers} workers: {total_bytes / seconds / 1000 / 1000} MB/s'
    for workers, total_bytes, seconds in (
        (5, 10054004580, 184.119),
        (10, 10350974972, 227.481),
        (3, 11634422576, 232.740),
        (4, 12796077537, 213.866),
        (6, 10522743776, 208.35),
    )
), sep='\n')

5 workers: 54.6060133935118 MB/s
10 workers: 45.502591302130725 MB/s
3 workers: 49.98892573687376 MB/s
4 workers: 59.83221988067294 MB/s
6 workers: 50.50512971442284 MB/s


In [53]:
make_concurrent_map(4)(download_granule_to_gabon_dir, granules[1000:])


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    12.709 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    21.322 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    25.417 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    27.842 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    18.978 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    16.361 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    10.106 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    12.593 seconds


  do_download_granule (/tmp/ipykernel_3429/2505500122.py:9):
    15.621 seconds


  concurrent_map (/tmp/ipykernel_3429/1868722619.py:2):
    47.313 seconds



In [None]:
346610177
262161496
74701134
334355223
349997508
295214098
183213634
102799697
346888831