## Generate Geoparquet Data Samples

1. Convert the ATL03 samples (whole granules) to GeoParquet
2. Use SlideRule with its GeoParquet export option to download the granules by ID
3. Save the results to the common S3 bucket

This notebook follows the ICESat-2 Hackweek 2023 tutorial: https://icesat-2-2023.hackweek.io/tutorials/sliderule/parquet-s3.html

In [41]:
import os
os.environ['USE_PYGEOS'] = '0'
from sliderule import icesat2, earthdata, io
import boto3

In [10]:

# Discover the granule-ID format SlideRule expects: run a small spatial and
# temporal CMR query for ATL03 and look at the first resource it returns.
box_lon = [-105, -105, -100, -100, -105]  # first vertex repeated last
box_lat = [-75, -77.5, -77.5, -75, -75]
poly = io.to_region(box_lon, box_lat)

resources = earthdata.cmr(
    short_name='ATL03',
    polygon=poly,
    time_start='2018-10-19',
    time_end='2018-10-20',
)
granule = resources[0]

# Display the ID so its naming pattern is visible in the cell output.
granule

'ATL03_20181019224323_03250112_005_01.h5'

In [12]:
# List the source HDF5 granules stored under the bucket's original/ prefix.
!aws s3 ls s3://nasa-cryo-scratch/h5cloud/original/

2023-08-08 23:45:34 7754735138 ATL03_20181120182818_08110112_006_02.h5
2023-08-08 23:47:04 6997123664 ATL03_20190219140808_08110212_006_02.h5
2023-08-08 23:47:04 6925710500 ATL03_20200217204710_08110612_006_01.h5
2023-08-08 23:47:04 8392279594 ATL03_20211114142614_08111312_006_01.h5
2023-08-08 23:47:04 7954039827 ATL03_20230211164520_08111812_006_01.h5


In [4]:
# Initialize the sliderule icesat2 module, passing the slideruleearth.io domain.
icesat2.init("slideruleearth.io")

In [39]:
# ATL03 granule IDs to convert; these are the file names listed under
# s3://nasa-cryo-scratch/h5cloud/original/.
granules = [
    'ATL03_20181120182818_08110112_006_02.h5',
    'ATL03_20190219140808_08110212_006_02.h5',
    'ATL03_20200217204710_08110612_006_01.h5',
    'ATL03_20211114142614_08111312_006_01.h5',
    'ATL03_20230211164520_08111812_006_01.h5',
]

In [42]:
# CryoCloud-specific bucket permissions: trade the web identity token for
# temporary AWS credentials via STS. The role, session name and token file
# location are all read from environment variables.
client = boto3.client('sts')

with open(os.environ['AWS_WEB_IDENTITY_TOKEN_FILE']) as token_file:
    TOKEN = token_file.read()

# Request arguments gathered in one place; DurationSeconds asks for a
# one-hour session.
assume_role_args = {
    'RoleArn': os.environ['AWS_ROLE_ARN'],
    'RoleSessionName': os.environ['JUPYTERHUB_CLIENT_ID'],
    'WebIdentityToken': TOKEN,
    'DurationSeconds': 3600,
}
response = client.assume_role_with_web_identity(**assume_role_args)

# Pull the three credential fields out of the response once.
creds = response['Credentials']
ACCESS_KEY_ID = creds['AccessKeyId']
SECRET_ACCESS_KEY_ID = creds['SecretAccessKey']
SESSION_TOKEN = creds['SessionToken']


In [47]:
def get_gpq(granule, asset="icesat2",
            output_prefix="s3://nasa-cryo-scratch/h5cloud/geoparquet",
            region="us-west-2"):
    """Run an ATL03 SlideRule request for one granule with parquet output.

    The output is written to ``<output_prefix>/<granule>.gpq``. The AWS
    credentials placed in the request are read, at call time, from the
    module-level ``ACCESS_KEY_ID``, ``SECRET_ACCESS_KEY_ID`` and
    ``SESSION_TOKEN`` variables, so those must be defined before calling.

    Parameters
    ----------
    granule : str
        Granule file name, e.g. ``"ATL03_20181120182818_08110112_006_02.h5"``.
    asset : str, optional
        Asset name passed to ``icesat2.atl03s``.
    output_prefix : str, optional
        Destination prefix for the output file; a trailing ``/`` is ignored.
    region : str, optional
        Value for the ``region`` field of the output settings.

    Returns
    -------
    Whatever ``icesat2.atl03s`` returns. In the runs recorded in this
    notebook that is the output path string.

    Raises
    ------
    ValueError
        If ``granule`` is empty, which would otherwise target a file named
        just ``.gpq``.
    """
    if not granule:
        raise ValueError("granule must be a non-empty granule file name")

    output = f"{output_prefix.rstrip('/')}/{granule}.gpq"
    params = {
        "output": {
            "path": output,
            "format": "parquet",
            "open_on_complete": False,
            "region": region,
            "credentials": {
                "aws_access_key_id": ACCESS_KEY_ID,
                "aws_secret_access_key": SECRET_ACCESS_KEY_ID,
                "aws_session_token": SESSION_TOKEN,
            },
        }
    }
    return icesat2.atl03s(parm=params, resource=granule, asset=asset)

In [20]:
%%time
status = icesat2.atl03s(params, granule, asset=asset) 

CPU times: user 53.7 ms, sys: 4.1 ms, total: 57.8 ms
Wall time: 3min 58s


In [None]:
%%time
# Export every granule except the first, printing each value get_gpq returns
# as soon as that granule finishes.
for granule in granules[1:]:
    result = get_gpq(granule)
    print(result)

s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20190219140808_08110212_006_02.h5.gpq
s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20200217204710_08110612_006_01.h5.gpq


In [38]:
# Show which granules the batch loop iterates over (all but the first).
granules[1:]

['ATL03_20190219140808_08110212_006_02.h5',
 'ATL03_20200217204710_08110612_006_01.h5',
 'ATL03_20211114142614_08111312_006_01.h5',
 'ATL03_20230211164520_08111812_006_01.h5']

In [21]:
# Show the value returned by the single-granule run.
status

's3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20181120182818_08110112_006_02.h5.gpq'