In [None]:
# To use multi-processing, please install the following package
# The pros on using pymp over default multiprocessing is that the default one require prepare the data before hand
# for instance, when building the index with dggrid4py, it requries a geo pandas dataframe which we have to build either a big dataframe
# then trunck it for multi process (cost time) or build individual dataframe (cost memory)
# For pymp, the dataframe can be create at each process start then throw it away, so the max dataframe concurrent exists is the number of multiprocess. 
!pip install pymp-pypi
!pip install geoviews

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

# To query from STAC, clone the source from https://gitlab.ut.ee/geog/lgeo_datacube.git
# and adjust the paths below if your checkout lives elsewhere.
# Use the sys module directly: `os.sys` only works because os happens to import sys internally.
import sys
sys.path.append('../')
sys.path.append('../../lgeo_datacube/apps/xarraySTAC')
from STACEntrypoint import STACEntrypoint
import xdggs

import numpy as np
import xarray as xr
import zarr
import rasterio
import time 

import shapely
# Environment variable setup for dggrid.
# setdefault keeps a DGGRID_PATH that is already exported in the environment and only falls back to
# this machine-specific location when none is set; adjust the fallback for your own installation.
os.environ.setdefault('DGGRID_PATH', '/home/dick/micromamba/envs/geo/bin/dggrid')
# The resolution to use when generating cell ids; -1 == auto (by approximation of the sphere's surface integral)
resolution = -1

### Query the asset from STAC and open it as xarray

In [2]:
# STAC search parameters
searchpara = {
    'max_items': 100,
    'collections': ['esa-cci-2004-clipped-demo'],
}
# Parameters forwarded to xarray's open_dataset; chunks='auto' opens the data with dask
openpara = {
    'chunks': 'auto',
}
# If the assets require auth: path to the GCP credential JSON file, active only inside this block
with rasterio.Env(GOOGLE_APPLICATION_CREDENTIALS='../../lgeo_datacube/apps/stac_catalog/config_dev/glomodat-stac-testing-svc.json'):
    # Chain the call so that `stacdataset` only ever names the opened result,
    # never the intermediate STACEntrypoint helper object.
    stacdataset = STACEntrypoint().open_dataset(
        'https://maps.landscape-geoinformatics.org/stac',
        stacsearchparam=searchpara,
        opendatasetpara=openpara,
    )

Landscape Geoinformatics Lab (University of Tartu) pygeoapi
gs://hytruck/original/esa-cci-2004-clipped-demo/esa_cci_2004_clipped_resample_4326.tif


In [3]:
# Names of the collections returned by the STAC query
findcollections = [*stacdataset.keys()]
print(findcollections)

['esa-cci-2004-clipped-demo']


In [4]:
# Print the id of every item found in the first collection
for item in stacdataset[findcollections[0]]['items']:
    print(item.attrs['id'])

esa_cci_2004_clipped_resample_4326.tif


In [5]:
# Work with the first item of the first collection from here on
first_collection = findcollections[0]
dataset = stacdataset[first_collection]['items'][0]
dataset

### Prepare to convert index from lon,lat to cell id
There are two ways to generate DGGS cell ids. Currently the library only supports fixed coordinate names, namely `lat` and `lon`.

   **Method A: Set as xindex**
   - 1. Rename (x, y) to (lon, lat)
   - 2. Stack over lat and lon; the shape of the dataset becomes N x 1, where N = (x*y)
   - 3. Create a variable with data in tuple format, e.g. (lon, lat)
   - 4. Set the parameters required for creating the DGGS index in the newly created variable's attrs
   - 5. Finally, set the new variable as the index with set_xindex
    
   **Method B: Set index when performing stack**
   - 1. Rename (x, y) to (lon, lat)
   - 2. Set the parameters required for creating the DGGS index in either the 'lat' or the 'lon' variable's attrs
   - 3. Finally, create the index while stacking: dataset.stack(cell_ids=['lat','lon'],index_cls=xdggs.DGGSIndex)

### Method A

In [28]:
# Step 1: rename the raster axes to the coordinate names the DGGS index expects.
dataset_methodA = dataset.rename({'x':'lon','y':'lat'})
print('Step 1 completed')

# Step 2: stack lat/lon into a single 'cell_ids' dimension.
# time.perf_counter is the clock intended for measuring elapsed time (time.time can jump with clock adjustments).
start = time.perf_counter()
dataset_methodA = dataset_methodA.stack(cell_ids=['lat','lon'])
end = time.perf_counter()
print(f'Step 2 completed {end-start}')

# Step 3: materialise the stacked coordinate tuples and assign them back as the 'cell_ids'
# coordinate over the dims ('lat', 'lon').
start = time.perf_counter()
dataset_methodA = dataset_methodA.assign_coords({'cell_ids': (('lat','lon'), list(dataset_methodA.cell_ids.data))})
end = time.perf_counter()
print(f'Step 3 completed {end-start}')

# Step 4: attach the parameters used for creating the DGGS index to the variable's attrs.
dataset_methodA['cell_ids'].attrs = {
    'grid_name': 'isea',
    'resolution': resolution,
    'aperture': 7,
    'topology': 'h',
    'mp': 12,          # presumably the number of worker processes — TODO confirm
    'trunk': 250000,   # presumably the number of points handled per chunk — TODO confirm
    'epsg': dataset.attrs['assets']['gsdata']['proj:epsg']['epsg'],
}
print('Step 4 completed')

# Step 5: build the DGGS index on 'cell_ids'.
start = time.perf_counter()
dataset_methodA = dataset_methodA.set_xindex('cell_ids', xdggs.DGGSIndex)
end = time.perf_counter()
print(f'Step 5 completed {end-start}')

# Performance notes for Method A:
#
# Pixel resolution x: 9816, y: 3663
#   Steps 2 and 3 use roughly 6GB RAM. The conversion took ~7 min with 12 cores and ~32 min with a single core.
#
# Pixel resolution x: 43200, y: 21600
#   Step 2 uses roughly 24GB RAM while processing; that memory is released once it has finished.
#   After that, accessing cell_ids (e.g. in Step 3) makes the memory usage blow up to 29GB
#   (the maximum memory of the notebook) and it keeps consuming page memory.

Step 1 completed
Step 2 completed 1.070533037185669
Step 3 completed 8.206407070159912
Step 4 completed
Data type : float64 , shape : (35956008, 2)
Create index from lat,lon
Total Bounds: [-54.16980391  76.98571501  81.51918143  77.33133137]
Total Bounds Area (km^2): 129155.31541061947
Area per center point (km^2): 0.5166212616424779
Auto resolution : 10
Step 3 completed 8.206407070159912
Step 4 completed
Data type : float64 , shape : (35956008, 2)
Create index from lat,lon
Total Bounds: [-54.16980391  76.98571501  81.51918143  77.33133137]
Total Bounds Area (km^2): 129155.31541061947
Area per center point (km^2): 0.5166212616424779
Auto resolution : 10
Step 3 completed 8.206407070159912
Step 4 completed
Data type : float64 , shape : (35956008, 2)
Create index from lat,lon
Total Bounds: [-54.16980391  76.98571501  81.51918143  77.33133137]
Total Bounds Area (km^2): 129155.31541061947
Area per center point (km^2): 0.5166212616424779
Auto resolution : 10
Step 3 completed 8.20640707015991

  0%|                                                                                                                                                                  | 0/12 [00:00<?, ?it/s]

Step 3 completed 8.206407070159912
Step 4 completed
Data type : float64 , shape : (35956008, 2)
Create index from lat,lon
Total Bounds: [-54.16980391  76.98571501  81.51918143  77.33133137]
Total Bounds Area (km^2): 129155.31541061947
Area per center point (km^2): 0.5166212616424779
Auto resolution : 10


  0%|                                                                                                                                                                  | 0/12 [00:08<?, ?it/s]
  0%|                                                                                                                                                                  | 0/12 [00:08<?, ?it/s]
  0%|                                                                                                                                                                  | 0/12 [00:08<?, ?it/s]
  0%|                                                                                                                                                                  | 0/12 [00:08<?, ?it/s]
  0%|                                                                                                                                                                  | 0/12 [00:08<?, ?it/s]
  0%|                                        

KeyboardInterrupt: 

In [7]:
# Drop the 'lon' and 'lat' dimensions together with the variables defined on them, then show the result.
dataset_methodA = dataset_methodA.drop_dims(['lon', 'lat'])
dataset_methodA

In [10]:
# Count the distinct cell ids (compare with the total number of stacked pixels)
np.unique(dataset_methodA['cell_ids'].data).shape

(35632817,)

In [8]:
# Save to zarr with compression (Blosc with zstd at level 3; shuffle=2 is bit-shuffle in numcodecs' Blosc).
compressor = zarr.Blosc(cname="zstd", clevel=3, shuffle=2)
# The f-string is double-quoted on purpose: reusing single quotes inside a single-quoted f-string
# is a SyntaxError on Python versions before 3.12.
dataset_methodA.to_zarr(f"{dataset.attrs['id']}_methodA.zarr", encoding={"band_data": {"compressor": compressor}})

<xarray.backends.zarr.ZarrStore at 0x7f43653c2140>

In [17]:
# Load the Method A result back from its Zarr store (the DGGS index is re-created in a later cell).
zarr_MethodA = xr.open_zarr(
    'esa_cci_2004_clipped_resample_4326.tif_methodA.zarr/'
)

In [18]:
# Display the dataset as reopened from Zarr.
zarr_MethodA

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(1, 35956008)","(1, 140454)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 274.32 MiB 1.07 MiB Shape (1, 35956008) (1, 140454) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",35956008  1,

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(1, 35956008)","(1, 140454)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [19]:
# Replace the default index on 'cell_ids' with a DGGS index, then show the result.
zarr_MethodA = (
    zarr_MethodA
    .drop_indexes('cell_ids')
    .set_xindex('cell_ids', xdggs.DGGSIndex)
)
zarr_MethodA

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(1, 35956008)","(1, 140454)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 274.32 MiB 1.07 MiB Shape (1, 35956008) (1, 140454) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",35956008  1,

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(1, 35956008)","(1, 140454)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [20]:
# Set the grid resolution on the rebuilt index by hand; 10 matches the "Auto resolution : 10"
# printed while the index was originally built.
# NOTE(review): this writes a private attribute of the index object — presumably because the resolution
# is not restored when the index is rebuilt from the Zarr store; confirm against the DGGSIndex implementation.
zarr_MethodA.xindexes.get('cell_ids')._resolution = 10

In [21]:
# Read the private attribute back to confirm the assignment took effect.
zarr_MethodA.xindexes.get('cell_ids')._resolution 

10

In [25]:
# Some (lon, lat) reference points:
#[(-54.169803910617055, 77.33133137161553),
#       (-54.169803910617055, 77.31750671697031),
#       (-54.169803910617055, 77.3036820623251) ]
# Alternative query values: (longitude=[28.4,20.2],latitude=[57,60])
# NOTE(review): the values passed as `longitude` below (77.33..., 77.30...) are the SECOND element of the
# (lon, lat) reference pairs above, and the value passed as `latitude` (-54.17...) is the FIRST element,
# i.e. the two keywords look swapped relative to the reference points. Presumably this compensates for the
# index having been built from stacked (lat, lon) pairs — confirm against the sel_latlon implementation.
zarr_MethodA.dggs.sel_latlon(longitude=[77.33133137161553,77.3036820623251],latitude=[-54.169803910617055])

[[ 77.33133137 -54.16980391]
 [ 77.30368206 -54.16980391]]
cells id: {'cell_ids': array([1223378460, 1223260805])} <class 'dict'>
[False False False ... False False False]


Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(1, 2)","(1, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 16 B 16 B Shape (1, 2) (1, 2) Dask graph 1 chunks in 3 graph layers Data type float64 numpy.ndarray",2  1,

Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(1, 2)","(1, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [26]:
# Query polygon in GeoJSON form: a single closed ring (the first vertex is repeated at the end).
geojson = {
    "coordinates": [[
        [-2.4609375, 56.5804494],
        [-1.7578125, 51.2053036],
        [3.6914063, 51.5343963],
        [3.1640625, 57.0612533],
        [-2.4609375, 56.2889971],
        [-2.4609375, 56.5804494],
    ]],
    "type": "Polygon",
}
# Query the dataset with the polygon; '4326' is presumably the EPSG code of the polygon coordinates.
zarr_MethodA.dggs.dggrid_polygon_for_extent(geojson, '4326')

cells id: {'cell_ids': array([291936822, 291953627, 291953628, ..., 564998729, 564998730,
       565015536])} <class 'dict'>
[False False False ... False False False]


Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(1, 159632)","(1, 6586)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.22 MiB 51.45 kiB Shape (1, 159632) (1, 6586) Dask graph 31 chunks in 3 graph layers Data type float64 numpy.ndarray",159632  1,

Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(1, 159632)","(1, 6586)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


### Method B

Method B is an experimental method that tries to overcome the memory problem at large pixel resolutions, for instance the issue mentioned above in Step 3 of Method A.
In our experiments, Method B used around 20GB RAM for (x: 43200, y: 21600), plus 20GB of page memory. The total index calculation time is around 2 hours with 12 cores.

Furthermore, at resolution (x: 9816, y: 3663), using xarray broadcast to create the lon,lat pairs is much faster (~0.09s) compared to Method A (Step 2, ~2s).

In [None]:
# Step 1: rename the raster axes to the coordinate names the DGGS index expects.
dataset_methodB = dataset.rename({'x':'lon','y':'lat'})
print('Step 1 completed')

# Step 2: attach the parameters used for creating the DGGS index to the 'lat' coordinate's attrs.
dataset_methodB['lat'].attrs = {
    'grid_name': 'isea',
    'resolution': resolution,
    'aperture': 7,
    'topology': 'h',
    'mp': 12,          # presumably the number of worker processes — TODO confirm
    'trunk': 250000,   # presumably the number of points handled per chunk — TODO confirm
    'epsg': dataset.attrs['assets']['gsdata']['proj:epsg']['epsg'],
}
print('Step 2 completed')

# Step 3: stack lat/lon and build the DGGS index in the same call.
# time.perf_counter is the clock intended for measuring elapsed time (time.time can jump with clock adjustments).
start = time.perf_counter()
dataset_methodB = dataset_methodB.stack(cell_ids=['lat','lon'], index_cls=xdggs.DGGSIndex)
end = time.perf_counter()
print(f'Step 3 completed {end-start}')

In [7]:
# Display the dataset indexed with Method B.
dataset_methodB

In [9]:
# Count the distinct cell ids produced by Method B
np.unique(dataset_methodB['cell_ids'].data).shape

(35632817,)

In [None]:
# Save to zarr with compression (Blosc with zstd at level 3; shuffle=2 is bit-shuffle in numcodecs' Blosc).
compressor = zarr.Blosc(cname="zstd", clevel=3, shuffle=2)
# The f-string is double-quoted on purpose: reusing single quotes inside a single-quoted f-string
# is a SyntaxError on Python versions before 3.12.
dataset_methodB.to_zarr(f"{dataset.attrs['id']}_methodB.zarr", encoding={"band_data": {"compressor": compressor}})

<xarray.backends.zarr.ZarrStore at 0x7fd963febd40>

In [10]:
# Load the Method B result back from its Zarr store.
zarr_MethodB = xr.open_zarr(
    'esa_cci_2004_clipped_resample_4326.tif_methodB.zarr'
)

In [12]:
# Replace the default index on 'cell_ids' with a DGGS index.
zarr_MethodB = (
    zarr_MethodB
    .drop_indexes('cell_ids')
    .set_xindex('cell_ids', xdggs.DGGSIndex)
)
# Set the grid resolution on the rebuilt index by hand (10 matches the "Auto resolution : 10"
# printed in the Method A build output).
zarr_MethodB.xindexes.get('cell_ids')._resolution = 10

In [13]:
# Display the reopened and re-indexed Method B dataset.
zarr_MethodB

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(35956008,)","(140454,)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 274.32 MiB 1.07 MiB Shape (35956008,) (140454,) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",35956008  1,

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(35956008,)","(140454,)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(35956008,)","(140454,)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 274.32 MiB 1.07 MiB Shape (35956008,) (140454,) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",35956008  1,

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(35956008,)","(140454,)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(1, 35956008)","(1, 140454)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 274.32 MiB 1.07 MiB Shape (1, 35956008) (1, 140454) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",35956008  1,

Unnamed: 0,Array,Chunk
Bytes,274.32 MiB,1.07 MiB
Shape,"(1, 35956008)","(1, 140454)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [15]:
# Some (lon, lat) reference points:
#[(-54.169803910617055, 77.33133137161553),
#       (-54.169803910617055, 77.31750671697031),
#       (-54.169803910617055, 77.3036820623251) ]

# NOTE(review): the values passed as `longitude` below (77.33..., 77.30...) are the SECOND element of the
# (lon, lat) reference pairs above, and the value passed as `latitude` (-54.17...) is the FIRST element,
# i.e. the two keywords look swapped relative to the reference points. Presumably this compensates for the
# index having been built from stacked (lat, lon) pairs — confirm against the sel_latlon implementation.
zarr_MethodB.dggs.sel_latlon(longitude=[77.33133137161553,77.3036820623251],latitude=[-54.169803910617055])

[[ 77.33133137 -54.16980391]
 [ 77.30368206 -54.16980391]]
cells id: {'cell_ids': array([1223378460, 1223260805])} <class 'dict'>
[False False False ... False False False]


Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 16 B 16 B Shape (2,) (2,) Dask graph 1 chunks in 3 graph layers Data type float64 numpy.ndarray",2  1,

Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 16 B 16 B Shape (2,) (2,) Dask graph 1 chunks in 3 graph layers Data type float64 numpy.ndarray",2  1,

Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(1, 2)","(1, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 16 B 16 B Shape (1, 2) (1, 2) Dask graph 1 chunks in 3 graph layers Data type float64 numpy.ndarray",2  1,

Unnamed: 0,Array,Chunk
Bytes,16 B,16 B
Shape,"(1, 2)","(1, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [16]:
# Query polygon in GeoJSON form: a single closed ring (the first vertex is repeated at the end).
geojson = {
    "coordinates": [[
        [-2.4609375, 56.5804494],
        [-1.7578125, 51.2053036],
        [3.6914063, 51.5343963],
        [3.1640625, 57.0612533],
        [-2.4609375, 56.2889971],
        [-2.4609375, 56.5804494],
    ]],
    "type": "Polygon",
}
# Query the dataset with the polygon; '4326' is presumably the EPSG code of the polygon coordinates.
zarr_MethodB.dggs.dggrid_polygon_for_extent(geojson, '4326')

cells id: {'cell_ids': array([291936822, 291953627, 291953628, ..., 564998729, 564998730,
       565015536])} <class 'dict'>
[False False False ... False False False]


Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(159632,)","(6586,)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.22 MiB 51.45 kiB Shape (159632,) (6586,) Dask graph 31 chunks in 3 graph layers Data type float64 numpy.ndarray",159632  1,

Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(159632,)","(6586,)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(159632,)","(6586,)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.22 MiB 51.45 kiB Shape (159632,) (6586,) Dask graph 31 chunks in 3 graph layers Data type float64 numpy.ndarray",159632  1,

Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(159632,)","(6586,)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(1, 159632)","(1, 6586)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.22 MiB 51.45 kiB Shape (1, 159632) (1, 6586) Dask graph 31 chunks in 3 graph layers Data type float64 numpy.ndarray",159632  1,

Unnamed: 0,Array,Chunk
Bytes,1.22 MiB,51.45 kiB
Shape,"(1, 159632)","(1, 6586)"
Dask graph,31 chunks in 3 graph layers,31 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
