# Session 11: Data Processing

In [1]:
import os

# For parallel processing
import parsl
import parsl
from parsl import python_app
from parsl.config import Config
from parsl.channels import LocalChannel
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider

# helpers
from grouputils import initialize_rasterizer
from grouputils import plot_tiles

## The `RasterTiler` class

The purpose of the `RasterTiler` class is to read in tiled vector data (the output of staging) and create raster data from each vector tile.

![](https://raw.githubusercontent.com/PermafrostDiscoveryGateway/viz-raster/develop/docs/images/raster_tldr.png)

The `RasterTiler` takes the same config as the `TileStager` class, but we add a few more options for rasterization.

In [2]:
# Build the rasterizer for the example dataset using the course helper from
# grouputils (presumably returns a configured RasterTiler — confirm in grouputils).
# NOTE(review): this is a hardcoded absolute path into one user's home directory;
# anyone else running the notebook must point it at their own copy of the data.
# The same literal is repeated in the parallel-rasterization cell further down.
iwp_rasterizer = initialize_rasterizer("/home/jclark/example-data")

Here are explanations of all the options for each statistic (these can also be found in `help(pdgstaging.ConfigManager)`):

* name : `str` The name of the statistic. Can be anything but must be unique.

* weight_by : `'count' or 'area'` The weighting method for the statistic. Options
    are 'count' and 'area'. 'count' indicates that the statistic is calculated
    based on the number of polygons in each cell (location is identified by the
    centroid of the polygon). 'area' indicates that the statistic is calculated
    based on the area of the polygons that cover each cell.

* property : `str` The name of the property in the vector file to calculate the
    statistic for. Besides the properties that are available from the input
    vector data, the following keywords can be used:

  * 'centroids_per_pixel' : The number of polygons with centroids that fall in
      the cell/pixel. (Only available if weight_by is 'count')

  * 'area_within_pixel' : The area of the polygon that falls within a given
      cell/pixel, in the units of the CRS. (Only available if weight_by is
      'area')

  * 'area_per_pixel_area' : Same as 'area_within_pixel', but divided by the
      area of the cell/pixel. (Only available if weight_by is 'area')

* aggregation_method : `str` The function to be applied to the property. The
    vector data will first be grouped into cells, then the aggregation method
    will be used to summarize the given property in the cell. Method can be any
    method allowed in the 'func' parameter of the pandas `aggregate` method, e.g.
    'sum', 'count', 'mean', etc.

* resampling_method : `str` The resampling method to use when combining raster
    data from child tiles into parent tiles. See rasterio's Resampling Methods
    for list of the available methods.

* val_range : `list` A `[min, max]` pair of values for the statistic. This is
    used for consistency when mapping the color palette to the pixel values
    during web tile image generation. When the min or max value within a
    val_range is set to None, that value will be calculated separately for each
    z-level for which geotiffs are created.

* palette : `list of str` A list of colors to use for the color palette (for web-tiles)

* z_config : `dict` A dict of config options specific to each z-level. Currently,
    only setting a val_range is supported. Eventually, this could be used to
    set z-specific tile sizes and color palettes.


Like the `TileStager`, the `RasterTiler` can rasterize the tiles that we've created either individually, with `rasterize_vector(path)`, or as a batch.


The `RasterTiler` also has a method to rasterize an arbitrary bunch of tiles at once: `rasterize_vectors(paths)`.

In [4]:
# Let's look at the data we intend to rasterize: ask the rasterizer's tile
# path helper for the files under its 'staged' directory (presumably the
# vector tiles written by the staging step — confirm).
staged_paths = iwp_rasterizer.tiles.get_filenames_from_dir('staged')

There are 2249 tiles to rasterize, ranging in size from 96.0 kb to 15084.0 kb. The total size of the data to rasterize is 6.34 GB.


In [5]:
# The files are small so rasterizing one tile is pretty fast. Let's rasterize
# the first tile.

# rasterize_vector takes a single staged path; as the last expression in the
# cell, its return value is what gets displayed as the cell output.
iwp_rasterizer.rasterize_vector(staged_paths[0])

Tile(x=909, y=1059, z=13)

In [6]:
# However, since there are thousands of tiles to rasterize, it is much faster to
# rasterize them in parallel. Let's delete the files that we just created:
# NOTE(review): the config values are interpolated into shell commands unquoted,
# so a path containing spaces or shell metacharacters would break (or change)
# the command, and os.system reports failure only through its return code.
# Consider shutil.rmtree / os.remove if this becomes load-bearing.
# Recursively remove the directory stored under the "dir_geotiff" config key.
os.system(f'rm -rf {iwp_rasterizer.config.get("dir_geotiff")}')
# Remove the file stored under "filename_rasterization_events".
os.system(f'rm {iwp_rasterizer.config.get("filename_rasterization_events")}')
# Remove the file stored under "filename_rasters_summary". The exit status of
# this last command is what the cell displays as its output.
os.system(f'rm {iwp_rasterizer.config.get("filename_rasters_summary")}')

0

## Rasterize in Parallel

In [7]:
# Because rasterization is relatively quick, we want each parsl "task" to process a batch of tiles.
def make_batch(items, batch_size):
    """
    Create batches of a given size from a list of items.

    Parameters
    ----------
    items : list
        The items to split up. Any sequence that supports len() and slicing
        works.
    batch_size : int
        The maximum number of items per batch. Must be at least 1.

    Returns
    -------
    list
        Consecutive slices of `items`, in their original order. Every batch
        holds `batch_size` items except possibly the last, which holds the
        remainder. An empty `items` gives an empty list.

    Raises
    ------
    ValueError
        If `batch_size` is less than 1.
    """
    # Without this check a negative batch_size makes range() yield nothing, so
    # every item would be dropped silently, and a batch_size of 0 would surface
    # as an unrelated-looking range() error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

In [8]:
# We can try 50 tiles at a time.
batch_size = 50
# Each element of `batches` is a slice of up to `batch_size` staged paths; the
# last batch holds whatever is left over.
batches = make_batch(staged_paths, batch_size)

There are 45 batches to rasterize, each batch has a max of 50 tiles.


In [11]:
# Set up Parsl and logging again:
# Shell command passed to the provider as `worker_init` below; presumably run
# before the workers start so they use the 'scomp' virtualenv — confirm.
activate_env = 'workon scomp'
# A single high-throughput executor running on the local machine.
# NOTE(review): newer Parsl releases deprecate `max_workers` (in favour of
# `max_workers_per_node`) and channels such as LocalChannel — confirm against
# the Parsl version this course environment pins.
htex_local = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_local",
            worker_debug=False,
            cores_per_worker=1,
            max_workers=26,
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=1,
                max_blocks=20,
                worker_init=activate_env
            )
        )
    ],
)
# Clear any configuration left over from an earlier cell/run before loading the
# new one. The object returned by parsl.load is displayed as the cell output.
parsl.clear()
parsl.load(htex_local)

<parsl.dataflow.dflow.DataFlowKernel at 0x7fc96147dc10>

In [12]:
# Make a Parsl app that uses the rasterize_vectors method
@python_app
def rasterize(staged_paths, rasterizer):
    """
    Parsl app that rasterizes one batch of staged vector files.

    Parameters
    ----------
    staged_paths : list
        The paths of the staged vector files that make up this batch.
    rasterizer : object
        The rasterizer whose `rasterize_vectors` method does the work
        (an `iwp_rasterizer` instance in this notebook).

    Returns
    -------
    Whatever `rasterizer.rasterize_vectors` returns for the batch. The call is
    made with `make_parents=False` (presumably so that parent tiles are not
    built inside each task — confirm against the RasterTiler docs).
    """
    batch_result = rasterizer.rasterize_vectors(staged_paths, make_parents=False)
    return batch_result

In [13]:
# Rasterize the batches in parallel

try:
    iwp_rasterizer = initialize_rasterizer("/home/jclark/example-data")

    # Submit every batch up front; each call hands back a future for that batch.
    app_futures = []
    for batch in batches:
        app_future = rasterize(batch, iwp_rasterizer)
        app_futures.append(app_future)

    # Don't continue until all tiles have been rasterized. A plain loop is used
    # because the results are not needed here, only the waiting; .result() also
    # re-raises any exception from a failed batch.
    for app_future in app_futures:
        app_future.result()
finally:
    # Always stop the executor and clear the Parsl configuration, even when a
    # batch fails, so workers are not left running after an error.
    htex_local.executors[0].shutdown()
    parsl.clear()

In [14]:
# Now we should have just as many GeoTIFF files as we do vector tiles.
# Let's check that.
geotiff_paths = iwp_rasterizer.tiles.get_filenames_from_dir('geotiff')
print(len(geotiff_paths))

# Make the check explicit instead of relying on the reader to compare the
# printed number with the staged tile count by eye.
assert len(geotiff_paths) == len(staged_paths), (
    f"expected {len(staged_paths)} GeoTIFF files, found {len(geotiff_paths)}"
)


2249
