In [None]:
import gc
import glob
import os

import rioxarray as rio
from joblib import Parallel, delayed
from rasterio.enums import Resampling
from rioxarray.merge import merge_arrays
from tqdm.notebook import tqdm

# from WBT.whitebox_tools import WhiteboxTools

# Reprojecting and merging the Norwegian national 10 m terrain dataset

***Note:** This notebook must be run on a machine with plenty of RAM. If the data are processed as 16-bit integers, peak memory consumption is around 120 GB. For 32-bit floats (recommended) it's 240 GB, which will crash the "El Gordo" machine. The 1 TB machine handles everything easily.*

For hydrological calculations we need a good quality DEM. Kartverket's **10 m DTM** is probably the best choice in most cases: the 1 m data is not available everywhere and is unnecessarily detailed, and a DTM is better than a DOM for hydrology (see [here](https://hoydedata.no/LaserInnsyn/help_no/index.htm?context=130)). The 10 m DTM can be downloaded in 50 x 50 km tiles, split across three UTM zones:

 * [UTM zone 32](https://kartkatalog.geonorge.no/metadata/dtm-10-terrengmodell-utm32/fd851873-f363-46f9-9fc6-bb1b403575df)
 * [UTM zone 33](https://kartkatalog.geonorge.no/metadata/dtm-10-terrengmodell-utm33/dddbb667-1303-4ac5-8640-7ec04c0e3918)
 * [UTM zone 35](https://kartkatalog.geonorge.no/metadata/dtm-10-terrengmodell-utm35/294c21ca-eb83-49b9-8861-bb1595ce8768)
 
This notebook first reprojects each tile to UTM Zone 33N (EPSG 25833) and then merges the reprojected tiles into a single dataset for the entire country. The output GeoTIFFs are compressed using the LZW algorithm to save space.

The national 10 m dataset for the whole country is then downsampled using bilinear interpolation to create 20 m and 40 m resolution versions. These will be useful for testing and they should also be sufficient for catchment delineation by themselves.

**Note:** a **conformal projection** (such as EPSG 3395; WGS 84 World Mercator) is best for watershed delineation, but UTM Zone 33N will probably be good enough here.

**Note 2:** I originally tried converting all the grids to 16-bit integer type as, with compression, this dramatically reduces disk space and memory requirements. However, it also leads to difficulties resolving flow direction in flat areas, and it's ultimately more trouble than it's worth. I recommend preserving the original 32-bit floats where possible.

## 1. Adding large files to the JupyterHub

Usually it is possible to upload files to the Hub by dragging and dropping from your local file system via your browser. However, for very large files (multiple GB), a faster and more robust solution is to use `gdown` and Google Drive.

The 10 m DTM tiles for each UTM zone are large. First, zip the files you want to upload (make sure you create a .zip archive and not a .7z). Then upload the zip to your Google Drive and make the file public (`Sharing > Anyone with the link can view`) and copy the sharable link. From a terminal on JupyterHub, cd into the folder where you want to add the file and run the following:

    gdown SHARABLE_LINK_COPIED_FROM_GOOGLE_DRIVE --fuzzy
    
This should quickly add the data to the Hub. You can unzip the file from the command line using `unzip filename.zip`.

## 2. User input

In [None]:
# UTM zones whose raw tiles should be processed
zone_list = [32, 33, 35]

# Target grid for the reprojected tiles and the national mosaic
dst_crs = "EPSG:25833"
bbox = (
    -80000,  # xmin
    6440000,  # ymin
    1122000,  # xmax
    7950000,  # ymax
)
# Folder that receives the reprojected tiles (and is read again when merging)
dst_fold = "/home/jovyan/shared/01_datasets/spatial/dtm10_proj_utm33"
no_data_val = -32767
# Rasterio dtypes: https://test2.biogeo.ucdavis.edu/rasterio/_modules/rasterio/dtypes.html
dst_dtype = "float32"
dst_res = 10

# If True, values <= 0 are replaced by NaN before reprojecting
neg_to_nan = False

# Output rasters (merged 10 m mosaic plus the two downsampled versions)
nor_10m_dtm = "/home/jovyan/shared/01_datasets/spatial/dtm_merged_utm33/dtm_10m/norway_kartverket_10m_dtm_utm_z33.tif"
nor_20m_dtm = "/home/jovyan/shared/01_datasets/spatial/dtm_merged_utm33/dtm_20m/norway_kartverket_20m_dtm_utm_z33.tif"
nor_40m_dtm = "/home/jovyan/shared/01_datasets/spatial/dtm_merged_utm33/dtm_40m/norway_kartverket_40m_dtm_utm_z33.tif"

# Number of workers used if you choose to run section 3 in parallel
n_jobs = 35

## 3. Reproject to UTM Zone 33N

The cell below processes data sequentially. Good for machines with "smaller" amounts of memory (e.g. 240 GB).

In [None]:
# Sequential alternative to the parallel cell further down: the same per-tile
# steps (open, optional masking of values <= 0, reproject, save as LZW GeoTIFF),
# but one tile at a time. Left commented out; uncomment to use it instead of
# the parallel version.

# for zone in tqdm(zone_list, desc="Looping over zones"):
#     search_path = f"/home/jovyan/shared/01_datasets/spatial/dtm_10_raw/utm_{zone}/*.tif"
#     flist = sorted(glob.glob(search_path))
#     dst_list = [os.path.join(dst_fold, os.path.split(fname)[1]) for fname in flist]

#     for idx, src_path in enumerate(tqdm(flist, desc="Looping over files")):
#         dst_path = dst_list[idx]
#         rds = rio.open_rasterio(src_path, mask_and_scale=True)

#         if neg_to_nan:
#             rds = rds.where(rds > 0)

#         rds.rio.write_nodata(no_data_val, encoded=True, inplace=True)
#         rds = rds.rio.reproject(
#             dst_crs,
#             resolution=dst_res,
#             nodata=no_data_val,
#             resampling=Resampling.bilinear,
#         )
#         rds.rio.to_raster(dst_path, compress="lzw", BIGTIFF="IF_SAFER", dtype=dst_dtype)
#         rds.close()
#         del rds
#         gc.collect()

On machines with lots of memory (e.g. 1 TB), the following parallel version is faster.

In [None]:
%%time


def reproject(src_path, dst_path, neg_to_nan, no_data_val, dst_crs, dst_res, dst_dtype):
    """Reproject a single raster tile and save it as an LZW-compressed GeoTIFF.

    Args:
        src_path:    Path of the raster to read.
        dst_path:    Path of the GeoTIFF to create.
        neg_to_nan:  If True, values <= 0 are replaced by NaN before reprojecting.
        no_data_val: No-data value written to the array's encoding and passed
                     to the reprojection.
        dst_crs:     Target coordinate reference system (e.g. "EPSG:25833").
        dst_res:     Target cell size, passed as 'resolution' to the reprojection.
        dst_dtype:   Data type used when writing the output (e.g. "float32").

    Returns:
        None. The result is written to 'dst_path'.
    """
    # Keep a dedicated reference to the opened source: the reprojection returns
    # a new array, so rebinding the same name would drop the only handle to the
    # file without closing it explicitly.
    src = rio.open_rasterio(src_path, mask_and_scale=True)
    try:
        rds = src.where(src > 0) if neg_to_nan else src

        rds.rio.write_nodata(no_data_val, encoded=True, inplace=True)
        warped = rds.rio.reproject(
            dst_crs,
            resolution=dst_res,
            nodata=no_data_val,
            resampling=Resampling.bilinear,
        )
        try:
            warped.rio.to_raster(
                dst_path, compress="lzw", BIGTIFF="IF_SAFER", dtype=dst_dtype
            )
        finally:
            warped.close()
    finally:
        # Runs on success and on failure, so the source file is always released
        src.close()

    # Drop the large arrays and collect before the worker picks up the next tile
    del src, rds, warped
    gc.collect()

    return None


# Make sure the output folder exists before any worker tries to write to it
os.makedirs(dst_fold, exist_ok=True)

for zone in tqdm(zone_list, desc="Looping over zones"):
    search_path = f"/home/jovyan/shared/01_datasets/spatial/dtm_10_raw/utm_{zone}/*.tif"
    flist = sorted(glob.glob(search_path))
    # Each output tile keeps its source file name, but lives in dst_fold
    dst_list = [os.path.join(dst_fold, os.path.basename(fname)) for fname in flist]

    if not flist:
        # An empty match usually means a wrong path; say so instead of
        # silently doing nothing for this zone.
        print(f"No tiles found for UTM zone {zone}: {search_path}")
        continue

    # One task per tile; the call blocks until every tile in this zone is done
    Parallel(n_jobs=n_jobs)(
        delayed(reproject)(
            src_path,
            dst_path,
            neg_to_nan,
            no_data_val,
            dst_crs,
            dst_res,
            dst_dtype,
        )
        for src_path, dst_path in zip(flist, dst_list)
    )

## 4. Merge to a single dataset

In [None]:
%%time

search_path = f"{dst_fold}/*.tif"
flist = sorted(glob.glob(search_path))
print(len(flist), "files to process.")

if not flist:
    # Fail early with a clear message rather than deep inside the merge
    raise FileNotFoundError(f"No reprojected tiles found matching {search_path}")

# Make sure the folder for the merged raster exists
out_dir = os.path.dirname(nor_10m_dtm)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

print("Opening files...")
srcs = []
try:
    # Append one by one so that tiles opened before a failure are still
    # tracked and closed in the 'finally' block.
    for fpath in flist:
        srcs.append(rio.open_rasterio(fpath, mask_and_scale=True, cache=False))

    print("Merging tiles...")
    rds = merge_arrays(srcs, bounds=bbox, res=dst_res)

    print("Saving...")
    rds.rio.write_nodata(no_data_val, inplace=True)
    rds.rio.to_raster(
        nor_10m_dtm,
        compress="lzw",
        BIGTIFF="YES",
        tiled=True,
        dtype=dst_dtype,
    )
    rds.close()
    del rds
finally:
    # Close every opened tile, whether or not the merge/save succeeded
    while srcs:
        srcs.pop().close()
    del srcs
    gc.collect()

print("Done.")

In [None]:
# # Alternative using WBT. May be more memory efficient,
# # but not yet tested as the above seems to work
# wbt = WhiteboxTools()
# wbt.set_verbose_mode(False)
# wbt.set_compress_rasters(True)
# wbt.set_working_dir(dst_fold)

# wbt.mosaic(
#     nor_10m_dtm,
#     inputs=None,
#     method="bilinear",
# )

## 5. Downsampling

In [None]:
%%time

print("Downsampling to 20m...")

# Make sure the folder for the output raster exists
out_dir = os.path.dirname(nor_20m_dtm)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Keep a separate reference to the opened 10 m mosaic so that it can be closed
# explicitly; the reprojection below returns a new array.
src = rio.open_rasterio(nor_10m_dtm, mask_and_scale=True, cache=False)
try:
    # A factor below 1 gives fewer cells: half as many rows and columns here
    upscale_factor = 0.5
    width = int(src.rio.width * upscale_factor)
    height = int(src.rio.height * upscale_factor)

    # Same CRS, new grid shape, bilinear resampling
    rds = src.rio.reproject(
        src.rio.crs,
        shape=(height, width),
        resampling=Resampling.bilinear,
    )

    rds.rio.to_raster(
        nor_20m_dtm, compress="lzw", BIGTIFF="IF_SAFER", tiled=True, dtype=dst_dtype
    )
    rds.close()
    del rds
finally:
    # Release the source file on success and on failure
    src.close()
    del src
gc.collect()

In [None]:
%%time

print("Downsampling to 40m...")

# Make sure the folder for the output raster exists
out_dir = os.path.dirname(nor_40m_dtm)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Keep a separate reference to the opened 10 m mosaic so that it can be closed
# explicitly; the reprojection below returns a new array.
src = rio.open_rasterio(nor_10m_dtm, mask_and_scale=True, cache=False)
try:
    # A factor below 1 gives fewer cells: a quarter as many rows and columns here
    upscale_factor = 0.25
    width = int(src.rio.width * upscale_factor)
    height = int(src.rio.height * upscale_factor)

    # Same CRS, new grid shape, bilinear resampling
    rds = src.rio.reproject(
        src.rio.crs,
        shape=(height, width),
        resampling=Resampling.bilinear,
    )

    rds.rio.to_raster(
        nor_40m_dtm, compress="lzw", BIGTIFF="IF_SAFER", tiled=True, dtype=dst_dtype
    )
    rds.close()
    del rds
finally:
    # Release the source file on success and on failure
    src.close()
    del src
gc.collect()