# Laserfarm: LiDAR point cloud analysis for macro-ecology

## Configuration

### User parameters

Defines the parameters that can be set by users when executing the workflow.

In [None]:
# (DO NOT containerize this cell)

# Input data: URLs of the LAZ files to process. The part of each URL after
# the last '/' is used as the local file name when the file is downloaded.
param_laz_urls = ["https://basisdata.nl/hwh-ahn/AHN6/01_LAZ/AHN6_2025_C_168000_520000.LAZ", "https://basisdata.nl/hwh-ahn/AHN6/01_LAZ/AHN6_2025_C_168000_519000.LAZ"]

# Data handling parameters
# MinIO endpoint (host:port) and bucket names. The client is created with
# secure=True, so the endpoint is contacted over TLS.
# NOTE(review): param_minio_public_bucket is not referenced in the visible
# cells -- confirm it is still needed.
param_minio_endpoint = 'scruffy.lab.uvalight.net:9000'
param_minio_public_bucket = 'naa-vre-public'
param_minio_virtual_lab_bucket = 'naa-vre-laserfarm'

# Laserfarm parameters
# Numeric values are given as strings and converted with float()/int() at the
# point of use.
# Feature to extract; also used as the folder/key prefix for the exported
# targets and for the processed-files log.
param_feature_name = 'perc_95_normalized_height'
# Passed as 'validate_precision' to the 'generate_targets' step.
param_validate_precision = '0.001'
# Used both as 'tile_mesh_size' (generate_targets) and as 'volume_size'
# (extract_features).
param_tile_mesh_size = '10.'
# Point filter applied before feature extraction ('apply_filter' step):
# filter type, attribute to test and the value to match.
param_filter_type = 'select_equal'
param_attribute = 'raw_classification'
# Extent of the tiling grid and number of tiles per side; the same grid is
# used for retiling and for target generation.
param_min_x = '-113107.81'  # EPSG:28992
param_max_x = '398892.19'  # EPSG:28992
param_min_y = '214783.87'  # EPSG:28992
param_max_y = '726783.87'  # EPSG:28992
param_n_tiles_side = '512'
param_apply_filter_value = '1'
# Name of the text object (one URL per line) that records which input files
# have already been processed.
param_processed_files_record_file = 'processed_files_log.txt'

In [None]:
# Secrets (DO NOT containerize this cell)
from SecretsProvider import SecretsProvider
from getpass import getpass

# getpass is handed over as the input function.
# NOTE(review): presumably SecretsProvider calls it to prompt for a secret
# that is not otherwise available, so the value is not echoed -- confirm
# against the SecretsProvider implementation.
secrets_provider = SecretsProvider(input_func=getpass)
# Credentials used to create the MinIO client in the workflow cells.
secret_minio_access_key = secrets_provider.get_secret('secret_minio_access_key')
secret_minio_secret_key = secrets_provider.get_secret('secret_minio_secret_key')

### Dependencies

The following cells install extra dependencies that are not included in the Laserfarm flavor by default, and import the libraries used in the notebook.

In [None]:
# (DO NOT containerize this cell)

import json
import os

from laserfarm import DataProcessing, GeotiffWriter, Retiler
from laserfarm.remote_utils import get_wdclient, list_remote
from minio import Minio
from minio.error import S3Error
import laspy
import io

### Global configuration

The following variables are used throughout the code. They are intended to be edited by developers who maintain the notebook.

In [None]:
# (DO NOT containerize this cell)

# Root of the local scratch space; every working directory below lives under it.
conf_local_tmp = '/tmp/data'
# Downloaded input files.
conf_local_path_raw = os.path.join(conf_local_tmp, 'raw')
# Configured as the retiler's 'input_folder'.
conf_local_path_split = os.path.join(conf_local_tmp, 'split')
# Retiler output (tiles and retile records); input folder of the feature extraction.
conf_local_path_retiled = os.path.join(conf_local_tmp, 'retiled')
# Feature extraction output; files below this folder are uploaded to MinIO.
conf_local_path_targets = os.path.join(conf_local_tmp, 'targets')
# NOTE(review): the two paths below are not referenced in the visible cells.
conf_local_path_geotiff = os.path.join(conf_local_tmp, 'geotiff')
conf_local_path_figures = os.path.join(conf_local_tmp, 'figures')

## Workflow steps

In [None]:
# Processed Files Tracker
def get_minio_file_as_set(bucket_name: str, object_name: str) -> set[str]:
    """Read a text object from MinIO and return its non-blank lines as a set.

    Args:
        bucket_name: Bucket that holds the object.
        object_name: Key of the object inside the bucket.

    Returns:
        The stripped, non-empty lines of the object (decoded as UTF-8).
        An empty set when the object does not exist ("NoSuchKey").

    Raises:
        S3Error: For any MinIO error other than "NoSuchKey".
    """
    response = None
    try:
        response = minio_client.get_object(bucket_name, object_name)
        text = response.data.decode("utf-8")
    except S3Error as err:
        # A missing object just means nothing has been recorded yet.
        if err.code != "NoSuchKey":
            raise err
        return set()
    else:
        stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
        return {entry for entry in stripped_lines if entry}
    finally:
        # Always hand the HTTP connection back to the pool.
        if response:
            response.close()
            response.release_conn()

# Client for the MinIO object store that holds the processed-files log.
minio_client = Minio(
    param_minio_endpoint,
    access_key=secret_minio_access_key,
    secret_key=secret_minio_secret_key,
    secure=True,
)

# URLs already recorded as processed for this feature (empty set when no log
# exists yet).
processed_files = get_minio_file_as_set(
    param_minio_virtual_lab_bucket,
    f"{param_feature_name}/{param_processed_files_record_file}",
)

# Drop duplicate URLs while keeping the order in which they were requested, so
# the processing order is reproducible from run to run.
requested_urls = list(dict.fromkeys(param_laz_urls))
files_to_process = [url for url in requested_urls if url not in processed_files]

# Count only the requested URLs that are in the log; the log may also contain
# files from earlier runs that are not part of this request.
n_already_processed = len(requested_urls) - len(files_to_process)
print(f"Found {len(requested_urls)} files to process, of which {n_already_processed} were already processed.")

### Fetch laz files from remote storage

This cell downloads the `.laz` files listed in `param_laz_urls` that have not been recorded as processed yet, and stores them in the local raw-data folder.

In [None]:
# S1 Fetch laz files
import urllib.request

# Make sure the download folder exists before fetching anything.
os.makedirs(conf_local_path_raw, exist_ok=True)

# Only the files selected by the processed-files tracker are fetched.
laz_urls = files_to_process

raw_laz_files = []
for laz_url in laz_urls:
    print(f"retrieving file from {laz_url}")
    # The local file name is the last path segment of the URL.
    filename = laz_url.rsplit('/', 1)[-1]
    file_location = "/".join((conf_local_path_raw, filename))
    urllib.request.urlretrieve(laz_url, file_location)
    raw_laz_files.append(file_location)

print(raw_laz_files)

In [None]:
# S3 Retile laz files
# base image: laserfarm

# Grid onto which the point clouds are redistributed.
grid_retile = {
    'min_x': float(param_min_x),
    'max_x': float(param_max_x),
    'min_y': float(param_min_y),
    'max_y': float(param_max_y),
    'n_tiles_side': int(param_n_tiles_side),
    }

# Retiler pipeline configuration.
# NOTE(review): the input files are passed as full paths under
# conf_local_path_raw while 'input_folder' points at conf_local_path_split --
# confirm this is intended.
retiling_input = {
    'setup_local_fs': {
        'input_folder': conf_local_path_split,
        'output_folder': conf_local_path_retiled,
        },
    'set_grid': grid_retile,
    'split_and_redistribute': {},
    'validate': {},
    }

os.makedirs(conf_local_path_retiled, exist_ok=True)
tiles = []

for file in raw_laz_files:
    base_name = os.path.splitext(os.path.basename(file))[0]
    # The retile record doubles as a "done" marker for this input file.
    retile_record_filename = os.path.join(
        conf_local_path_retiled,
        f'{base_name}_retile_record.js',
        )
    if not os.path.isfile(retile_record_filename):
        print(f'Retiling {file}')
        retiler = Retiler(file, label=file).config(retiling_input)
        retiler.run()
    else:
        print(
            f'Skipping retiling of {file} because {retile_record_filename} already exists'
            )
    # load filenames from retile record
    with open(retile_record_filename, 'r') as f:
        retile_record = json.load(f)

    tiles += retile_record['redistributed_to']

# Several input files can contribute points to the same tile; keep every tile
# once (in first-seen order) so it is not extracted and uploaded repeatedly.
tiles = list(dict.fromkeys(tiles))

# Report the cell's result. This also works when there was nothing to retile,
# in which case no retile record has been loaded at all.
print(tiles)

### Extract features from tiles

Run the feature extraction for each tile. The features are extracted using [laserchicken](https://github.com/eEcoLiDAR/laserchicken).

In [None]:
# S5 Extract features
# base image: laserfarm

# Paths of the exported target files, one per tile, in tile order.
feature_files = []

for i, tile in enumerate(tiles):
    # The configuration is rebuilt for every tile so that each pipeline
    # receives its own dictionaries.
    grid_feature = dict(
        min_x=float(param_min_x),
        max_x=float(param_max_x),
        min_y=float(param_min_y),
        max_y=float(param_max_y),
        n_tiles_side=int(param_n_tiles_side),
        )

    mesh_size = float(param_tile_mesh_size)
    feature_extraction_input = {
        'setup_local_fs': {
            'input_folder': conf_local_path_retiled,
            'output_folder': conf_local_path_targets,
            },
        'load': {'attributes': [param_attribute]},
        'normalize': 1,
        # Classification codes: ground surface (2), water (9), buildings (6),
        # artificial objects (26), vegetation (?), and unclassified (1)
        'apply_filter': {
            'filter_type': param_filter_type,
            'attribute': param_attribute,
            'value': [int(param_apply_filter_value)],
            },
        'generate_targets': {
            'tile_mesh_size': mesh_size,
            'validate': True,
            'validate_precision': float(param_validate_precision),
            **grid_feature
            },
        'extract_features': {
            'feature_names': [param_feature_name],
            'volume_type': 'cell',
            'volume_size': mesh_size,
            },
        'export_targets': {
            'attributes': [param_feature_name],
            'multi_band_files': False,
            },
        }

    # Everything after the first '_'-separated token of the tile name is
    # passed on as the tile index.
    idx = tile.split('_')[1:]

    target_file = os.path.join(
        conf_local_path_targets, param_feature_name, f'{tile}.ply'
        )
    print(target_file)

    if os.path.isfile(target_file):
        # An existing target file means this tile was handled before.
        print(
            f'Skipping features extraction for {tile} ({i + 1} of {len(tiles)}) because {target_file} already exists'
            )
    else:
        DataProcessing(tile, tile_index=idx, label=tile).config(
            feature_extraction_input
            ).run()

    feature_files.append(target_file)

print(feature_files)

In [None]:
### Save to MinIO
def get_minio_file_as_set(
    bucket_name: "str | None" = None,
    object_name: "str | None" = None,
) -> set[str]:
    """Read a text object from MinIO and return its non-blank lines as a set.

    Called without arguments it reads the processed-files log, i.e. the object
    ``<param_feature_name>/<param_processed_files_record_file>`` in the virtual
    lab bucket. That is the key the log is written to, so previously recorded
    entries are found again.

    Args:
        bucket_name: Bucket to read from; defaults to the virtual lab bucket.
        object_name: Key of the object; defaults to the processed-files log.

    Returns:
        The stripped, non-empty lines of the object (decoded as UTF-8).
        An empty set when the object does not exist ("NoSuchKey").

    Raises:
        S3Error: For any MinIO error other than "NoSuchKey".
    """
    if bucket_name is None:
        bucket_name = param_minio_virtual_lab_bucket
    if object_name is None:
        object_name = f"{param_feature_name}/{param_processed_files_record_file}"

    response = None
    try:
        response = minio_client.get_object(bucket_name, object_name)
        content = response.data.decode("utf-8")
        return {line.strip() for line in content.splitlines() if line.strip()}

    except S3Error as e:
        # A missing object just means nothing has been recorded yet.
        if e.code == "NoSuchKey":
            return set()
        raise

    finally:
        # Always hand the HTTP connection back to the pool.
        if response is not None:
            response.close()
            response.release_conn()

def add_to_processed_files_log(newly_processed_files : set[str]):
    """Merge new entries into the processed-files log and upload the result.

    The set returned by ``get_minio_file_as_set()`` is united with
    ``newly_processed_files`` and written, one entry per line (UTF-8), to
    ``<param_feature_name>/<param_processed_files_record_file>`` in the virtual
    lab bucket, replacing whatever object is stored under that key.

    Args:
        newly_processed_files: Entries to add to the log.
    """
    all_entries = get_minio_file_as_set() | newly_processed_files
    payload = "\n".join(all_entries).encode("utf-8")
    log_object_name = f"{param_feature_name}/{param_processed_files_record_file}"
    minio_client.put_object(
        bucket_name=param_minio_virtual_lab_bucket,
        object_name=log_object_name,
        data=io.BytesIO(payload),
        length=len(payload),
    )

def copy_to_minio(filepath : str):
    """Upload a local target file to the virtual lab bucket.

    The object name is the file's path relative to ``conf_local_path_targets``
    (for example ``<feature name>/<tile>.ply``). Only the leading folder prefix
    is removed, and the key carries no leading '/', so it sits next to the
    processed-files log under the same feature prefix.

    Args:
        filepath: Local path of the file to upload.
    """
    # removeprefix touches the start of the path only; str.replace would also
    # cut any later occurrence of the folder name out of the path.
    # NOTE(review): object keys that start with '/' are believed to be rejected
    # or mangled by some S3 servers -- confirm against the target MinIO setup.
    object_name = filepath.removeprefix(conf_local_path_targets).lstrip('/')
    minio_client.fput_object(
        bucket_name=param_minio_virtual_lab_bucket,
        object_name=object_name,
        file_path=filepath,
    )

# Client for the MinIO object store that receives the results and the log.
minio_client = Minio(
    param_minio_endpoint,
    access_key=secret_minio_access_key,
    secret_key=secret_minio_secret_key,
    secure=True,
)

# Upload the results first. An input file is only recorded as processed once
# its outputs are safely stored; if an upload fails, the exception stops the
# cell before the log is touched and the files are retried on the next run.
for filepath in feature_files:
    copy_to_minio(filepath)

processed_laz_filenames = set(laz_urls)
# Nothing new to record when every requested file had been processed before,
# so the log is left untouched in that case.
if processed_laz_filenames:
    add_to_processed_files_log(processed_laz_filenames)
