In [29]:
import shutil
import laspy
import numpy as np

from pathlib import Path
from tqdm import tqdm

In [30]:
# Pipeline switches read by the cells below.
# CLEAR_DIRS: when true, an existing `processed` directory is removed before
# the train/val/test output directories are (re)created.
CLEAR_DIRS = True
# COLLAPSE_CLASSES: when true, semantic labels are remapped from the original
# class ids to the compact class ids before files are written.
COLLAPSE_CLASSES = True
# SPLIT_POINTCLOUDS: when true, each input cloud is written as separate
# XY-quadrant chunk files; otherwise it is written as a single file.
SPLIT_POINTCLOUDS = True

In [40]:
# Original semantic label set: class name -> integer id.
# Ids are assigned by position in the tuple, starting at 0.
ehydro_semantics_orig = {
    class_name: class_id
    for class_id, class_name in enumerate((
        'ground',
        'track',
        'road',
        'water',
        'shrubs',
        'trees',
        'buildings',
        'misc',
        'power_lines',
    ))
}

# Compact semantic label set used when classes are collapsed
# (collapse_classes folds 'road' into 'track', and 'buildings' /
# 'power_lines' into 'misc').
ehydro_semantics_comp = {
    class_name: class_id
    for class_id, class_name in enumerate((
        'ground',
        'track',
        'water',
        'shrubs',
        'trees',
        'misc',
    ))
}

In [32]:
# Dataset root, located under the current user's home directory.
ehydro_dataset = Path.home().joinpath('Panoramix3D_data', 'datasets', 'EHydroDataset')

# Raw inputs: hybrid plots feed the training split, real plots feed val/test.
hybrid_plots_dir = ehydro_dataset.joinpath('raw', 'hybrid_plots')
real_plots_dir = ehydro_dataset.joinpath('raw', 'real_plots')

train_raw_dir = hybrid_plots_dir.joinpath('output')
val_raw_dir = real_plots_dir.joinpath('val')
test_raw_dir = real_plots_dir.joinpath('test')

# Processed outputs, one sub-directory per split.
processed_dir = ehydro_dataset.joinpath('processed')
train_dir, val_dir, test_dir = (
    processed_dir / split_folder for split_folder in ('train', 'val', 'test')
)

# Optionally start from a clean slate by deleting previous results.
if CLEAR_DIRS and processed_dir.exists():
    shutil.rmtree(processed_dir)

# Make sure every output directory exists (no-op when already present).
for out_dir in (train_dir, val_dir, test_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def chunkerize_four(las_data):
    """Split a point cloud into four XY quadrants around its mean XY position.

    Args:
        las_data: Object exposing per-point ``x`` and ``y`` coordinate
            sequences of equal length.

    Returns:
        list: Four boolean masks with one entry per point, ordered as
        [+x/+y, -x/+y, -x/-y, +x/-y] relative to the mean XY position.
        The masks are mutually exclusive and together cover every point.
    """
    xy = np.stack([las_data.x, las_data.y], axis=1)
    center = xy.mean(axis=0)

    # Use `>=` on one side and its complement on the other so that points lying
    # exactly on a split line land in the upper/right quadrant instead of being
    # excluded from every chunk (strict `>` / `<` on both sides drops them).
    right = xy[:, 0] >= center[0]
    upper = xy[:, 1] >= center[1]

    return [
        right & upper,
        ~right & upper,
        ~right & ~upper,
        right & ~upper,
    ]

def collapse_classes(las_data):
    """Remap ``las_data.semantic_gt`` in place from original to compact class ids.

    Each (source, target) pair below rewrites every label equal to the source
    id in ``ehydro_semantics_orig`` to the target id in
    ``ehydro_semantics_comp``. Labels matching no source id are left untouched.

    Args:
        las_data: Object whose ``semantic_gt`` attribute supports boolean-mask
            assignment; it is modified in place.
    """
    # The pairs are applied one after another on the live labels, so the order
    # matters: a later pair sees the result of the earlier ones.
    merge_plan = (
        ('ground', 'ground'),
        ('track', 'track'),
        ('road', 'track'),
        ('water', 'water'),
        ('shrubs', 'shrubs'),
        ('trees', 'trees'),
        ('buildings', 'misc'),
        ('misc', 'misc'),
        ('power_lines', 'misc'),
    )
    for source_name, target_name in merge_plan:
        source_id = ehydro_semantics_orig[source_name]
        target_id = ehydro_semantics_comp[target_name]
        las_data.semantic_gt[las_data.semantic_gt == source_id] = target_id

In [34]:
def _renumber_instances(instance_ids):
    """Relabel instance ids as consecutive integers, keeping 0 for "no instance".

    ``np.unique(..., return_inverse=True)`` alone maps the smallest id present
    to 0. When a chunk contains no points with id 0, that would turn a real
    instance into id 0, so the labels are shifted by one in that case.
    """
    unique_ids, consecutive = np.unique(np.asarray(instance_ids), return_inverse=True)
    if unique_ids.size and unique_ids[0] > 0:
        consecutive = consecutive + 1
    return consecutive


def process_split(raw_dir: Path, dest_dir: Path):
    """Process every ``*.las`` file in ``raw_dir`` and write results to ``dest_dir``.

    For each file:
      * if ``COLLAPSE_CLASSES`` is set, semantic labels are collapsed in place;
      * if ``SPLIT_POINTCLOUDS`` is set, the cloud is split with
        ``chunkerize_four`` and each non-empty chunk is written as
        ``<stem>_chunk_<i>.las`` with coordinates shifted so each axis starts
        at 0, header offsets set to zero, and instance ids renumbered
        consecutively (0 stays 0);
      * otherwise the (possibly relabelled) cloud is written under its
        original file name.

    Args:
        raw_dir: Directory containing the input ``.las`` files.
        dest_dir: Existing directory that receives the output files.
    """
    files = list(raw_dir.glob('*.las'))
    for file in tqdm(files):
        # read() loads the whole file, so the reader can be closed right away.
        with laspy.open(file) as las:
            las_data = las.read()

        if COLLAPSE_CLASSES:
            collapse_classes(las_data)

        if not SPLIT_POINTCLOUDS:
            las_data.write(dest_dir / file.name)
            continue

        for i, mask in enumerate(chunkerize_four(las_data)):
            # A quadrant may contain no points; min() on an empty array would
            # raise, and an empty chunk file is of no use downstream.
            if not np.any(mask):
                continue

            chunk = las_data.points[mask]
            out = laspy.create(
                point_format=las_data.point_format,
                file_version=las_data.header.version,
            )
            out.header.scales = las_data.header.scales
            out.header.offsets = np.array([0.0, 0.0, 0.0])
            out.points = chunk

            # Translate the chunk so that its minimum corner sits at the origin.
            for axis in ('x', 'y', 'z'):
                coords = getattr(chunk, axis)
                setattr(out, axis, coords - np.array(coords).min())

            out.instance_gt = _renumber_instances(chunk.instance_gt)
            out.write(dest_dir / f"{file.stem}_chunk_{i}.las")

In [35]:
# Run the preprocessing for each split, in train -> val -> test order.
for raw_dir, dest_dir in (
    (train_raw_dir, train_dir),
    (val_raw_dir, val_dir),
    (test_raw_dir, test_dir),
):
    process_split(raw_dir, dest_dir)

100%|██████████| 660/660 [00:16<00:00, 40.16it/s]
100%|██████████| 32/32 [00:00<00:00, 82.50it/s]
0it [00:00, ?it/s]


In [61]:
# Pick the label table that matches how the processed files were written.
if COLLAPSE_CLASSES:
    ehydro_semantics = ehydro_semantics_comp
else:
    ehydro_semantics = ehydro_semantics_orig

# Human-readable descriptions of every violated invariant, across all splits.
errors = []

splits_to_check = {'train': train_dir, 'val': val_dir, 'test': test_dir}
for split_name, split_dir in splits_to_check.items():
    files = list(split_dir.glob('*.las'))
    print(f"\n=== Verificando {split_name} ({len(files)} archivos) ===")

    for las_file in tqdm(files):
        las = laspy.read(las_file)
        file_label = f"{split_name}/{las_file.name}"
        tree_id = ehydro_semantics['trees']

        # Non-tree points must carry neither an instance id nor a species id.
        non_tree_mask = las.semantic_gt != tree_id
        non_tree_instance_invalid = (las.instance_gt[non_tree_mask] != 0).any()
        non_tree_species_invalid = (las.species_gt[non_tree_mask] != 0).any()

        if non_tree_instance_invalid:
            errors.append(f"{file_label}: Puntos NO-tree con instance_gt != 0")
        if non_tree_species_invalid:
            errors.append(f"{file_label}: Puntos NO-tree con species_gt != 0")

        # Tree points must carry a positive instance id and a positive species id.
        tree_mask = las.semantic_gt == tree_id
        if not tree_mask.any():
            continue

        tree_instance_invalid = (las.instance_gt[tree_mask] <= 0).any()
        tree_species_invalid = (las.species_gt[tree_mask] <= 0).any()

        if tree_instance_invalid:
            errors.append(f"{file_label}: Puntos tree con instance_gt == 0")
        if tree_species_invalid:
            errors.append(f"{file_label}: Puntos tree con species_gt == 0")

# Final report: either a success line or the list of collected problems.
if not errors:
    print("\n✅ Todas las nubes pasaron la validación")
else:
    print(f"\n❌ ERRORES ENCONTRADOS ({len(errors)}):")
    for err in errors:
        print(f"  - {err}")


=== Verificando train (2640 archivos) ===


100%|██████████| 2640/2640 [00:00<00:00, 2717.24it/s]



=== Verificando val (128 archivos) ===


100%|██████████| 128/128 [00:00<00:00, 2826.63it/s]



=== Verificando test (0 archivos) ===


0it [00:00, ?it/s]


✅ Todas las nubes pasaron la validación



