# In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import laspy
import random

def _header_vector(las_header, plural, singular):
    """Return a per-axis header vector (scale or offset) under either attribute name.

    NOTE(review): laspy 2.x documents ``scales``/``offsets``; the singular
    names used by the original code are kept as a fallback in case the
    installed laspy exposes those instead.
    """
    value = getattr(las_header, plural, None)
    if value is None:
        value = getattr(las_header, singular)
    return value


def _read_chunk_columns(las, names, start, end, scale, offset, chunk_id):
    """Read ``names`` for points ``[start:end)`` as float arrays keyed by name.

    Raw ``X``/``Y``/``Z`` values are converted with the header scale and
    offset. Returns ``None`` (after printing a diagnostic) when a dimension
    cannot be read or has an unexpected length, so the caller skips the chunk.
    """
    coord_axes = ("X", "Y", "Z")
    expected = end - start
    columns = {}
    for name in names:
        try:
            values = np.array(las.points[name][start:end], dtype=float)
            if name in coord_axes:
                axis = coord_axes.index(name)
                values = values * scale[axis] + offset[axis]
        except Exception as e:
            # Best-effort: a bad dimension costs this chunk, not the whole run.
            print(f"\nError in dimension '{name}': {e}")
            return None
        if values.size != expected:
            print(f"\nWarning: Inconsistent point count in chunk {chunk_id}")
            return None
        columns[name] = values
    return columns


def process_las_files_split3(
    directory, out_root, chunk_size=100000, tile_name="tile",
    train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42
):
    """Chunk every LAS/LAZ file in ``directory`` and save a train/val/test split.

    Each file is read with ``laspy.read`` and cut into consecutive chunks of
    at most ``chunk_size`` points. For every chunk the feature columns
    (X, Y, Z, intensity, return_number, number_of_returns) and the
    ``classification`` labels (as int32) are extracted. All chunks are then
    shuffled with ``seed``, split by the given ratios and written to
    ``<out_root>/<split>/<tile_name>_chunk<i>_points.npy`` and
    ``..._labels.npy``.

    All chunks are held in memory until the split is written, so peak memory
    grows with the total number of points processed.

    Args:
        directory: Folder scanned (non-recursively) for ``.las``/``.laz``
            files; the extension match is case-insensitive.
        out_root: Output root; ``train``, ``val`` and ``test`` sub-folders
            are created as needed.
        chunk_size: Maximum number of points per chunk; must be >= 1.
        tile_name: Prefix used in the saved file names.
        train_ratio, val_ratio, test_ratio: Split fractions; must sum to 1.0.
        seed: Seed passed to ``random.seed`` before shuffling.

    Returns:
        None. Results are written to disk and a summary is printed.

    Raises:
        AssertionError: If the ratios do not sum to 1.0.
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Raised explicitly (not via ``assert``) so the check survives ``python -O``;
    # the exception type is kept for backward compatibility.
    if not abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6:
        raise AssertionError("Ratios must sum to 1.0")
    # A non-positive chunk size would never advance through the file.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    feature_cols = ["X", "Y", "Z", "intensity", "return_number", "number_of_returns"]
    label_col = "classification"
    required_cols = feature_cols + [label_col]

    # Sorted so the seeded shuffle yields the same split regardless of the
    # order in which the OS lists the directory.
    files = [os.path.join(directory, f) for f in sorted(os.listdir(directory))
             if f.lower().endswith((".las", ".laz"))]
    if not files:
        print("\n No valid chunks extracted.")
        return

    chunk_data_buffer = []
    chunk_counter = 0
    random.seed(seed)

    for file in tqdm(files, desc="Loading .las files"):
        try:
            las = laspy.read(file)
            dims = list(las.points.point_format.dimension_names)
            scale = _header_vector(las.header, "scales", "scale")
            offset = _header_vector(las.header, "offsets", "offset")
            total_points = len(las.points)
        except Exception as e:
            # Best-effort: an unreadable file is reported and skipped.
            print(f"\nError reading {file}: {e}")
            continue

        # The available dimensions are a property of the file, so check once
        # per file (against this file's own dimensions) instead of per chunk.
        missing = [col for col in required_cols if col not in dims]
        if missing:
            print(f"\nSkipping {file} – missing columns: {missing}")
            continue

        for start in range(0, total_points, chunk_size):
            end = min(start + chunk_size, total_points)
            # Only the required columns are read; unrelated dimensions can
            # neither waste memory nor invalidate the chunk.
            columns = _read_chunk_columns(
                las, required_cols, start, end, scale, offset, chunk_counter
            )
            if columns is None:
                continue

            features = np.column_stack([columns[col] for col in feature_cols])
            labels = columns[label_col].astype(np.int32)

            chunk_data_buffer.append((features, labels, f"{tile_name}_chunk{chunk_counter}"))
            chunk_counter += 1

    if not chunk_data_buffer:
        print("\n No valid chunks extracted.")
        return

    # Shuffle whole chunks, then cut the list; test receives the remainder so
    # rounding never drops a chunk.
    random.shuffle(chunk_data_buffer)
    n_total = len(chunk_data_buffer)
    n_train = int(n_total * train_ratio)
    n_val = int(n_total * val_ratio)
    n_test = n_total - n_train - n_val

    splits = {
        "train": chunk_data_buffer[:n_train],
        "val": chunk_data_buffer[n_train:n_train + n_val],
        "test": chunk_data_buffer[n_train + n_val:]
    }

    for split_name, chunks in splits.items():
        split_dir = os.path.join(out_root, split_name)
        os.makedirs(split_dir, exist_ok=True)
        for features, labels, name in chunks:
            np.save(os.path.join(split_dir, f"{name}_points.npy"), features)
            np.save(os.path.join(split_dir, f"{name}_labels.npy"), labels)

    print("\nSplit completed:")
    print(f"  Train: {n_train} chunks")
    print(f"  Val:   {n_val} chunks")
    print(f"  Test:  {n_test} chunks")
    print(f"  Saved in: '{out_root}/{{train,val,test}}'")

# In [None]:
# Run the chunk-and-split pipeline on the Zurich LAZ tiles: input folder and
# output root are passed positionally, the remaining options by keyword.
process_las_files_split3(
    "C:/Users/Nicola/Desktop/zurich_laz",
    "./processed_chunks",
    chunk_size=100000,
    tile_name="tileCH",
    train_ratio=0.70,
    val_ratio=0.15,
    test_ratio=0.15,
)

