In [34]:
import pandas as pd
import numpy as np
filepath = "/var/data/cgaydon/data/202110_building_val/metadata/emprises_reprises_nbreBatiSurdetection.csv"
df_split = pd.read_csv(filepath)
desc = df_split.groupby("split").describe(include=np.number)
desc = desc.transpose()
desc

Unnamed: 0,split,test,train,val
port,count,15.0,120.0,15.0
port,mean,0.066667,0.008333,0.0
port,std,0.258199,0.091287,0.0
port,min,0.0,0.0,0.0
port,25%,0.0,0.0,0.0
port,50%,0.0,0.0,0.0
port,75%,0.0,0.0,0.0
port,max,1.0,1.0,0.0
nb_vehicul,count,15.0,120.0,15.0
nb_vehicul,mean,8.866667,4.341667,2.4


In [55]:
import os.path as osp
from pathlib import Path
from typing import List, Union

import laspy
import numpy as np
import torch
from torch.utils.data.dataloader import default_collate
from torch_geometric.data import Batch, Data, Dataset

import shapefile
import pandas as pd
from torch_geometric.transforms import BaseTransform

def load_las_data(filepath):
    """Load a cloud of points and its labels. base shape: [n_points, n_features].
    Warning: las.x is in meters, las.X is in centimeters.
    """
    las = laspy.read(filepath)
    pos = np.asarray(
        [
            las.x,
            las.y,
            las.z,
        ],
        dtype=np.float32,
    )
    x = np.asarray(
        [
            las.intensity,
            las.return_num,
            las.num_returns,
        ],
        dtype=np.float32,
    )
    pos = pos.transpose()
    x = x.transpose()
    y = las.classification.astype(np.int)
    tile_id = Path(filepath).stem

    return Data(
        pos=pos,
        x=x,
        y=y,
        filepath=filepath,
        tile_id=tile_id,
    )

class MakeBuildingTargets(BaseTransform):
    """
    Pass from multiple classes to simpler Building/Non-Building labels.
    Initial classes: [  1,   2,   6 (detected building, no validation),  19 (valid building),  20 (surdetection, unspecified),
    21 (building, forgotten), 104, 110 (surdetection, others), 112 (surdetection, vehicule), 114 (surdetection, others), 115 (surdetection, bridges)]
    Final classes: 0 (non-building), 1 (building)
    """

    def __call__(self, data: Data):
        buildings_idx = (data.y == 19) | (data.y == 21) | (data.y == 6)
        data.y[buildings_idx] = 1
        data.y[~buildings_idx] = 0
        return data

In [81]:
def describe(filepath: str):
    """ Return a dict of descriptor of a LAS. """

    data = load_las_data(filepath)
    data = MakeBuildingTargets()(data)
    log = {}
    log.update({"filepath" : data.tile_id})
    log.update({"filepath" : data.filepath})
    log.update({"num_points":data.y.shape[0]})
    log.update({"pos.shape":data.pos.shape})
    log.update({"x.shape":data.x.shape})
    log.update({"y.shape":data.y.shape})
    try:
        low = data.pos[:, :2].min(0)
        high = data.pos[:, :2].max(0)
        log.update({"min_xy_meters":str(low)})
        log.update({"max_xy_meters":str(high)})
        log.update({"range_xy_meters":str(high - low)})
    except:
        print(f"min cannot be calculated for {data.filepath}")
    
    log.update({"PercBuilding": f"{data.y.mean():.2%}"})
    return log

logs = []
for filepath in df_split[df_split.split == "train"].file_path.values:
    log = describe(filepath)
    print(log)
    logs.append(log)
    

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = las.classification.astype(np.int)


{'filepath': '/var/data/cgaydon/data/202110_building_val/trainvaltest/880000_6648000.las', 'num_points': 38436026, 'pos.shape': (38436026, 3), 'x.shape': (38436026, 3), 'y.shape': (38436026,), 'min_xy_meters': '[ 880000. 6647000.]', 'max_xy_meters': '[ 881000. 6648000.]', 'range_xy_meters': '[1000. 1000.]', 'PercBuilding': '0.00%'}
{'filepath': '/var/data/cgaydon/data/202110_building_val/trainvaltest/861000_6650000.las', 'num_points': 13555062, 'pos.shape': (13555062, 3), 'x.shape': (13555062, 3), 'y.shape': (13555062,), 'min_xy_meters': '[ 861000. 6649000.]', 'max_xy_meters': '[ 862000. 6650000.]', 'range_xy_meters': '[1000. 1000.]', 'PercBuilding': '0.01%'}
{'filepath': '/var/data/cgaydon/data/202110_building_val/trainvaltest/877000_6601000.las', 'num_points': 29344149, 'pos.shape': (29344149, 3), 'x.shape': (29344149, 3), 'y.shape': (29344149,), 'min_xy_meters': '[ 877000. 6600000.]', 'max_xy_meters': '[ 878000. 6601000.]', 'range_xy_meters': '[1000. 1000.]', 'PercBuilding': '0.09%'

In [83]:
df_logs = pd.DataFrame(data=logs)
df_logs.to_csv("../logs/dataset_description.csv")

In [88]:
df_logs["range_xy_meters"].unique()

array(['[1000. 1000.]', '[700.4375 999.5   ]', '[ 999.5 1000. ]',
       '[999.5 999.5]', '[ 368.5 1000. ]', '[ 695.25 1000.  ]',
       '[1000.   999.5]', '[ 681.875 1000.   ]', '[ 329.375 1000.   ]',
       '[ 342.5625 1000.    ]', '[ 999.5625 1000.    ]',
       '[1000.   977.5]', '[999.5625 999.5   ]'], dtype=object)

In [99]:
df_logs[df_logs.filepath.str.endswith("846000_6610000.las")].values

array([['/var/data/cgaydon/data/202110_building_val/trainvaltest/846000_6610000.las',
        17782963, (17782963, 3), (17782963, 3), (17782963,),
        '[ 846000. 6609000.]', '[ 847000.  6609999.5]',
        '[1000.   999.5]', '3.95%']], dtype=object)

In [98]:
print(df_logs[df_logs.filepath.str.endswith("846000_6610000.las")].filepath.values)

['/var/data/cgaydon/data/202110_building_val/trainvaltest/846000_6610000.las']
