# Imports

In [1]:
import numpy as np
from scipy.spatial.transform import Rotation

import plotly.graph_objects as go

from src.data.language_sequence import LanguageSequence
from src.data.point_cloud import PointCloud
from src.networks.scenescript_model import SceneScriptWrapper

# Plotting Lib

In [2]:
UNIT_CUBE_VERTICES = (
    np.array(
        [
            (1, 1, 1),
            (1, 1, -1),
            (1, -1, 1),
            (1, -1, -1),
            (-1, 1, 1),
            (-1, 1, -1),
            (-1, -1, 1),
            (-1, -1, -1),
        ]
    )
    * 0.5
)


UNIT_CUBE_LINES_IDXS = np.array(
    [
        [0, 1],
        [0, 2],
        [0, 4],
        [1, 3],
        [1, 5],
        [2, 3],
        [2, 6],
        [3, 7],
        [4, 5],
        [4, 6],
        [5, 7],
        [6, 7],
    ]
)


PLOTTING_COLORS = {
    "wall": "#FBFAF5",
    "door": "#F7C59F",
    "window": "#53F4FF",
    "bbox": "#CC3FD1",
    "points": "#C7DAE8",
    "trajectory": "#F92A82",
}

In [3]:
def language_to_bboxes(entities):
    """
    Args:
        entities: List[BaseEntity].
    """
    box_definitions = []
    # lookup table
    lookup = {}

    for entity in entities:

        entity_id = int(entity.params["id"])
        class_name = entity.COMMAND_STRING[5:]  # remove "make_"

        if entity.COMMAND_STRING == "make_wall":
            height = entity.params["height"]
            thickness = 0.0
            # corners
            corner_a = np.array(
                [
                    entity.params["a_x"],
                    entity.params["a_y"],
                    entity.params["a_z"],
                ]
            )
            corner_b = np.array(
                [
                    entity.params["b_x"],
                    entity.params["b_y"],
                    entity.params["b_z"],
                ]
            )
            length = np.linalg.norm(corner_a - corner_b)

            direction = corner_b - corner_a
            angle = np.arctan2(direction[1], direction[0])
            lookup[entity_id] = {**entity.params, "angle": angle}

            centre = (corner_a + corner_b) * 0.5 + np.array([0, 0, 0.5 * height])
            scale = np.array([length, thickness, height])
            rotation = Rotation.from_rotvec([0, 0, angle]).as_matrix()

        elif entity.COMMAND_STRING in {"make_door", "make_window"}:

            # Find valid wall pointer
            # NOTE: this part differs from the original implementation of this function.
            for key in ["wall_id", "wall0_id", "wall1_id"]:
                wall_id = entity.params.get(key, None)
                wall = lookup.get(wall_id, None)
                if wall is not None:
                    break
            if wall is None:
                continue
            angle, thickness = wall["angle"], wall["thickness"]

            centre = np.array(
                [
                    entity.params["position_x"],
                    entity.params["position_y"],
                    entity.params["position_z"],
                ]
            )
            rotation = Rotation.from_rotvec([0, 0, angle]).as_matrix()
            scale = np.array(
                [
                    entity.params["width"],
                    thickness,
                    entity.params["height"],
                ]
            )

        elif entity.COMMAND_STRING == "make_bbox":

            centre = np.array(
                [
                    entity.params["position_x"],
                    entity.params["position_y"],
                    entity.params["position_z"],
                ]
            )
            rotation = Rotation.from_rotvec([0, 0, entity.params["angle_z"]]).as_matrix()
            scale = np.array(
                [
                    entity.params["scale_x"],
                    entity.params["scale_y"],
                    entity.params["scale_z"],
                ]
            )
            class_name = entity.params["class"]

        box = {
            "id": entity_id,
            "cmd": entity.COMMAND_STRING,
            "class": class_name,
            "centre": centre,
            "rotation": rotation,
            "scale": scale,
        }
        box_definitions.append(box)

    return box_definitions


def plot_box_wireframe(box):
    box_verts = UNIT_CUBE_VERTICES * box["scale"]
    box_verts = (box["rotation"] @ box_verts.T).T
    box_verts = box_verts + box["centre"]

    lines_x = []
    lines_y = []
    lines_z = []
    for pair in UNIT_CUBE_LINES_IDXS:
        for idx in pair:
            lines_x.append(box_verts[idx, 0])
            lines_y.append(box_verts[idx, 1])
            lines_z.append(box_verts[idx, 2])
        lines_x.append(None)
        lines_y.append(None)
        lines_z.append(None)

    if box["cmd"] == "make_bbox":
        class_name = f"bbox_{box['class']}"
        plot_color = PLOTTING_COLORS["bbox"]
    else:  # wall/door/window
        class_name = box["class"]
        plot_color = PLOTTING_COLORS[class_name]

    wireframe = go.Scatter3d(
        x=lines_x,
        y=lines_y,
        z=lines_z,
        mode="lines",
        name=f"{class_name}_{box['id']}",
        line={
            "color": plot_color,
            "width": 10,
        },
    )

    return wireframe


def plot_point_cloud(point_cloud, max_points_to_plot=50_000):
    if len(point_cloud) > max_points_to_plot:
        print(
            f"The number of points ({len(point_cloud)}) exceeds the maximum that can be reliably plotted."
        )
        print(f"Randomly subsampling {max_points_to_plot} points for the plot.")
        sampled = np.random.choice(len(point_cloud), max_points_to_plot, replace=False)
        point_cloud = point_cloud[sampled]

    return go.Scatter3d(
        x=point_cloud[:, 0],
        y=point_cloud[:, 1],
        z=point_cloud[:, 2],
        mode="markers",
        name="Semi-dense Point Cloud",
        marker={
            "size": 1.0,
            "opacity": 0.3,
            "color": PLOTTING_COLORS["points"],
        },
    )


# Main plotting function
def plot_3d_scene(
    language_sequence=None,
    point_cloud=None,
    max_points_to_plot=50_000,
    fig_width=1000,
):

    traces = []
    if point_cloud is not None:
        traces.append(plot_point_cloud(point_cloud, max_points_to_plot))

    if language_sequence is not None:
        boxes = language_to_bboxes(language_sequence.entities)
        for box in boxes:
            traces.append(plot_box_wireframe(box))

    assert traces, "Nothing to visualize."
    fig = go.Figure(data=traces)
    fig.update_layout(
        template="plotly_dark",
        scene={
            "xaxis": {"showticklabels": False, "title": ""},
            "yaxis": {"showticklabels": False, "title": ""},
            "zaxis": {"showticklabels": False, "title": ""},
        },
        width=fig_width,
        height=fig_width // 2,
        scene_aspectmode="data",
        hoverlabel={"namelength": -1},
    )
    fig.show()

In [4]:
from pathlib import Path

ROOT_DIR = Path.cwd().parents[1].resolve()

print(ROOT_DIR)

ckpt_path = ROOT_DIR / ".logs/ckpts/scenescript_model_ase.ckpt"
assert ckpt_path.exists(), f"Checkpoint not found: {ckpt_path}"

data_dir = ROOT_DIR / ".data/semidense_samples"
assert data_dir.exists(), f"Data directory not found: {data_dir}"
print(list(data_dir.glob("*")))

/home/jandu/repos/NBV
[PosixPath('/home/jandu/repos/NBV/.data/semidense_samples/aea'), PosixPath('/home/jandu/repos/NBV/.data/semidense_samples/ase')]


# Load Model + Point Cloud

In [5]:
model_wrapper = SceneScriptWrapper.load_from_checkpoint(ckpt_path).cuda()

  ckpt_dict = torch.load(ckpt_path)


In [6]:
point_cloud_path = data_dir / "ase/ase_examples/0/semidense_points.csv.gz"
assert point_cloud_path.exists(), f"Point cloud file not found: {point_cloud_path}"

In [7]:
point_cloud_obj = PointCloud.load_from_file(point_cloud_path.as_posix())

Loaded #3dPoints: 433426
Kept 144544 points after filtering!


# Run Model

In [8]:
lang_seq = model_wrapper.run_inference(
    point_cloud_obj.points,
    nucleus_sampling_thresh=0.05,  # 0.0 is argmax, 1.0 is random sampling
    verbose=True,
)

Time taken for input encoding: 1.091s
Time taken for autoregressive sampling: 3.470s


In [9]:
print(f"# entities generated: {len(lang_seq.entities)}", "\nFirst 5 entities:", *lang_seq.entities[:5], sep="\n")

# entities generated: 53

First 5 entities:
<src.data.geometries.wall.WallEntity object at 0x7d633a2ea290>
<src.data.geometries.wall.WallEntity object at 0x7d633961d050>
<src.data.geometries.wall.WallEntity object at 0x7d633961c350>
<src.data.geometries.wall.WallEntity object at 0x7d633961d4d0>
<src.data.geometries.wall.WallEntity object at 0x7d633961d550>


In [13]:
print(list(map(lambda entity: type(entity), lang_seq.entities)))

[<class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.wall.WallEntity'>, <class 'src.data.geometries.door.DoorEntity'>, <class 'src.data.geometries.door.DoorEntity'>, <class 'src.data.geometries.door.DoorEntity'>, <class 'src.data.geometries.window.WindowEntity'>, <class 'src.data.geometries.window.WindowEntity'>, <class 'src.data.geometries.window.WindowEntity'>, <class 'src.data.geometries.window.WindowEntity'>, <class 'src.data.geometries.bbox.BboxEntity'>, <class 'src.data.geometries.bbox.BboxEntity'>, <class 'src.data.geometries.bbox.BboxEntity'>, <class 'src.data.geometries.bbox.BboxEntity'>, <class 'src.data.geometries.bbox.BboxEntity'>, <class 'src.data.geometries.bbox.BboxEntity

In [30]:
from collections import Counter, defaultdict
from devtools import pprint

type_counts = Counter(type(e).__name__ for e in lang_seq.entities)

entity_structures = defaultdict(dict)
for e in lang_seq.entities:
    tname = type(e).__name__
    if tname not in entity_structures:
        entity_structures[tname] = {k: type(v).__name__ for k, v in vars(e).items()}
        entity_structures[tname]["params"] = list(vars(e)["params"].keys())


pprint({
    "counts": dict(type_counts),
    "structures": dict(entity_structures)
})

{
    'counts': {
        'WallEntity': 8,
        'DoorEntity': 3,
        'WindowEntity': 4,
        'BboxEntity': 38,
    },
    'structures': {
        'WallEntity': {
            'params': [
                'id',
                'a_x',
                'a_y',
                'a_z',
                'b_x',
                'b_y',
                'b_z',
                'height',
                'thickness',
            ],
        },
        'DoorEntity': {
            'params': [
                'id',
                'position_x',
                'position_y',
                'position_z',
                'width',
                'height',
                'wall0_id',
                'wall1_id',
            ],
            'parent_wall_entity': 'WallEntity',
        },
        'WindowEntity': {
            'params': [
                'id',
                'position_x',
                'position_y',
                'position_z',
                'width',
                'height',
         

In [27]:
bbox_entities = list(filter(lambda e: type(e).__name__ == "BboxEntity", lang_seq.entities))
pprint([vars(bbox_entities[0]), vars(bbox_entities[-2])])


[
    {
        'params': {
            'id': 3000,
            'class': 'mirror',
            'position_x': 1.001638412475586,
            'position_y': -3.2896361351013184,
            'position_z': 1.7710379362106323,
            'angle_z': -3.1416,
            'scale_x': 0.59375,
            'scale_y': 0.03125,
            'scale_z': 0.25,
        },
    },
    {
        'params': {
            'id': 3036,
            'class': 'container',
            'position_x': 1.851637840270996,
            'position_y': 5.610363483428955,
            'position_z': 0.12103807926177979,
            'angle_z': -3.1416,
            'scale_x': 0.28125,
            'scale_y': 0.28125,
            'scale_z': 0.28125,
        },
    },
]


# Visualisation

In [10]:
plot_3d_scene(
    lang_seq,
    point_cloud_obj.points,
    max_points_to_plot=50_000,
    fig_width=1100,
)

The number of points (144544) exceeds the maximum that can be reliably plotted.
Randomly subsampling 50000 points for the plot.


In [11]:
scene_path = ROOT_DIR / ".data/semidense_samples/ase/ase_examples/0"
assert scene_path.exists(), f"Scene path not found: {scene_path}"

In [12]:
from projectaria_tools.projects.ase import readers

scene_language = readers.read_scene_language(f"{scene_path}/ase_scene_language.txt")
trajectory = readers.read_trajectory(f"{scene_path}/trajectory.csv")
instances_map = readers.read_instance_mapping(f"{scene_path}/object_instances_to_classes.json")

# Load semi-dense point cloud
points_df = readers.read_semidense_points(f"{scene_path}/semidense_points.csv.gz")
observations_df = readers.read_observations(f"{scene_path}/semidense_observations.csv.gz")

AttributeError: module 'projectaria_tools.projects.ase.readers' has no attribute 'read_scene_language'

# Entity-Level Analysis for NBV Planning

This section explores how to work with SceneScript entities for Next-Best-View (NBV) planning and Relative Reconstruction Improvement (RRI) computation.

## Understanding SceneScript Entities

SceneScript represents scenes as sequences of **entities** - structured primitives with explicit parameters. Let's analyze the entities generated by the model:

In [None]:
# Analyze entity types and counts
from collections import Counter

entity_types = Counter([e.COMMAND_STRING for e in lang_seq.entities])
print("Entity Type Distribution:")
for entity_type, count in entity_types.items():
    print(f"  {entity_type}: {count}")

# Show detailed parameters for first entity of each type
print("\nExample Entities with Parameters:")
seen_types = set()
for entity in lang_seq.entities:
    if entity.COMMAND_STRING not in seen_types:
        print(f"\n{entity.COMMAND_STRING}:")
        for key, value in entity.params.items():
            print(f"  {key}: {value}")
        seen_types.add(entity.COMMAND_STRING)
        if len(seen_types) >= 3:  # Show first 3 types
            break

## Computing Entity Extents

For NBV planning, we need to know the 3D bounding boxes of each entity. This helps determine which views would best observe specific entities.

In [None]:
# Compute extents for all entities
entity_extents = []
for entity in lang_seq.entities:
    extent = entity.extent()
    entity_extents.append({
        'id': entity.params.get('id', -1),
        'type': entity.COMMAND_STRING,
        'extent': extent,
        'volume': extent['size_x'] * extent['size_y'] * extent['size_z']
    })

# Show extents for first few entities
print("Entity Extents (first 5):")
for i, ext_info in enumerate(entity_extents[:5]):
    print(f"\n{ext_info['type']} (id={ext_info['id']}):")
    print(f"  Min: ({ext_info['extent']['min_x']:.2f}, {ext_info['extent']['min_y']:.2f}, {ext_info['extent']['min_z']:.2f})")
    print(f"  Max: ({ext_info['extent']['max_x']:.2f}, {ext_info['extent']['max_y']:.2f}, {ext_info['extent']['max_z']:.2f})")
    print(f"  Size: ({ext_info['extent']['size_x']:.2f}, {ext_info['extent']['size_y']:.2f}, {ext_info['extent']['size_z']:.2f})")
    print(f"  Volume: {ext_info['volume']:.2f} m³")

## Entity-Aware RRI Computation Concept

For NBV planning, we want to compute **Relative Reconstruction Improvement (RRI)** at the entity level:

### Traditional RRI (VIN-NBV):
```
RRI = (Chamfer_before - Chamfer_after) / Chamfer_before
```

### Entity-Aware RRI (Our Approach):
For each entity $e$:
```
RRI_e = (Error_e_before - Error_e_after) / Error_e_before
```

Where `Error_e` can be:
- **Geometric**: Chamfer distance between predicted and GT entity bounding box
- **Semantic**: IOU between predicted and GT entity parameters
- **Coverage**: Fraction of entity surface observed by cameras

In [None]:
# Demonstrate entity-level error computation (conceptual)
def compute_entity_geometric_error(predicted_entity, gt_entity):
    """
    Compute geometric error between predicted and ground truth entity.

    For walls: Compare corner positions + height
    For doors/windows: Compare center position + width + height
    For bboxes: Compare center + scale + rotation
    """
    pred_extent = predicted_entity.extent()
    gt_extent = gt_entity.extent()

    # Simple metric: Volume difference
    pred_vol = pred_extent['size_x'] * pred_extent['size_y'] * pred_extent['size_z']
    gt_vol = gt_extent['size_x'] * gt_extent['size_y'] * gt_extent['size_z']

    volume_error = abs(pred_vol - gt_vol) / (gt_vol + 1e-6)

    # Center position error
    pred_center = np.array([
        (pred_extent['min_x'] + pred_extent['max_x']) / 2,
        (pred_extent['min_y'] + pred_extent['max_y']) / 2,
        (pred_extent['min_z'] + pred_extent['max_z']) / 2
    ])
    gt_center = np.array([
        (gt_extent['min_x'] + gt_extent['max_x']) / 2,
        (gt_extent['min_y'] + gt_extent['max_y']) / 2,
        (gt_extent['min_z'] + gt_extent['max_z']) / 2
    ])

    center_error = np.linalg.norm(pred_center - gt_center)

    return {
        'volume_error': volume_error,
        'center_error': center_error,
        'combined_error': volume_error + center_error
    }

# Compare predicted vs GT for first wall
gt_lang_seq = LanguageSequence.load_from_file(f"{scene_path}/ase_scene_language.txt")

# Find first wall in both sequences
pred_wall = next(e for e in lang_seq.entities if e.COMMAND_STRING == "make_wall")
gt_wall = next(e for e in gt_lang_seq.entities if e.COMMAND_STRING == "make_wall")

error = compute_entity_geometric_error(pred_wall, gt_wall)
print("Entity-Level Error Example (Predicted vs GT):")
print(f"  Volume Error: {error['volume_error']:.4f}")
print(f"  Center Error: {error['center_error']:.4f} m")
print(f"  Combined Error: {error['combined_error']:.4f}")

## Saving and Loading Language Sequences

SceneScript language sequences can be saved to files and loaded later:

In [None]:
# Generate language string and save
language_string = lang_seq.generate_language_string()

# Save to file
output_path = ROOT_DIR / ".data/predicted_scene_language.txt"
with open(output_path, 'w') as f:
    f.write(language_string)

print(f"Saved predicted scene language to: {output_path}")
print(f"\nFirst 5 lines of output:")
print('\n'.join(language_string.split('\n')[:5]))