# SpatialLM Inference Notebook
This notebook breaks down the inference.py workflow into interactive cells.

In [1]:
# Cell 1: Imports and Constants
import os
import sys
import glob
import json
from pathlib import Path

import torch
import numpy as np
from tqdm import tqdm
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextIteratorStreamer, set_seed

# Add project root to path
PROJ_PATH = "/mnt/cluster/workspaces/jinjingxu/proj/vlm/SpatialLM"
DATA_ROOT = '/data/horse/ws/jixu233b-metadata_ws/datasets/arkitscenes-spatiallm/'
DATA_ROOT = '/mnt/nct-zfs/TCO-All/SharedDatasets/arkitscenes-spatiallm/'

PROJECT_ROOT = Path(PROJ_PATH)
sys.path.insert(0, str(PROJECT_ROOT))

from spatiallm import Layout
from spatiallm.pcd import load_o3d_pcd, get_points_and_colors, cleanup_pcd, Compose

DETECT_TYPE_PROMPT = {
    "all": "Detect walls, doors, windows, boxes.",
    "arch": "Detect walls, doors, windows.",
    "object": "Detect boxes.",
}

print("✓ Imports loaded")

  from .autonotebook import tqdm as notebook_tqdm


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
✓ Imports loaded


In [2]:
# Cell 2: Define Helper Functions

def preprocess_point_cloud(points, colors, grid_size, num_bins):
    """Preprocess point cloud into tensor features."""
    transform = Compose([
        dict(type="PositiveShift"),
        dict(type="NormalizeColor"),
        dict(
            type="GridSample",
            grid_size=grid_size,
            hash_type="fnv",
            mode="test",
            keys=("coord", "color"),
            return_grid_coord=True,
            max_grid_coord=num_bins,
        ),
    ])
    point_cloud = transform({
        "name": "pcd",
        "coord": points.copy(),
        "color": colors.copy(),
    })
    coord = point_cloud["grid_coord"]
    xyz = point_cloud["coord"]
    rgb = point_cloud["color"]
    point_cloud = np.concatenate([coord, xyz, rgb], axis=1)
    return torch.as_tensor(np.stack([point_cloud], axis=0))


def generate_layout(model, point_cloud, tokenizer, code_template_file, 
                    top_k=10, top_p=0.95, temperature=0.6, num_beams=1,
                    seed=-1, max_new_tokens=4096, detect_type="all", categories=[]):
    """Generate layout from point cloud using the model."""
    if seed >= 0:
        set_seed(seed)
    
    with open(code_template_file, "r") as f:
        code_template = f.read()
    
    task_prompt = DETECT_TYPE_PROMPT[detect_type]
    if detect_type != "arch" and categories:
        task_prompt = task_prompt.replace("boxes", ", ".join(categories))
    print("Task prompt:", task_prompt)
    
    prompt = f"<|point_start|><|point_pad|><|point_end|>{task_prompt} The reference code is as followed: {code_template}"
    
    if model.config.model_type == "spatiallm_qwen":
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
    else:
        conversation = [{"role": "user", "content": prompt}]
    
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    input_ids = input_ids.to(model.device)
    
    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    
    generate_kwargs = dict(
        {"input_ids": input_ids, "point_clouds": point_cloud},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        use_cache=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    
    print("Generating layout...\\n")
    generate_texts = []
    for text in streamer:
        generate_texts.append(text)
        print(text, end="", flush=True)
    print("\\nDone!")
    
    layout_str = "".join(generate_texts)
    layout = Layout(layout_str)
    layout.undiscretize_and_unnormalize(num_bins=model.config.point_config["num_bins"])
    return layout

print("✓ Helper functions defined")

✓ Helper functions defined


In [3]:
# Cell 3: Set Parameters
DATA_ROOT = Path(DATA_ROOT)

# Input/Output paths
point_cloud_path = DATA_ROOT / "pcd/40753679.ply"
output_path = PROJECT_ROOT / "outputs/scene40753679.txt"

# Model configuration
model_path = "manycore-research/SpatialLM1.1-Qwen-0.5B"
disable_flash_attn = True  # Set to True for V100/older GPUs, False for A100+
VLM_PE = None  # None: standard 1D RoPE (default), "CCA_2DProj": Concentric Causal Attention with 2D projection

# Inference parameters
detect_type = "all"
categories = []
code_template_file = str(PROJECT_ROOT / "code_template.txt")

# Generation parameters
top_k = 10
top_p = 0.95
temperature = 0.6
num_beams = 1
seed = -1
max_new_tokens = 4096
inference_dtype = "bfloat16"
no_cleanup = False

# Optional: JSON file to filter specific samples
json_file = None

print("✓ Parameters configured")

✓ Parameters configured


In [4]:
# Cell 4: Diagnostic - Check CUDA Status
import torch
import sys

print("=== CUDA Diagnostic ===")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"CUDA device count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Device capability: {torch.cuda.get_device_capability(0)}")

print(f"\nPython executable: {sys.executable}")
print(f"Python version: {sys.version.split()[0]}")

# Try to initialize CUDA explicitly
try:
    torch.cuda.init()
    print("\n✓ CUDA initialization successful")
except Exception as e:
    print(f"\n✗ CUDA initialization failed: {e}")

# Try to create a small tensor on CUDA
try:
    test_tensor = torch.zeros(1).cuda()
    print(f"✓ Test tensor created on CUDA: {test_tensor.device}")
    del test_tensor
except Exception as e:
    print(f"✗ Failed to create tensor on CUDA: {e}")

=== CUDA Diagnostic ===
PyTorch version: 2.4.1+cu124
CUDA available: True
CUDA version: 12.4
CUDA device count: 1
Current CUDA device: 0
Device name: NVIDIA GeForce RTX 2080 Ti
Device capability: (7, 5)

Python executable: /mnt/cluster/environments/jinjingxu/pkg/envs/spatiallm/bin/python
Python version: 3.11.14

✓ CUDA initialization successful
✓ Test tensor created on CUDA: cuda:0


In [None]:
# Cell 4: Load Model and Tokenizer
print(f"Loading model from {model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Check if we need to modify config
if disable_flash_attn or VLM_PE is not None:
    from transformers import AutoConfig
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    # Disable flash attention if requested
    if disable_flash_attn:
        if hasattr(config, 'point_config'):
            config.point_config['enable_flash'] = False
    # Set VLM_PE if specified
    if VLM_PE is not None:
        config.VLM_PE = VLM_PE
    model = AutoModelForCausalLM.from_pretrained(
        model_path, 
        config=config,
        torch_dtype=getattr(torch, inference_dtype)
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=getattr(torch, inference_dtype)
    )

model.to("cuda")
model.set_point_backbone_dtype(torch.float32)
model.eval()

num_bins = model.config.point_config["num_bins"]

print(f"✓ Model loaded")
print(f"  - Type: {model.config.model_type}")
print(f"  - Bins: {num_bins}")
print(f"  - Device: {model.device}")
print(f"  - FlashAttention: {not disable_flash_attn}")
print(f"  - VLM_PE: {VLM_PE if VLM_PE is not None else 'None (standard RoPE)'}")

Loading model from manycore-research/SpatialLM1.1-Qwen-0.5B...


  _TORCH_CUSTOM_FWD = amp.custom_fwd(cast_inputs=torch.float16)
  @_TORCH_CUSTOM_BWD
  @_TORCH_CUSTOM_BWD
  @_TORCH_CUSTOM_BWD
  @_TORCH_CUSTOM_BWD
  @_TORCH_CUSTOM_BWD
  @_TORCH_CUSTOM_BWD
  @_TORCH_CUSTOM_BWD


There are 2 PE in Sonata Model:
Enc stage:
Enable flash attention: False. Enable RPE: False.
Apply APE in SerializedAttention
Input_proj stage:
Enable Fourier Encode: True.
✓ Model loaded
  - Type: spatiallm_qwen
  - Bins: 1280
  - Device: cuda:0
  - FlashAttention: False
  - VLM_PE: None (standard RoPE)


In [6]:
print('Pointcloud path: ', point_cloud_path)
# Cell 5: Discover Point Cloud Files
if os.path.isfile(point_cloud_path):
    point_cloud_files = [point_cloud_path]
else:
    if json_file is not None:
        print(f"Loading samples from JSON: {json_file}")
        with open(json_file, "r") as f:
            json_data = json.load(f)
        
        point_cloud_files = []
        for item in json_data:
            pcd_files = []
            if isinstance(item, str):
                pcd_files = [item]
            elif isinstance(item, dict):
                value = (item.get("point_clouds") or item.get("point_cloud") or 
                        item.get("file_name") or item.get("pcd_file") or 
                        item.get("scene_id") or item.get("id"))
                pcd_files = value if isinstance(value, list) else [value] if value else []
            
            for pcd_file in pcd_files:
                pcd_file = pcd_file.replace("pcd/", "")
                if not pcd_file.endswith(".ply"):
                    pcd_file = f"{pcd_file}.ply"
                
                full_path = os.path.join(point_cloud_path, pcd_file)
                if os.path.exists(full_path):
                    point_cloud_files.append(full_path)
                else:
                    print(f"Warning: Not found: {full_path}")
        print(f"Found {len(point_cloud_files)} samples from JSON")
    else:
        point_cloud_files = glob.glob(os.path.join(point_cloud_path, "*.ply"))

print(f"✓ Found {len(point_cloud_files)} file(s)")
for i, f in enumerate(point_cloud_files[:5]):
    print(f"  [{i+1}] {os.path.basename(f)}")
if len(point_cloud_files) > 5:
    print(f"  ... +{len(point_cloud_files)-5} more")

Pointcloud path:  /mnt/nct-zfs/TCO-All/SharedDatasets/arkitscenes-spatiallm/pcd/40753679.ply
✓ Found 1 file(s)
  [1] 40753679.ply


In [7]:
# Cell 6: Load and Preprocess Point Cloud
current_file_idx = 0
point_cloud_file = point_cloud_files[current_file_idx]
print(f"Processing: {os.path.basename(point_cloud_file)}")

point_cloud = load_o3d_pcd(str(point_cloud_file))
grid_size = Layout.get_grid_size(num_bins)

if not no_cleanup:
    point_cloud = cleanup_pcd(point_cloud, voxel_size=grid_size)

points, colors = get_points_and_colors(point_cloud)
min_extent = np.min(points, axis=0)

print(f"✓ Point cloud loaded")
print(f"  - Points: {len(points)}")
print(f"  - Range: {np.min(points, axis=0)} to {np.max(points, axis=0)}")
print(f"  - Grid: {grid_size}")

input_pcd = preprocess_point_cloud(points, colors, grid_size, num_bins)
print(f"✓ Preprocessed: {input_pcd.shape}")

Processing: 40753679.ply
✓ Point cloud loaded
  - Points: 148586
  - Range: [-0.80251148 -3.29999995 -1.80055124] to [5.89757013 2.07263744 1.30499995]
  - Grid: 0.025
✓ Preprocessed: torch.Size([1, 106328, 9])


In [8]:
# Cell 7: Generate Layout
layout = generate_layout(
    model, input_pcd, tokenizer, code_template_file,
    top_k=top_k, top_p=top_p, temperature=temperature,
    num_beams=num_beams, seed=seed, detect_type=detect_type,
    categories=categories
)

layout.translate(min_extent)
print(f"\\n✓ Layout generated")

Task prompt: Detect walls, doors, windows, boxes.
Generating layout...\n


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


wall_0=Wall(203,0,10,228,0,10,140,0)
wall_1=Wall(203,0,10,185,6,10,140,0)
wall_2=Wall(228,0,10,228,75,10,140,0)
wall_3=Wall(185,6,10,178,10,10,140,0)
wall_4=Wall(178,10,10,169,15,10,140,0)
wall_5=Wall(169,15,10,160,21,10,140,0)
wall_6=Wall(160,21,10,153,28,10,140,0)
wall_7=Wall(153,28,10,144,34,10,140,0)
wall_8=Wall(144,34,10,135,40,10,140,0)
wall_9=Wall(135,40,10,124,45,10,140,0)
wall_10=Wall(124,45,10,116,49,10,140,0)
wall_11=Wall(116,49,10,102,56,10,140,0)
wall_12=Wall(102,56,10,93,60,10,140,0)
wall_13=Wall(93,60,10,87,65,10,140,0)
wall_14=Wall(87,65,10,77,73,10,140,0)
wall_15=Wall(77,73,10,66,83,10,140,0)
wall_16=Wall(180,75,10,228,75,10,140,0)
wall_17=Wall(180,75,10,180,87,10,140,0)
wall_18=Wall(66,83,10,58,91,10,140,0)
wall_19=Wall(180,87,10,174,96,10,140,0)
wall_20=Wall(58,91,10,51,99,10,140,0)
wall_21=Wall(174,96,10,167,104,10,140,0)
wall_22=Wall(51,99,10,45,105,10,140,0)
wall_23=Wall(167,104,10,160,111,10,140,0)
wall_24=Wall(45,105,10,37,111,10,140,0)
wall_25=Wall(37,111,10,28

In [9]:
# Cell 8: Save Output
pred_language_string = layout.to_language_string()

if os.path.splitext(output_path)[-1]:
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(pred_language_string)
    print(f"✓ Saved to: {output_path}")
else:
    output_filename = os.path.basename(point_cloud_file).replace(".ply", ".txt")
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, output_filename)
    with open(output_file, "w") as f:
        f.write(pred_language_string)
    print(f"✓ Saved to: {output_file}")

print("\\n--- Preview (500 chars) ---")
print(pred_language_string[:500])
if len(pred_language_string) > 500:
    print(f"... +{len(pred_language_string)-500} chars")

✓ Saved to: /mnt/cluster/workspaces/jinjingxu/proj/vlm/SpatialLM/outputs/scene40753679.txt
\n--- Preview (500 chars) ---
wall_0=Wall(4.272488516569138,-3.299999952316284,-1.5505512356758118,4.897488516569138,-3.299999952316284,-1.5505512356758118,2.8000000000000003,0.0)
wall_1=Wall(4.272488516569138,-3.299999952316284,-1.5505512356758118,3.8224885165691376,-3.1499999523162843,-1.5505512356758118,2.8000000000000003,0.0)
wall_2=Wall(4.897488516569138,-3.299999952316284,-1.5505512356758118,4.897488516569138,-1.4249999523162842,-1.5505512356758118,2.8000000000000003,0.0)
wall_3=Wall(3.8224885165691376,-3.1499999523162
... +16851 chars


## Optional: Batch Processing
Uncomment the cell below to process all files in batch mode.

In [10]:
# Cell 9: Batch Process All Files (Optional)
# Uncomment to run batch processing

# for point_cloud_file in tqdm(point_cloud_files):
#     point_cloud = load_o3d_pcd(str(point_cloud_file))
#     grid_size = Layout.get_grid_size(num_bins)
#     if not no_cleanup:
#         point_cloud = cleanup_pcd(point_cloud, voxel_size=grid_size)
#     points, colors = get_points_and_colors(point_cloud)
#     min_extent = np.min(points, axis=0)
#     input_pcd = preprocess_point_cloud(points, colors, grid_size, num_bins)
#     layout = generate_layout(model, input_pcd, tokenizer, code_template_file,
#                              top_k=top_k, top_p=top_p, temperature=temperature,
#                              num_beams=num_beams, seed=seed, detect_type=detect_type,
#                              categories=categories)
#     layout.translate(min_extent)
#     pred_language_string = layout.to_language_string()
#     if os.path.splitext(output_path)[-1]:
#         with open(output_path, "w") as f:
#             f.write(pred_language_string)
#     else:
#         output_filename = os.path.basename(point_cloud_file).replace(".ply", ".txt")
#         os.makedirs(output_path, exist_ok=True)
#         with open(os.path.join(output_path, output_filename), "w") as f:
#             f.write(pred_language_string)
# print(f"✓ Batch complete: {len(point_cloud_files)} files")