In [1]:
import json
import os
import subprocess

import cv2
import matplotlib.pyplot as plt
import numpy as np
import open3d as o3d

# Base directory for the data, script and output paths built below:
# the current working directory of the kernel process.
current_directory = os.getcwd()

## Load ScanNet scene

In [2]:
scan = "scene0000_00"

# All files of a scan live in data/scans/<scan>/ and are prefixed with the scan id.
scan_path = os.path.join(current_directory, "data/scans", scan)
scene_mesh_path = os.path.join(scan_path, scan + "_vh_clean_2.ply")
segmentation_mesh_path = os.path.join(scan_path, scan + "_vh_clean_2.labels.ply")

# Read both meshes: the scene mesh and its ".labels" counterpart.
scene_mesh = o3d.io.read_triangle_mesh(scene_mesh_path)
segmentation_mesh = o3d.io.read_triangle_mesh(segmentation_mesh_path)

# o3d.visualization.draw_geometries([scene_mesh])

## Select a class

The class must be one of the `raw_category` values listed in [scannetv2-labels.combined.tsv](data/scannetv2-labels.combined.tsv)

In [3]:
# The aggregation file groups mesh segments into labelled object instances.
aggregation_path = os.path.join(scan_path, scan + '_vh_clean.aggregation.json')
with open(aggregation_path) as f:
    aggregation_data = json.load(f)

# Distinct labels over all segment groups of the scene.
all_classes = {group['label'] for group in aggregation_data['segGroups']}

print("All labels in the aggregation data: ", all_classes)

All labels in the aggregation data:  {'kitchen cabinets', 'bicycle', 'kitchen counter', 'curtain', 'clock', 'tissue box', 'cabinet', 'window', 'toilet', 'floor', 'trash can', 'table', 'sink', 'toaster', 'dish rack', 'shelf', 'toaster oven', 'bed', 'pillow', 'couch', 'mirror', 'laundry basket', 'guitar case', 'stool', 'backpack', 'desk', 'ceiling', 'door', 'wall', 'microwave', 'nightstand', 'coffee table', 'object', 'doorframe', 'scale', 'refrigerator', 'guitar', 'shoes', 'tv', 'shower'}


In [4]:
class_name = 'bed'

aggregation_path = os.path.join(scan_path, scan + '_vh_clean.aggregation.json')
with open(aggregation_path) as f:
    aggregation_data = json.load(f)

# objectId of every segment group whose label matches the chosen class.
object_ids = [
    group['objectId']
    for group in aggregation_data['segGroups']
    if group['label'] == class_name
]

if object_ids:
    print(f"Found {len(object_ids)} objects for class {class_name}: {object_ids}")
else:
    print("No objects found for class:", class_name)


Found 1 objects for class bed: [37]


## Select an object

In [5]:
object_id = 32  # objectId of the instance to extract (must exist in the aggregation file)

# Segment ids that make up the selected instance. Start from None so that a
# missing objectId fails loudly instead of reusing a value from an earlier run.
selected_instance_segments = None
for seg_group in aggregation_data['segGroups']:
    if seg_group['objectId'] == object_id:
        selected_instance_segments = seg_group['segments']
if selected_instance_segments is None:
    raise ValueError(f"No object with objectId {object_id} in the aggregation data")

# Load the segmentation file; 'segIndices' holds one segment id per entry
# (used below as one entry per scene-mesh vertex).
with open(os.path.join(scan_path, scan + '_vh_clean_2.0.010000.segs.json')) as f:
    segmentation_data = json.load(f)
all_vertices = np.array(segmentation_data['segIndices'])

# Indices (into the scene mesh) of the vertices that belong to the instance
instance_vertices_mask = np.isin(all_vertices, selected_instance_segments)
instance_vertex_indices = np.where(instance_vertices_mask)[0]

scene_vertices = np.asarray(scene_mesh.vertices)
instance_vertices = scene_vertices[instance_vertex_indices]  # Coordinates of each instance vertex

# Lookup table scene vertex index -> instance vertex index (-1 = not part of the instance).
# An array lookup also works for an empty selection, unlike np.vectorize(dict.get).
vertex_remap = np.full(len(scene_vertices), -1, dtype=np.int32)
vertex_remap[instance_vertex_indices] = np.arange(len(instance_vertex_indices), dtype=np.int32)

# Filter faces of the mesh: a face is kept only if all its vertices are part of the instance
faces = np.asarray(scene_mesh.triangles)
remapped_faces = vertex_remap[faces]
face_mask = np.all(remapped_faces >= 0, axis=1)
instance_faces = remapped_faces[face_mask]  # Triplets of indices into instance_vertices

# Create the mesh for the selected instance
instance_mesh = o3d.geometry.TriangleMesh()
instance_mesh.vertices = o3d.utility.Vector3dVector(instance_vertices)
instance_mesh.triangles = o3d.utility.Vector3iVector(instance_faces)

# o3d.visualization.draw_geometries([instance_mesh])

## Get camera parameters

### Export camera parameters
**Note:** This step should be performed only at the beginning of the process.

In [22]:
reader_directory = os.path.join(current_directory, "scripts/SensReader")
sens_file = os.path.join(scan_path, scan + ".sens")
output_directory = os.path.join(current_directory, "outputs/reader/"+scan)

# Pass the arguments as a list (no shell) so that paths containing spaces or
# shell metacharacters are forwarded verbatim to the reader script.
reader_command = [
    "python", os.path.join(reader_directory, "reader.py"),
    "--filename", sens_file,
    "--output_path", output_directory,
    "--export_color_images",
    "--export_poses",
    "--export_intrinsics",
]
# check=True raises CalledProcessError if the reader exits with a non-zero status,
# instead of silently continuing with a missing or partial export.
reader_run = subprocess.run(reader_command, check=True)

Namespace(filename='/Users/lara/Desktop/Making-CLIP-features-multiview-consistent/data/scans/scene0000_00/scene0000_00.sens', output_path='/Users/lara/Desktop/Making-CLIP-features-multiview-consistent/outputs/reader/scene0000_00', export_depth_images=False, export_color_images=True, export_poses=True, export_intrinsics=True)
loading /Users/lara/Desktop/Making-CLIP-features-multiview-consistent/data/scans/scene0000_00/scene0000_00.sens...loaded!
exporting 5578 color frames to /Users/lara/Desktop/Making-CLIP-features-multiview-consistent/outputs/reader/scene0000_00/color


### Load camera parameters

In [6]:
output_directory = os.path.join(current_directory, "outputs/reader/"+scan)
pose_directory = os.path.join(output_directory, "pose")
originals_directory = os.path.join(output_directory, "color")

# os.listdir returns entries in arbitrary order; sort the pose files by their
# numeric frame index so frames are always processed in the same order.
pose_files = sorted(
    (f for f in os.listdir(pose_directory) if f.endswith('.txt')),
    key=lambda name: int(name.split('.')[0]),
)
frame_indices = [int(f.split('.')[0]) for f in pose_files]

intrinsics = np.loadtxt(os.path.join(output_directory, "intrinsic", "intrinsic_color.txt"))  # Camera intrinsics
extrinsics = np.loadtxt(os.path.join(output_directory, "intrinsic", "extrinsic_color.txt"))  # Camera extrinsics
camera_intrinsics = intrinsics[:3, :3]  # upper-left 3x3 block, used for the projection

## Select frames that contain the specific object

### Select minimum_vertices in an image
Select the minimum percentage of the object's vertices that must project inside the image for a frame to be kept

In [7]:
percentage = 60

# Convert the percentage into an absolute vertex count (truncated to an integer).
total_vertices = len(instance_vertices)
minimum_vertices = int(percentage / 100 * total_vertices)

## Get masked images

In [8]:
# Frames that pass the visibility test; filled by the frame loop below.
selected_frames = []

# One output folder per scan and object id.
masked_images_directory = os.path.join(current_directory, "outputs/masked_images/project_with_occlusion/"+scan, f"{object_id}")
# exist_ok=True creates the folder only when needed, without a separate existence check.
os.makedirs(masked_images_directory, exist_ok=True)


def project_vertices(points_3d, intrinsic, extrinsic, return_mask=False):
    """Project 3D points onto the image plane of a pinhole camera.

    Parameters
    ----------
    points_3d : array of shape (N, 3)
        Point coordinates.
    intrinsic : array of shape (3, 3)
        Camera intrinsic matrix.
    extrinsic : array with 4 columns
        Transform applied to the homogeneous points to bring them into the
        camera frame.
    return_mask : bool, optional
        When True, also return the boolean mask of points with positive depth.

    Returns
    -------
    points_2d : array of shape (M, 2)
        Pixel coordinates of the points with positive depth only (M <= N),
        in input order. Points behind the camera are dropped, so row i of the
        result does NOT generally correspond to row i of ``points_3d``.
    in_front : array of shape (N,), only if ``return_mask`` is True
        True for the input points that were kept.
    """
    points_3d_homogeneous = np.hstack([points_3d, np.ones((points_3d.shape[0], 1))])
    points_camera = np.dot(extrinsic, points_3d_homogeneous.T).T
    points_camera_xyz = points_camera[:, :3]  # the intrinsic matrix is 3x3
    points_2d_homogeneous = np.dot(intrinsic, points_camera_xyz.T).T  # the third value is the depth
    in_front = points_2d_homogeneous[:, 2] > 0  # keep only points in front of the camera
    points_in_front = points_2d_homogeneous[in_front]
    points_2d = points_in_front[:, :2] / points_in_front[:, 2, np.newaxis]
    if return_mask:
        return points_2d, in_front
    return points_2d

def draw_triangle(img, vertices, color):
    """Fill the convex polygon given by integer pixel `vertices` on `img`, in place."""
    cv2.fillConvexPoly(img, vertices, color)

example_path = os.path.join(originals_directory, "0.jpg")
example_img = cv2.imread(example_path) # image used to get the image shape
if example_img is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(f"Could not read example image: {example_path}")
image_height, image_width = example_img.shape[:2]

# Loop-invariant mesh data, converted to NumPy once instead of once per frame
mesh_vertices_3d = np.asarray(instance_mesh.vertices)
mesh_triangles = np.asarray(instance_mesh.triangles)

for frame_index in frame_indices:
    pose_path = os.path.join(pose_directory, f"{frame_index}.txt")
    original_path = os.path.join(originals_directory, f"{frame_index}.jpg")
    pose = np.loadtxt(pose_path)
    if not np.all(np.isfinite(pose)):
        # A pose with inf/nan entries cannot be inverted into a usable camera matrix
        continue
    camera_extrinsics = np.linalg.inv(pose)

    object_vertices_2d, in_front = project_vertices(
        mesh_vertices_3d, camera_intrinsics, camera_extrinsics, return_mask=True
    )
    in_bounds_mask = (object_vertices_2d[:, 0] >= 0) & (object_vertices_2d[:, 0] < image_width) & (object_vertices_2d[:, 1] >= 0) & (object_vertices_2d[:, 1] < image_height)
    in_bounds_points_2d = object_vertices_2d[in_bounds_mask]

    if in_bounds_points_2d.shape[0] < minimum_vertices:
        continue

    # Read the colour frame only for frames that pass the visibility test
    original = cv2.imread(original_path)
    if original is None:
        print(f"Skipping frame {frame_index}: could not read {original_path}")
        continue
    selected_frames.append(frame_index)

    # project_vertices drops points behind the camera, so scatter its output back
    # into a per-vertex array; triangle indices then address the right pixels.
    vertex_pixels = np.zeros((mesh_vertices_3d.shape[0], 2), dtype=np.int32)
    vertex_pixels[in_front] = object_vertices_2d.astype(np.int32)
    # Only faces whose three vertices are all in front of the camera can be drawn
    drawable_faces = mesh_triangles[np.all(in_front[mesh_triangles], axis=1)]

    binary_mask = np.zeros((image_height, image_width), dtype=np.uint8)
    for face in drawable_faces:
        draw_triangle(binary_mask, vertex_pixels[face], 1)

    masked_image = cv2.bitwise_and(original, original, mask=binary_mask)
    output_path = os.path.join(masked_images_directory, f"{frame_index:05d}.jpg")
    cv2.imwrite(output_path, masked_image)