In [1]:
import sys
import os
from copy import deepcopy
sys.path.append("./Grounded_Segment_Anything/recognize-anything")
sys.path.append("./Grounded_Segment_Anything/GroundingDINO")
sys.path.append("./Grounded_Segment_Anything/segment_anything")

import numpy as np
import torch
import cv2
import matplotlib.pyplot as plt
# import pykinect_azure as pykinect

# Grounding DINO
import Grounded_Segment_Anything.GroundingDINO.groundingdino.datasets.transforms as T
from Grounded_Segment_Anything.GroundingDINO.groundingdino.models import build_model
from Grounded_Segment_Anything.GroundingDINO.groundingdino.util.slconfig import SLConfig
from Grounded_Segment_Anything.GroundingDINO.groundingdino.util.utils import (
    clean_state_dict, 
    get_phrases_from_posmap
)

# Segment Anything
from Grounded_Segment_Anything.segment_anything.segment_anything import (
    sam_model_registry,
    SamPredictor
)

# Recognize Anything Model & Tag2Text
from ram.models import ram
from ram import inference_ram
import torchvision.transforms as TS

from collections import defaultdict
from PIL import Image

from projections import PointProjector
from aggregator import PointCloudAggregator

GROUNDING_DINO_CONFIG = "Grounded_Segment_Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT = "Grounded_Segment_Anything/groundingdino_swint_ogc.pth"
SAM_CHECKPOINT = "Grounded_Segment_Anything/sam_vit_h_4b8939.pth"
RAM_CHECKPOINT = "Grounded_Segment_Anything/ram_swin_large_14m.pth"
BOX_THRESHOLD = 0.50
TEXT_THRESHOLD = 0.50
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BERT_BASE_UNCASED_PATH = None

# device_config = pykinect.default_configuration
# device_config.color_format = pykinect.K4A_IMAGE_FORMAT_COLOR_BGRA32
# device_config.color_resolution = pykinect.K4A_COLOR_RESOLUTION_720P
# device_config.depth_mode = pykinect.K4A_DEPTH_MODE_NFOV_2X2BINNED

# pykinect.initialize_libraries()

  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
def get_rotation_translation(filepath) -> dict[np.ndarray]: ## filename -> transform
    world_transforms = dict()
    
    skip_header = True
    with open(filepath, 'r') as file:
        for line in file:
            if skip_header:
                skip_header = False
                continue

            data = line.split()[1:] # ignore timestamp
            filename = data.pop() + ".png"
            data = [float(p) for p in data]
            rigid_transform = quaternion_to_rigid_transform(*data)
            
            ## first do E, then try E inverse
            E = np.eye(4)
            E[:3] = rigid_transform
            world_transforms[filename] = E

    return world_transforms

def quaternion_to_rigid_transform(x, y, z, qx, qy, qz, qw) -> np.ndarray:
    # Normalize the quaternion
    norm = np.sqrt(qx**2 + qy**2 + qz**2 + qw**2)
    qx, qy, qz, qw = qx / norm, qy / norm, qz / norm, qw / norm

    # Compute the rotation matrix
    E_inv = np.zeros((3, 4))
    R = np.array([
        [1 - 2*(qy**2 + qz**2), 2*(qx*qy - qz*qw),     2*(qx*qz + qy*qw)],
        [2*(qx*qy + qz*qw),     1 - 2*(qx**2 + qz**2), 2*(qy*qz - qx*qw)],
        [2*(qx*qz - qy*qw),     2*(qy*qz + qx*qw),     1 - 2*(qx**2 + qy**2)]
    ])

    E_inv[:3, :3] = R
    E_inv[:, 3] = np.array([x, y, z])
    return E_inv

def prepare_image(image: np.ndarray):
    transform = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    image_pil = Image.fromarray(image)
    image_tensor, _ = transform(image_pil, None)  # 3, h, w
    return image_pil, image_tensor

def load_grounding_dino(model_config_path, model_checkpoint_path, bert_base_uncased_path, device):
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    args.bert_base_uncased_path = bert_base_uncased_path

    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)

    model.eval()
    return model

def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption += "."

    model = model.to(device)
    image = image.to(device)

    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
        logits = outputs["pred_logits"].cpu().sigmoid()[0]
        boxes = outputs["pred_boxes"].cpu()[0]

    filt_mask = logits.max(dim=1)[0] > box_threshold
    logits_filt = logits[filt_mask]
    boxes_filt = boxes[filt_mask]

    tokenized = model.tokenizer(caption)
    pred_phrases = [
        get_phrases_from_posmap(logit > text_threshold, tokenized, model.tokenizer) +
        (f"({str(logit.max().item())[:4]})" if with_logits else "")
        for logit, _ in zip(logits_filt, boxes_filt)
    ]

    return boxes_filt, pred_phrases

def get_ram_output(model, image_pil):
    normalize = TS.Normalize(mean=[0.485, 0.456, 0.406],
                                        std=[0.229, 0.224, 0.225])
    transform = TS.Compose([
                    TS.Resize((384, 384)),
                    TS.ToTensor(), normalize
                ])    
    
    raw_image = image_pil.resize((384, 384))
    raw_image  = transform(raw_image).unsqueeze(0).to(DEVICE)
    res = inference_ram(raw_image, model)
    tags = res[0].replace(' |', '.')
    return tags

gd_model = load_grounding_dino(GROUNDING_DINO_CONFIG, GROUNDING_DINO_CHECKPOINT, BERT_BASE_UNCASED_PATH, device=DEVICE)
sam_model = SamPredictor(sam_model_registry["vit_h"](checkpoint=SAM_CHECKPOINT).to(DEVICE))
ram_model = ram(pretrained=RAM_CHECKPOINT,
                image_size=384,
                vit='swin_l',
                threshold=0.68).to(DEVICE)
ram_model.eval()

final text_encoder_type: bert-base-uncased
_IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])


BertLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


/encoder/layer/0/crossattention/self/query is tied
/encoder/layer/0/crossattention/self/key is tied
/encoder/layer/0/crossattention/self/value is tied
/encoder/layer/0/crossattention/output/dense is tied
/encoder/layer/0/crossattention/output/LayerNorm is tied
/encoder/layer/0/intermediate/dense is tied
/encoder/layer/0/output/dense is tied
/encoder/layer/0/output/LayerNorm is tied
/encoder/layer/1/crossattention/self/query is tied
/encoder/layer/1/crossattention/self/key is tied
/encoder/layer/1/crossattention/self/value is tied
/encoder/layer/1/crossattention/output/dense is tied
/encoder/layer/1/crossattention/output/LayerNorm is tied
/encoder/layer/1/intermediate/dense is tied
/encoder/layer/1/output/dense is tied
/encoder/layer/1/output/LayerNorm is tied
--------------
Grounded_Segment_Anything/ram_swin_large_14m.pth
--------------
load checkpoint from Grounded_Segment_Anything/ram_swin_large_14m.pth
vit: swin_l


RAM(
  (visual_encoder): SwinTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 192, kernel_size=(4, 4), stride=(4, 4))
      (norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0): BasicLayer(
        dim=192, input_resolution=(96, 96), depth=2
        (blocks): ModuleList(
          (0): SwinTransformerBlock(
            dim=192, input_resolution=(96, 96), num_heads=6, window_size=12, shift_size=0, mlp_ratio=4.0
            (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
            (attn): WindowAttention(
              dim=192, window_size=(12, 12), num_heads=6
              (qkv): Linear(in_features=192, out_features=576, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features=192, out_features=192, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
              (softmax): Softmax(dim=-

In [3]:
## must have /rgb, /depth directories and /poses.txt file
DIRECTORY = "/home/bwilab/Documents/RTAB-Map/kitchen"
# DIRECTORY = "/home/bwilab/Semantic-Mapping-BWI/red_chair"
object_tags = "box. cardboard box. electronic. equipment. office supply. package."

pointclouds = defaultdict(list) # Object label --> list[PointCloud]
projector = PointProjector()
aggregator = PointCloudAggregator(eps=0.50)

all_pointclouds = []

world_transforms = get_rotation_translation(f"{DIRECTORY}/poses.txt")
file_basenames = [os.path.basename(file) for file in os.listdir(f"{DIRECTORY}/rgb")]
file_basenames.sort(key=lambda x: int(x[:-4]))
rgb_images = [f"{DIRECTORY}/rgb/{file}" for file in file_basenames]
depth_images = [f"{DIRECTORY}/depth/{file}" for file in file_basenames]
assert(len(rgb_images) == len(depth_images) == len(world_transforms))

# rgb_images = rgb_images[:1]
# depth_images = depth_images[:1]

# fig, axs = plt.subplots(2, 2, figsize=(16, 8))
# axs[0, 0].set_axis_off(); axs[0, 1].set_axis_off(); axs[1, 0].set_axis_off(); axs[1, 1].set_axis_off()
# def show_box(box, ax, label):
#     x0, y0 = box[0], box[1]
#     w, h = box[2] - box[0], box[3] - box[1]
#     ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
#     ax.text(x0, y0, label)

for frame_idx, (rgb_path, depth_path) in enumerate(zip(rgb_images, depth_images)):
    print(f"Processing frame {frame_idx+1}/{len(rgb_images)}")
    scene = defaultdict(list) # Object label --> list[masks]
    transform = world_transforms[os.path.basename(rgb_path)]

    ## Make sure frames match
    assert os.path.basename(rgb_path) == os.path.basename(depth_path)

    with Image.open(rgb_path) as color_image, Image.open(depth_path) as depth_image:
        ## Feed through RAM
        object_tags = get_ram_output(ram_model, color_image)
        object_tags += " floor."

        ## Make sure images are same dims
        color_image, depth_image = np.array(color_image), np.array(depth_image)
        resized_color_image = cv2.resize(color_image, depth_image.shape[::-1])
    
    ## Feed through DINO
    image_pil, image_tensor = prepare_image(resized_color_image)
    boxes, pred_phrases = get_grounding_output(
        gd_model, image_tensor, object_tags, BOX_THRESHOLD, TEXT_THRESHOLD, device=DEVICE
    )

    ## Prepare SAM
    if torch.numel(boxes) == 0: # nothing found in frame
        continue
    
    sam_model.set_image(resized_color_image)
    W, H = image_pil.size
    for i in range(boxes.size(0)):
        boxes[i] *= torch.Tensor([W, H, W, H])
        boxes[i][:2] -= boxes[i][2:] / 2
        boxes[i][2:] += boxes[i][:2]

    ## SAM outputs
    transformed_boxes = sam_model.transform.apply_boxes_torch(boxes, resized_color_image.shape[:2]).to(DEVICE)
    masks, _, _ = sam_model.predict_torch(
        point_coords=None, point_labels=None, boxes=transformed_boxes.to(DEVICE), multimask_output=False
    )

    ## Associate outputs
        ## How to use
        # color_pixels = color_image * mask[:, :, None]
        # depth_pixels = depth_image * mask
    for mask, box, label in zip(masks, boxes, pred_phrases):
        mask = mask[0].cpu().numpy()
        x0, y0 = box[0], box[1]
        label = label[:label.index('(')] # remove confidence
        scene[label].append(mask)

    all_masks = np.zeros_like(resized_color_image)
    ## Generating pointclouds
    for label in scene:
        for mask in scene[label]:
            masked_depth_image = depth_image * mask
            masked_color_image = resized_color_image * mask[:, :, None]
            all_masks |= mask[:, :, None]
            # plt.imshow(masked_color_image)
            # plt.show()
            # plt.imshow(masked_depth_image)
            # plt.show()

            pcl = projector.get_pointcloud(masked_depth_image, masked_color_image, stride=3)
            pcl.clean(verbose=False)
            if pcl.is_empty():
                continue

            pcl.label = label
            pcl.transformation = transform
            pcl.transform(transform)
            instance = aggregator.nearest_pointcloud(pcl)
            aggregator.aggregate_pointcloud(pcl, instance)

    # axs[0, 0].imshow(resized_color_image) # camera input
    # axs[0, 1].imshow(depth_image)
    # axs[1, 0].imshow(resized_color_image)
    # for box, label in zip(boxes, pred_phrases):
    #     show_box(box, axs[1, 0], label)
    # axs[1, 1].imshow(resized_color_image * all_masks) # sam frame
    # plt.show()

# aggregator.refine_views()
# projector.visualize(aggregator.main)

# aggregator.main.transform_to_rtab().save("semantic_blue_chair.ply")

pred_phrases

Processing frame 1/52
Processing frame 2/52
Processing frame 3/52
Processing frame 4/52
Processing frame 5/52
Processing frame 6/52
Processing frame 7/52
Processing frame 8/52
Processing frame 9/52
Processing frame 10/52
Processing frame 11/52
Processing frame 12/52
Processing frame 13/52
Processing frame 14/52
Processing frame 15/52
Processing frame 16/52
Processing frame 17/52
Processing frame 18/52
Processing frame 19/52
Processing frame 20/52
Processing frame 21/52
Processing frame 22/52
Processing frame 23/52
Processing frame 24/52
Processing frame 25/52
Processing frame 26/52
Processing frame 27/52
Processing frame 28/52
Processing frame 29/52
Processing frame 30/52
Processing frame 31/52
Processing frame 32/52
Processing frame 33/52
Processing frame 34/52
Processing frame 35/52
Processing frame 36/52
Processing frame 37/52
Processing frame 38/52
Processing frame 39/52
Processing frame 40/52
Processing frame 41/52
Processing frame 42/52
Processing frame 43/52
Processing frame 44/

['microwave(0.78)']

In [4]:
# from copy import deepcopy

# agg = deepcopy(aggregator)
# agg.refine_views()
# projector.visualize(agg.scene['package'])

In [8]:

from pointcloud import PointCloud
import random
agg = deepcopy(aggregator)

for key in agg.scene:
    a = PointCloud()
    r,g,b = random.random(), random.random(), random.random()
    for pcl in agg.scene[key]:
        random_color = np.array([[r,g,b] for i in range(len(pcl))])
        pcl.colors = random_color
        a += pcl
    a.save(f"kitchen/{key}.ply")

projector.visualize(a)

In [6]:
# agg = deepcopy(aggregator)

# for instance in agg._scene['package']:
#     for view in instance:
#         random_color = np.random.uniform(size=(3,))
#         for pcl in view.pointclouds:
#             pcl._pcl.paint_uniform_color(random_color)

# all_views = []
# for instance in agg._scene['package']:
#     for view in instance:
#         all_views += [view.get_pointcloud()]
            

# projector.visualize(all_views)

In [7]:
# from pointcloud import PointCloud
# agg = deepcopy(aggregator)

# all_pointclouds = []
# for label in agg._scene:
#     for instance in agg._scene[label]:
#         new_instance = PointCloud()
#         random_color = np.random.uniform(size=(3,))

#         for view in instance:
#             new_instance += view.get_pointcloud()

#         new_instance._pcl.paint_uniform_color(random_color)
#         all_pointclouds += [new_instance]

# projector.visualize(all_pointclouds)