In [1]:
import sys
import os
sys.path.append("./Grounded_Segment_Anything/recognize-anything")
sys.path.append("./Grounded_Segment_Anything/GroundingDINO")
sys.path.append("./Grounded_Segment_Anything/segment_anything")

import numpy as np
import torch
import cv2
import matplotlib.pyplot as plt
# import pykinect_azure as pykinect
import open3d as o3d

# Grounding DINO
import Grounded_Segment_Anything.GroundingDINO.groundingdino.datasets.transforms as T
from Grounded_Segment_Anything.GroundingDINO.groundingdino.models import build_model
from Grounded_Segment_Anything.GroundingDINO.groundingdino.util.slconfig import SLConfig
from Grounded_Segment_Anything.GroundingDINO.groundingdino.util.utils import (
    clean_state_dict, 
    get_phrases_from_posmap
)

# Segment Anything
from Grounded_Segment_Anything.segment_anything.segment_anything import (
    sam_model_registry,
    SamPredictor
)

from collections import defaultdict
from PIL import Image

from pointcloud import PointCloud
from projections import PointProjector
from aggregator import PointCloudAggregator

# Relative paths to the GroundingDINO config/checkpoint and the SAM checkpoint.
GROUNDING_DINO_CONFIG = "Grounded_Segment_Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT = "Grounded_Segment_Anything/groundingdino_swint_ogc.pth"
SAM_CHECKPOINT = "Grounded_Segment_Anything/sam_vit_h_4b8939.pth"
# A detection is kept when its highest per-token score exceeds BOX_THRESHOLD.
BOX_THRESHOLD = 0.3
# Per-token score cut-off used when turning a kept detection into a phrase.
TEXT_THRESHOLD = 0.25
# Device that the models and input tensors are moved to.
DEVICE = "cuda"
# Stored on the GroundingDINO config as `bert_base_uncased_path` by load_model.
BERT_BASE_UNCASED_PATH = None

# device_config = pykinect.default_configuration
# device_config.color_format = pykinect.K4A_IMAGE_FORMAT_COLOR_BGRA32
# device_config.color_resolution = pykinect.K4A_COLOR_RESOLUTION_720P
# device_config.depth_mode = pykinect.K4A_DEPTH_MODE_NFOV_2X2BINNED

# pykinect.initialize_libraries()

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [14]:
def get_rotation_translation(filepath, verbose: bool = True) -> list[np.ndarray]:
    """Read poses from a text file and return their inverses as 4x4 matrices.

    Every non-blank line must contain exactly 12 whitespace-separated numbers,
    interpreted row-major as a 3x4 matrix ``[R | t]``. For each such line the
    4x4 homogeneous matrix ``[R.T | -R.T @ t]`` (bottom row ``0 0 0 1``) is
    produced; this is the inverse of ``[R | t]`` provided ``R`` is orthonormal.

    Args:
        filepath: Path of the pose file to read.
        verbose: When True (the default, matching the previous behaviour)
            each resulting matrix is printed as it is built.

    Returns:
        A list of ``(4, 4)`` float arrays, one per pose line, in file order.

    Raises:
        ValueError: If a non-blank line does not hold exactly 12 values or
            contains a token that cannot be parsed as a float.
    """
    extrinsics = []
    with open(filepath, 'r') as file:
        for line_number, data in enumerate(file, start=1):
            values = data.split()
            if not values:
                # Blank / whitespace-only lines (e.g. a trailing newline)
                # carry no pose; skipping them avoids a reshape failure.
                continue
            if len(values) != 12:
                raise ValueError(
                    f"{filepath}:{line_number}: expected 12 values for a 3x4 pose, got {len(values)}"
                )

            E = np.array([float(p) for p in values]).reshape((3, 4))
            R = E[:, :3]
            t = E[:, 3]

            E_inv = np.eye(4)
            E_inv[:3, :3] = R.T
            E_inv[:3, 3] = -R.T @ t

            if verbose:
                print(E_inv)

            extrinsics.append(E_inv)

    return extrinsics

def prepare_image(image: np.ndarray):
    """Convert an image array into the PIL image and tensor used for GroundingDINO.

    Args:
        image: Image array accepted by ``PIL.Image.fromarray``.

    Returns:
        ``(image_pil, image_tensor)``: the PIL version of ``image`` and the
        result of passing it through the resize / to-tensor / normalise
        pipeline built below.
    """
    pil_image = Image.fromarray(image)

    # The pipeline is called with (image, target); no target is supplied here.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    pipeline = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize(channel_mean, channel_std),
    ])

    tensor, _target = pipeline(pil_image, None)  # 3, h, w
    return pil_image, tensor

def load_model(model_config_path, model_checkpoint_path, bert_base_uncased_path, device):
    """Build a model from a config file, load checkpoint weights, and set eval mode.

    Args:
        model_config_path: Path handed to ``SLConfig.fromfile``.
        model_checkpoint_path: Checkpoint file; its ``"model"`` entry is used
            as the state dict after ``clean_state_dict``.
        bert_base_uncased_path: Stored on the config as ``bert_base_uncased_path``.
        device: Stored on the config as ``device``. The returned model is not
            moved to this device by this function.

    Returns:
        The constructed model in eval mode.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = device
    cfg.bert_base_uncased_path = bert_base_uncased_path

    model = build_model(cfg)

    # Weights are deserialised onto the CPU regardless of `device`.
    state = torch.load(model_checkpoint_path, map_location="cpu")
    weights = clean_state_dict(state["model"])
    # strict=False tolerates key mismatches; the printed report lists them.
    print(model.load_state_dict(weights, strict=False))

    model.eval()
    return model

def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
    """Run the grounding model on one image and return the boxes and phrases kept.

    Args:
        model: Model called as ``model(batch, captions=[...])`` that exposes a
            ``tokenizer`` attribute.
        image: Single image tensor; a batch dimension is added before inference.
        caption: Text prompt. It is lower-cased, stripped, and given a
            trailing ``"."`` if it lacks one.
        box_threshold: A prediction is kept when its highest sigmoid score
            exceeds this value.
        text_threshold: Per-token score cut-off passed (as a boolean mask) to
            ``get_phrases_from_posmap``.
        with_logits: When True, ``"(<score>)"`` is appended to each phrase,
            where the score is the first four characters of the max score.
        device: Device the model and image are moved to for inference.

    Returns:
        ``(boxes_filt, pred_phrases)``: the kept rows of ``pred_boxes`` (on the
        CPU) and one phrase string per kept row.
    """
    prompt = caption.lower().strip()
    if not prompt.endswith("."):
        prompt = prompt + "."

    model = model.to(device)
    image = image.to(device)

    with torch.no_grad():
        outputs = model(image[None], captions=[prompt])
        scores = outputs["pred_logits"].cpu().sigmoid()[0]
        candidate_boxes = outputs["pred_boxes"].cpu()[0]

    keep = scores.max(dim=1)[0] > box_threshold
    kept_scores = scores[keep]
    kept_boxes = candidate_boxes[keep]

    tokenized = model.tokenizer(prompt)
    pred_phrases = []
    for row in kept_scores:
        phrase = get_phrases_from_posmap(row > text_threshold, tokenized, model.tokenizer)
        if with_logits:
            phrase = phrase + f"({str(row.max().item())[:4]})"
        pred_phrases.append(phrase)

    return kept_boxes, pred_phrases

# Build the GroundingDINO model once for the whole notebook; load_model leaves
# it in eval mode and get_grounding_output moves it to the requested device.
gd_model = load_model(GROUNDING_DINO_CONFIG, GROUNDING_DINO_CHECKPOINT, BERT_BASE_UNCASED_PATH, device=DEVICE)
# Build the "vit_h" SAM model from its checkpoint, move it to DEVICE, and wrap
# it in a SamPredictor for box-prompted mask prediction.
sam_model = SamPredictor(sam_model_registry["vit_h"](checkpoint=SAM_CHECKPOINT).to(DEVICE))

final text_encoder_type: bert-base-uncased


  checkpoint = torch.load(model_checkpoint_path, map_location="cpu")


_IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])


  state_dict = torch.load(f)


In [15]:
## must have /rgb, /depth directories and /poses.txt file

# to-do: 
# 1) better merging algorithm in aggregator.py
# 2) paint the pointcloud with provided method
# 3) project the pointcloud onto the thingy
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
FILEPATH = "/home/bwilab/Documents/RTAB-Map/red_chair"
OBJECT_LABELS = "couch."
MAX_FRAMES = 11  # only the first MAX_FRAMES frames are processed


def _frame_sort_key(filename):
    """Sort key that orders purely numeric file stems by value, others by name."""
    stem = os.path.splitext(filename)[0]
    if stem.replace(".", "", 1).isdigit():
        # "2.png" must come before "10.png"; the stem breaks float ties.
        return (0, float(stem), stem)
    return (1, 0.0, stem)


def _list_frames(directory):
    """Return the paths of the files in `directory` in a deterministic order."""
    # os.listdir gives names in arbitrary order, so sort explicitly; otherwise
    # RGB frames, depth frames and pose rows can be paired with the wrong partner.
    return [f"{directory}/{name}" for name in sorted(os.listdir(directory), key=_frame_sort_key)]


all_masks = defaultdict(list) # Object label --> list[masks]
pointclouds = defaultdict(list) # Object label --> list[PointCloud]
projector = PointProjector()
aggregator = PointCloudAggregator(eps=2.0) ## higher eps == more merging, lower eps == more detail (or noise)

# Assumes the rows of poses.txt follow the same order as the sorted frame
# names — TODO confirm against the exporter.
extrinsics = get_rotation_translation(f"{FILEPATH}/poses.txt")
rgb_images = _list_frames(f"{FILEPATH}/rgb")
depth_images = _list_frames(f"{FILEPATH}/depth")
assert len(rgb_images) == len(depth_images) == len(extrinsics), (
    f"frame count mismatch: {len(rgb_images)} rgb, {len(depth_images)} depth, {len(extrinsics)} poses"
)

rgb_images = rgb_images[:MAX_FRAMES]
depth_images = depth_images[:MAX_FRAMES]

# Per-frame pipeline: detect with GroundingDINO, segment with SAM, build one
# point cloud per mask from the masked depth image, and move it with the
# frame's pose matrix. zip() stops at the shortest input, so only as many
# poses as there are selected frames are consumed.
for rgb_path, depth_path, transform in zip(rgb_images, depth_images, extrinsics):
    scene = defaultdict(list) # Object label --> list[masks] for this frame only

    with Image.open(rgb_path) as color_image, Image.open(depth_path) as depth_image:
        ## Make sure images are same dims
        color_image, depth_image = np.array(color_image), np.array(depth_image)
        # cv2.resize takes (width, height); reversing the shape assumes the
        # depth image is a 2-D array — TODO confirm.
        resized_color_image = cv2.resize(color_image, depth_image.shape[::-1])

    ## Feed through DINO
    image_pil, image_tensor = prepare_image(resized_color_image)
    boxes, pred_phrases = get_grounding_output(
        gd_model, image_tensor, OBJECT_LABELS, BOX_THRESHOLD, TEXT_THRESHOLD, device=DEVICE
    )

    ## Prepare SAM
    if torch.numel(boxes) == 0:
        continue  # nothing detected in this frame

    sam_model.set_image(resized_color_image)
    W, H = image_pil.size
    # The arithmetic treats each box as normalised (cx, cy, w, h) and turns it
    # into pixel (x0, y0, x1, y1): scale, shift the centre to the top-left
    # corner, then add the corner to the size to get the bottom-right corner.
    boxes = boxes * torch.Tensor([W, H, W, H])
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]

    ## SAM outputs
    transformed_boxes = sam_model.transform.apply_boxes_torch(boxes, resized_color_image.shape[:2]).to(DEVICE)
    masks, _, _ = sam_model.predict_torch(
        point_coords=None, point_labels=None, boxes=transformed_boxes, multimask_output=False
    )

    ## Associate outputs
        ## How to use
        # color_pixels = color_image * mask[:, :, None]
        # depth_pixels = depth_image * mask
    for mask, label in zip(masks, pred_phrases):
        mask = mask[0].cpu().numpy()
        # The "(score)" suffix is appended last, so cut at the final '('.
        label = label[:label.rindex('(')] # remove confidence
        scene[label].append(mask)
        all_masks[label].append(mask)

    ## Generating pointclouds
    for obj in scene:
        for mask in scene[obj]:
            masked_depth_image = depth_image * mask

            pcl = projector.get_pointcloud(masked_depth_image, stride=3)
            if pcl.is_empty():
                continue

            pcl.label = obj
            pcl.clean()
            # The DBSCAN log can report 0 clusters with every point removed.
            # Assuming clean() filters in place (TODO confirm), drop clouds
            # that ended up empty instead of carrying them forward.
            if pcl.is_empty():
                continue

            #### TESTING 11/24 use inverse extrinsic to go from camera --> world
            pcl.transform(transform)

            ## TESTING 11/22
            # aggregator.add_unmerged_pointcloud(pcl)

            #### TESTING 11/24 if inverse extrinsic works use this
            #### and make a more robust icp algorithm (which pointcloud to merge onto?)
            # target = aggregator.nearest_pointcloud(pcl)
            # aggregator.aggregate_pointcloud(pcl, target, np.linalg.inv(transform))

            pointclouds[obj].append(pcl)
            # projector.visualize(pcl)

## TESTING: merge all instances
# aggregator.gather_pointclouds()
# aggregator.aggregate_all()
# projector.visualize(aggregator.main)

# Flatten the per-label lists into a single list (label order, then insertion
# order within each label) and show all clouds together.
allpoints = []
for obj, clouds in pointclouds.items():
    allpoints.extend(clouds)

projector.visualize(allpoints)

## Show masked images
# pcl_mask = scene['box'][0]
# masked_color = resized_color_image * pcl_mask[:, :, None]
# masked_depths = depth_image * pcl_mask
# fig, axes = plt.subplots(1, 3, figsize=(10,5))
# axes[0].axis('off'); axes[1].axis('off'); axes[2].axis('off')
# axes[0].imshow(masked_color); axes[1].imshow(masked_depths); axes[2].imshow(resized_color_image)
# plt.show()

## Show pointclouds
# projector.visualize(pointclouds['chair'][0])


[[-1.648000e-03 -9.990590e-01 -4.334000e-02  1.954778e-06]
 [-6.739500e-02  4.335300e-02 -9.967840e-01 -1.083490e-06]
 [ 9.977250e-01  1.278000e-03 -6.740300e-02 -6.995900e-08]
 [ 0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00]]
[[ 0.526416   -0.847101   -0.072836    0.13048412]
 [-0.066283    0.044517   -0.996807   -0.0113764 ]
 [ 0.847639    0.529563   -0.032714   -0.09863631]
 [ 0.          0.          0.          1.        ]]
[[ 0.698105   -0.712707   -0.068543    0.13860583]
 [-0.055146    0.041926   -0.997598   -0.01915215]
 [ 0.713868    0.700208   -0.010035   -0.33847907]
 [ 0.          0.          0.          1.        ]]
[[-0.265475   -0.964105    0.004976    0.7543745 ]
 [-0.035886    0.004723   -0.999345   -0.16917264]
 [ 0.96345    -0.265479   -0.035852   -0.47914179]
 [ 0.          0.          0.          1.        ]]
[[ 0.140101   -0.990037   -0.014093    0.56493567]
 [-0.029133    0.010106   -0.999524   -0.18988755]
 [ 0.989709    0.140445   -0.027426   -0.76511

  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=False):


[DBSCAN] Found 1 clusters
[DBSCAN] Removing 0 noise points
[DBSCAN] Found 0 clusters
[DBSCAN] Removing 24 noise points
[DBSCAN] Found 2 clusters
[DBSCAN] Removing 13 noise points
[DBSCAN] Found 1 clusters
[DBSCAN] Removing 0 noise points
[DBSCAN] Found 4 clusters
[DBSCAN] Removing 11 noise points


In [None]:
# print('objects', aggregator._scene.keys())
# projector.visualize(aggregator._scene['couch'])

# res = PointCloud()
# res.label = "chair"
# for pcl in allpoints:
#     res += pcl

# projector.visualize(aggregator._unmerged_pointclouds['couch'])
# Run the aggregator's gather and merge steps, then display what it stored
# under the 'couch' label.
# NOTE(review): the only call that feeds clouds to the aggregator in the
# processing cell (add_unmerged_pointcloud) is commented out, so this cell
# appears to rely on state from an earlier kernel session — confirm it still
# works after Restart & Run All.
aggregator.gather_pointclouds()
aggregator.aggregate_all()
# NOTE(review): reads the private `_scene` attribute directly.
projector.visualize(aggregator._scene['couch'])

Pointclouds left to merge: 4
cycle: 4
target [[ 0.600577 -0.027362  0.799098  0.654131]
 [-0.799205  0.009534  0.600983  0.743428]
 [-0.024063 -0.99958  -0.016142 -0.206915]
 [ 0.        0.        0.        1.      ]]
source [[ 0.698105 -0.055146  0.713868  0.143812]
 [-0.712707  0.041926  0.700208  0.336594]
 [-0.068543 -0.997598 -0.010035 -0.013002]
 [ 0.        0.        0.        1.      ]]
after :))))))))))
target [[ 0.600577 -0.027362  0.799098  0.654131]
 [-0.799205  0.009534  0.600983  0.743428]
 [-0.024063 -0.99958  -0.016142 -0.206915]
 [ 0.        0.        0.        1.      ]]
source [[-0.24242915  0.23713149 -0.94074197  4.26460232]
 [-0.97010473 -0.07044     0.23224017  0.89473362]
 [-0.01119422  0.96892034  0.24711899  1.48425943]
 [ 0.          0.          0.          1.        ]]
cycle: 2
target [[ 0.140101 -0.029133  0.989709  0.672557]
 [-0.990037  0.010106  0.140445  0.668682]
 [-0.014093 -0.999524 -0.027426 -0.20282 ]
 [ 0.        0.        0.        1.      ]]
sou