In [None]:
import sys
sys.path.append("./Grounded_Segment_Anything/recognize-anything")
sys.path.append("./Grounded_Segment_Anything/GroundingDINO")
sys.path.append("./Grounded_Segment_Anything/segment_anything")

import numpy as np
import torch
import cv2
import matplotlib.pyplot as plt
import pykinect_azure as pykinect
import open3d as o3d

# Grounding DINO
import Grounded_Segment_Anything.GroundingDINO.groundingdino.datasets.transforms as T
from Grounded_Segment_Anything.GroundingDINO.groundingdino.models import build_model
from Grounded_Segment_Anything.GroundingDINO.groundingdino.util.slconfig import SLConfig
from Grounded_Segment_Anything.GroundingDINO.groundingdino.util.utils import (
    clean_state_dict, 
    get_phrases_from_posmap
)

# Segment Anything
from Grounded_Segment_Anything.segment_anything.segment_anything import (
    sam_model_registry,
    SamPredictor
)

from collections import defaultdict
from PIL import Image

from pointcloud import PointCloud
from projections import PointProjector

# --- Model assets (paths are relative to the notebook's working directory) ---
GROUNDING_DINO_CONFIG = "Grounded_Segment_Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT = "Grounded_Segment_Anything/groundingdino_swint_ogc.pth"
SAM_CHECKPOINT = "Grounded_Segment_Anything/sam_vit_h_4b8939.pth"

# A detection is kept when its highest per-token score exceeds BOX_THRESHOLD;
# TEXT_THRESHOLD is the per-token cut-off used when turning scores into a phrase.
BOX_THRESHOLD = 0.3
TEXT_THRESHOLD = 0.25

# Prefer the GPU, but fall back to the CPU so the notebook still loads on
# machines without CUDA instead of failing in the model-loading cell.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Forwarded to the Grounding DINO config as `bert_base_uncased_path`.
# NOTE(review): None presumably selects the library's default text encoder — confirm.
BERT_BASE_UNCASED_PATH = None

# `device_config` aliases pykinect's module-level default configuration (it is
# not a copy), so the assignments below modify that shared object.
device_config = pykinect.default_configuration
device_config.color_format = pykinect.K4A_IMAGE_FORMAT_COLOR_BGRA32
device_config.color_resolution = pykinect.K4A_COLOR_RESOLUTION_720P
device_config.depth_mode = pykinect.K4A_DEPTH_MODE_NFOV_2X2BINNED

# Run once before any device is started.
pykinect.initialize_libraries()

In [None]:
def prepare_image(image: np.ndarray):
    """Turn an image array into the PIL image and tensor fed to the detector.

    Args:
        image: Array accepted by ``Image.fromarray``.

    Returns:
        ``(image_pil, image_tensor)``: the PIL image built from ``image`` and
        the tensor produced by the resize / to-tensor / normalize pipeline.
    """
    image_pil = Image.fromarray(image)

    # Per-channel statistics handed to the normalization step.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    preprocess = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize(channel_mean, channel_std),
    ])

    # The pipeline is called with an (image, target) pair; there is no target here.
    image_tensor, _unused_target = preprocess(image_pil, None)  # 3, h, w
    return image_pil, image_tensor
def load_model(model_config_path, model_checkpoint_path, bert_base_uncased_path, device):
    """Build a Grounding DINO model from its config and load checkpoint weights.

    Args:
        model_config_path: Config file read with ``SLConfig.fromfile``.
        model_checkpoint_path: Checkpoint file whose ``"model"`` entry holds
            the state dict.
        bert_base_uncased_path: Stored on the config as
            ``bert_base_uncased_path`` before the model is built.
        device: Stored on the config as ``device``. This function does not
            call ``.to(device)`` itself; ``get_grounding_output`` moves the
            model when it runs inference.

    Returns:
        The model, switched to eval mode.
    """
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    args.bert_base_uncased_path = bert_base_uncased_path

    model = build_model(args)
    # Deserialize onto the CPU regardless of where the checkpoint was saved.
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    # strict=False tolerates key mismatches; the returned report is printed so
    # any missing or unexpected keys are visible in the cell output.
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)

    model.eval()
    return model

def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
    """Run the grounding model on one image and return the confident detections.

    Args:
        model: Grounding model; must expose ``.to``, ``.tokenizer`` and be
            callable as ``model(batch, captions=[...])``.
        image: Image tensor without a batch dimension (one is added here).
        caption: Text prompt. It is lower-cased, stripped, and given a
            trailing ``"."`` if it lacks one.
        box_threshold: A query is kept when its largest sigmoid score is
            strictly greater than this value.
        text_threshold: Per-token cut-off passed to ``get_phrases_from_posmap``.
        with_logits: When true, ``"(<score>)"`` is appended to each phrase,
            where ``<score>`` is the first four characters of the max score.
        device: Device the model and image are moved to for the forward pass.

    Returns:
        ``(boxes_filt, pred_phrases)``: the kept boxes (on the CPU) and one
        phrase string per kept box, in the same order.
    """
    prompt = caption.lower().strip()
    if not prompt.endswith("."):
        prompt = prompt + "."

    model = model.to(device)
    batch = image.to(device)[None]

    with torch.no_grad():
        outputs = model(batch, captions=[prompt])
        # Drop the batch dimension; everything after this point lives on the CPU.
        scores = outputs["pred_logits"].cpu().sigmoid()[0]
        all_boxes = outputs["pred_boxes"].cpu()[0]

    keep = scores.max(dim=1).values > box_threshold
    kept_scores = scores[keep]
    kept_boxes = all_boxes[keep]

    tokenized = model.tokenizer(prompt)
    pred_phrases = []
    for query_scores in kept_scores:
        phrase = get_phrases_from_posmap(query_scores > text_threshold, tokenized, model.tokenizer)
        if with_logits:
            phrase = phrase + f"({str(query_scores.max().item())[:4]})"
        pred_phrases.append(phrase)

    return kept_boxes, pred_phrases

# Instantiate both networks once; the capture loop reuses these globals.
gd_model = load_model(
    model_config_path=GROUNDING_DINO_CONFIG,
    model_checkpoint_path=GROUNDING_DINO_CHECKPOINT,
    bert_base_uncased_path=BERT_BASE_UNCASED_PATH,
    device=DEVICE,
)
# Despite its name, `sam_model` is a SamPredictor wrapping the "vit_h" network
# after that network has been moved to DEVICE.
sam_model = SamPredictor(
    sam_model_registry["vit_h"](checkpoint=SAM_CHECKPOINT).to(DEVICE)
)

In [None]:
## Read from file or start the kinect

# Offline path: load a point cloud previously saved to disk and display it.
# The absolute path is machine-specific; adjust it before running elsewhere.
pcl = o3d.io.read_point_cloud("/home/bwilab/Documents/RTAB-Map/cloud.ply")
projector = PointProjector()
# visualize is given a list here, holding the single loaded cloud.
projector.visualize([pcl])

In [None]:
import os
from PIL import Image
from projections import PointProjector

# Directory holding the depth images. Only the commented-out batch-processing
# code further down in this cell reads `input_dir`.
input_dir = '/home/bwilab/Documents/RTAB-Map/depth'
projector = PointProjector()
# Collected point clouds; filled by later code, empty at this point.
clouds = []
# One row per pose: 12 floats forming a flattened 3x4 matrix whose element
# order is spelled out in the comment right after this list.
# NOTE(review): presumably these are camera poses exported from RTAB-Map,
# and the row order is what `idx` indexes later — confirm the row/frame mapping.
data = [
[0.001661, -0.049135, 0.998791, -0.000001, -0.999684, -0.025142, 0.000426, 0.000001, 0.025091, -0.998476, -0.049161, -0.000000],
[0.389109, -0.043268, 0.920175, 0.045199, -0.920552, -0.055499, 0.386659, 0.176321, 0.034339, -0.997521, -0.061426, 0.007353],
[0.015137, -0.056920, 0.998264, 0.013789, -0.999328, -0.034199, 0.013204, 0.047429, 0.033388, -0.997793, -0.057400, 0.008325],
[0.129009, -0.055052, 0.990114, 0.058669, -0.991420, -0.028353, 0.127603, 0.163157, 0.021048, -0.998081, -0.058237, 0.006341],
[0.226645, -0.069160, 0.971519, 0.173093, -0.973976, -0.017683, 0.225960, 0.132562, 0.001552, -0.997449, -0.071368, 0.014187],
[0.403715, -0.033656, 0.914266, 0.098407, -0.913214, -0.075204, 0.400481, 0.251978, 0.055278, -0.996600, -0.061097, 0.016438],
[0.002224, -0.082195, 0.996614, 0.144454, -0.999988, -0.004525, 0.001858, 0.172828, 0.004357, -0.996606, -0.082204, 0.005021],
[-0.254581, -0.076549, 0.964017, 0.083994, -0.967008, 0.010701, -0.254521, 0.071858, 0.009168, -0.997008, -0.076748, 0.010039],
[0.073594, -0.056498, 0.995687, 0.109433, -0.997238, -0.014163, 0.072905, 0.147367, 0.009983, -0.998302, -0.057385, 0.003789],
[0.440923, -0.043506, 0.896490, 0.107163, -0.897471, -0.034212, 0.439745, 0.174608, 0.011539, -0.998467, -0.054130, 0.002940],
[-0.033278, -0.051933, 0.998096, 0.115863, -0.999323, -0.013913, -0.034043, 0.136181, 0.015654, -0.998554, -0.051435, -0.001568],
[-0.220165, -0.049985, 0.974181, 0.123005, -0.975336, -0.004797, -0.220672, 0.135384, 0.015703, -0.998739, -0.047696, -0.004768],
[0.467569, -0.035847, 0.883230, 0.131093, -0.883835, -0.035513, 0.466448, 0.168315, 0.014646, -0.998726, -0.048288, -0.007282],
[0.722379, -0.028645, 0.690904, 0.119363, -0.691421, -0.044809, 0.721062, 0.190804, 0.010304, -0.998585, -0.052175, 0.000190],
[0.380533, -0.054347, 0.923169, 0.124438, -0.924681, -0.036031, 0.379035, 0.165557, 0.012663, -0.997872, -0.063965, 0.001036],
[0.153136, -0.057516, 0.986530, 0.110670, -0.988030, -0.027716, 0.151753, 0.151439, 0.018614, -0.997960, -0.061072, 0.005018],
[0.056568, -0.056403, 0.996804, 0.148776, -0.998116, -0.026938, 0.055118, 0.008453, 0.023743, -0.998045, -0.057820, 0.001576],
[0.254737, -0.122301, 0.959245, 0.096420, -0.966702, -0.007158, 0.255805, 0.172555, -0.024419, -0.992467, -0.120052, 0.020674],
[0.149935, -0.057511, 0.987022, 0.132506, -0.988063, -0.044437, 0.147504, 0.105818, 0.035377, -0.997355, -0.063488, 0.012614],
[0.247276, -0.076684, 0.965906, 0.114974, -0.958391, -0.166092, 0.232166, 0.196955, 0.142625, -0.983124, -0.114564, 0.038493],
[-0.207483, -0.082063, 0.974791, 0.078762, -0.977319, 0.060591, -0.202920, 0.013570, -0.042411, -0.994784, -0.092774, 0.026266],
[0.046672, -0.064937, 0.996797, 0.110554, -0.998136, -0.042315, 0.043978, 0.122518, 0.039324, -0.996992, -0.066791, 0.005940],
[0.146626, -0.057324, 0.987530, 0.103216, -0.989156, -0.017000, 0.145881, 0.113458, 0.008425, -0.998211, -0.059195, 0.001042],
[0.277778, -0.123365, 0.952691, 0.093634, -0.958214, -0.106095, 0.265650, 0.216045, 0.068304, -0.986674, -0.147681, 0.037613],
[0.056913, -0.083691, 0.994865, 0.125929, -0.995107, -0.085364, 0.049746, 0.115777, 0.080762, -0.992829, -0.088140, 0.061468]
]
# Each row of `data` is a flattened 3x4 [R | T] matrix laid out as
# R00 R01 R02 Tx R10 R11 R12 Ty R20 R21 R22 Tz
def transform(p):
    """Return the 3x3 rotation block of one 12-element pose row."""
    # Matrix rows start at offsets 0, 4 and 8; the 4th value of each is translation.
    return np.array([p[start:start + 3] for start in (0, 4, 8)])


rotations = list(map(transform, data))
# Elements 3, 7 and 11 of a row are (Tx, Ty, Tz).
translations = [np.array(p[3::4]) for p in data]
idx = 0

# Quick sanity check of the first parsed pose.
print(rotations[0])
print(translations[0])

# def process_image(input_path):
#     global clouds
#     global idx
#     """
#     Load a depth image, project it to a point cloud, apply pose `idx`, and append it to `clouds`.

#     Args:
#         input_path (str): The path to the input depth image.
#     """
#     try:
#         # Open the image file
#         with Image.open(input_path) as img:
#             img = np.array(img)
#             transformed = (projector.get_pointcloud(img).points @ rotations[idx].T) + translations[idx]
#             clouds += [PointCloud(transformed)]
#             idx += 1

#     except Exception as e:
#         print(f"Error processing {input_path}: {e}")

# def is_image_file(filename):
#     """
#     Check if a file is an image based on its extension.

#     Args:
#         filename (str): The name of the file.

#     Returns:
#         bool: True if the file is an image, False otherwise.
#     """
#     image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
#     return filename.lower().endswith(image_extensions)

# def main():
#     # Get a list of all files in the input directory
#     for filename in os.listdir(input_dir):
#         input_path = os.path.join(input_dir, filename)

#         # Skip directories and non-image files
#         if not os.path.isfile(input_path):
#             print(f"Skipping directory: {filename}")
#             continue
#         if not is_image_file(filename):
#             print(f"Skipping non-image file: {filename}")
#             continue

#         process_image(input_path)
#     projector.visualize(clouds)

# if __name__ == '__main__':
#     main()


In [None]:
# Project two saved depth images into a common frame using the parsed poses,
# then write the second cloud to disk.
projector = PointProjector()
paths = [
    '/home/bwilab/Documents/RTAB-Map/depth/1.png',
    '/home/bwilab/Documents/RTAB-Map/depth/28.png',
]
clouds = []
idx = 0  # row of `rotations` / `translations` applied to the current image
for path in paths:
    with Image.open(path) as img:
        img = projector.undistort_image(np.array(img))
        points = projector.get_pointcloud(img).points
        # Apply the pose to every point: rotate, then translate.
        transformed = points @ rotations[idx].T + translations[idx]
        clouds.append(PointCloud(transformed))
        # The stride of 19 means 1.png uses pose row 0 and 28.png uses row 19.
        # NOTE(review): confirm that row 19 really is the pose recorded for 28.png.
        idx += 19

# clouds[0].save('d1_transformed.ply')
clouds[1].save('d28_transformed.ply')


In [None]:
# Open the Kinect using the configuration set up in the first cell. The
# returned handle is polled by the capture loop and closed in the final cell.
kinect = pykinect.start_device(config=device_config)

In [None]:
scene = defaultdict(list)  # Object label --> list[mask], accumulated over all frames
pointclouds = defaultdict(list)  # Object label --> list[PointCloud], accumulated over all frames
projector = PointProjector()

OBJECT_LABELS = "box."

# Capture -> detect -> segment -> project, one frame per iteration. The loop has
# no exit condition: interrupt the kernel to stop it, then run the final cell
# to close the device.
while True:
    capture = kinect.update()
    ret_color, color_image = capture.get_color_image()  # (720, 1280, 4)
    ret_depth, depth_image = capture.get_depth_image()
    if not (ret_color and ret_depth):
        continue

    ## Some preprocessing
    color_image = cv2.cvtColor(color_image, cv2.COLOR_BGR2RGB)
    # undistort_color_image = projector.undistort_image(color_image)
    # The color frame is scaled to the depth frame's (width, height) so masks
    # can be multiplied with the depth image directly.
    # NOTE(review): this is a plain resize, not a color-to-depth registration;
    # confirm the two images are actually pixel-aligned.
    resized_color_image = cv2.resize(color_image, (depth_image.shape[1], depth_image.shape[0]))

    ## Feed through DINO
    image_pil, image_tensor = prepare_image(resized_color_image)
    boxes, pred_phrases = get_grounding_output(
        gd_model, image_tensor, OBJECT_LABELS, BOX_THRESHOLD, TEXT_THRESHOLD, device=DEVICE
    )

    # Nothing detected in this frame: skip it rather than calling SAM with an
    # empty box set and then indexing into an empty result list.
    if boxes.size(0) == 0:
        continue

    sam_model.set_image(resized_color_image[:, :, :3])

    # Adjust boxes for SAM: scale by the image size, then turn
    # (center_x, center_y, width, height) into (x0, y0, x1, y1).
    W, H = image_pil.size
    boxes = boxes * torch.Tensor([W, H, W, H])
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]

    transformed_boxes = sam_model.transform.apply_boxes_torch(boxes, resized_color_image.shape[:2]).to(DEVICE)
    masks, _, _ = sam_model.predict_torch(
        point_coords=None, point_labels=None, boxes=transformed_boxes, multimask_output=False
    )

    ## Associate outputs of sam and grounding dino
    # Only masks from THIS frame are projected against this frame's depth image;
    # re-projecting masks from earlier frames would pair them with the wrong depth
    # data and append duplicate clouds on every iteration.
    frame_results = []  # (label, mask, PointCloud) for the current frame only
    for mask, label in zip(masks, pred_phrases):
        mask = mask[0].cpu().numpy()
        label = label[:label.index('(')] # remove confidence
        scene[label].append(mask)

        ## How to use
        # color_pixels = color_image * mask[:, :, None]
        # depth_pixels = depth_image * mask[:, :]

        masked_depth_image = depth_image * mask
        pcl = projector.get_pointcloud(masked_depth_image)
        pcl.label = label
        projector.clean_pointcloud(pcl)
        pointclouds[label].append(pcl)
        frame_results.append((label, mask, pcl))

    ## Show masked images
    # Display the first 'box' detection of the current frame, if there is one.
    box_results = [(m, cloud) for lbl, m, cloud in frame_results if lbl == 'box']
    if not box_results:
        continue
    pcl_mask, box_cloud = box_results[0]

    masked_color = resized_color_image * pcl_mask[:, :, None]
    masked_depths = depth_image * pcl_mask
    fig, axes = plt.subplots(1, 3, figsize=(10,5))
    axes[0].axis('off'); axes[1].axis('off'); axes[2].axis('off')
    axes[0].imshow(masked_color); axes[1].imshow(masked_depths); axes[2].imshow(resized_color_image)
    plt.show()
    # Release the figure; otherwise one figure per frame piles up in memory.
    plt.close(fig)

    # NOTE(review): other cells pass a list to visualize(); confirm that a
    # single PointCloud is also accepted here.
    projector.visualize(box_cloud)

In [None]:
## YOU MUST RUN THIS CELL
# Closes the device opened by `pykinect.start_device` and removes the handle so
# a stale `kinect` cannot be reused by accident. The capture loop never exits
# on its own, so this runs only after the kernel has been interrupted.
# NOTE(review): presumably the sensor cannot be reopened until it is closed — confirm.
kinect.close()
del kinect