In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dataclasses import dataclass, field
from PIL import Image
import torch
from torch import nn
import torch.nn.functional as F
import os
import json
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches

from typing import List

import requests

np.set_printoptions(suppress=True)
# %matplotlib ipympl

In [3]:
from mmdet.apis import init_detector, inference_detector, show_result_pyplot
from torchvision.models import resnet18, ResNet18_Weights

# Detector: config and checkpoint files from a local mmdetection checkout
# (paths are relative to the notebook's working directory).
config_file = '../../Projects/mmdetection/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py'
checkpoint_file = '../../Projects/mmdetection/checkpoints/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth'
model = init_detector(config_file, checkpoint_file, device='cuda:0')

# Feature extractor: ImageNet-pretrained ResNet-18 with its last child module
# removed, switched to eval mode and moved to the GPU. `preprocess` is the
# inference transform shipped with the same weights.
weights = ResNet18_Weights.IMAGENET1K_V1
preprocess = weights.transforms()
resnet = nn.Sequential(*list(resnet18(weights=weights).children())[:-1]).eval().cuda()



load checkpoint from local path: ../../Projects/mmdetection/checkpoints/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth


In [None]:
# Root folder holding the `sequence.<i>` sub-folders (absolute, machine-specific).
# Alternatives used earlier: '.../Perception2/solo_6' and './data/solo/'.
folder_dir = 'C:/Users/Leonard/AppData/LocalLow/DefaultCompany/Perception2/solo_16'

# Accumulators that later cells append to.
errs = []
preds, gts = [], []

# Only sequence 0 is read: the loop stops after its first pass.
for i in range(20):
    f_dir = f'{folder_dir}/sequence.{i}'
    with open(f'{f_dir}/step0.frame_data.json') as json_file:
        data = json.load(json_file)
    captures = data['captures']
    break

In [None]:
@dataclass
class Line:
    """A 3D line described by a point on it and a direction vector.

    Attributes:
        origin: A point on the line.
        direction: Direction of the line; it does not have to be unit length.
    """
    origin: np.ndarray
    direction: np.ndarray

    @classmethod
    def from_camera_coor(cls, origin, coor):
        """Build a line from two array-likes, copying both into ``np.ndarray``."""
        return cls(np.array(origin), np.array(coor))
    
    def point(self, r):
        """Return the point ``origin + r * direction``."""
        return self.origin + r*self.direction


def find_points(line_a: Line, line_b: Line):
    """Return the midpoint of the shortest segment connecting two 3D lines.

    The closest points ``p1 = a.origin + r_1 * a.direction`` and
    ``p2 = b.origin + r_2 * b.direction`` are found by solving the 2x2 normal
    equations of ``min |p1 - p2|^2``. The direction vectors may have any
    non-zero length; for unit-length directions the result equals the
    simplified formula ``(e - u*f) / (1 - u**2)``.

    Args:
        line_a: First line.
        line_b: Second line.

    Returns:
        ``(p1 + p2) / 2`` as an ``np.ndarray``. If the lines intersect this is
        the intersection point. If they are parallel (or a direction is the
        zero vector) the closest pair is not unique; one valid pair is chosen
        so the result stays finite.
    """
    d = line_b.origin - line_a.origin
    aa = np.dot(line_a.direction, line_a.direction)
    bb = np.dot(line_b.direction, line_b.direction)
    u = np.dot(line_a.direction, line_b.direction)
    e = np.dot(line_a.direction, d)
    f = np.dot(line_b.direction, d)

    # aa*bb - u**2 == |dir_a|^2 |dir_b|^2 sin^2(angle); zero means no unique solution.
    denom = aa*bb - u**2
    if denom <= 1e-12 * aa * bb:
        if bb > 0:
            # Parallel lines: pin p1 to line_a.origin and project it onto line_b.
            r_1, r_2 = 0.0, -f / bb
        elif aa > 0:
            # line_b is a single point: project it onto line_a.
            r_1, r_2 = e / aa, 0.0
        else:
            # Both directions are zero vectors: only the origins are left.
            r_1, r_2 = 0.0, 0.0
    else:
        r_1 = (bb*e - u*f) / denom
        r_2 = (u*e - aa*f) / denom

    p1 = line_a.point(r_1)
    p2 = line_b.point(r_2)

    return (p1 + p2) / 2
    

@dataclass
class Camera:
    """Pinhole camera with a pose, mapping between world, camera and pixel coordinates.

    Array-like field values are converted to ``np.ndarray`` in ``__post_init__``.
    Every array default is produced by a ``default_factory``: ``np.ndarray`` is
    unhashable, so Python 3.11+ refuses it as a plain dataclass default, and on
    older versions a plain default would be one object shared by all instances.

    Attributes:
        filename: Path of the image taken by this camera.
        position: Camera position; subtracted from world points in ``world_to_camera``.
        quaternion: Orientation as ``(q0, q1, q2, q3)``; the default is the identity.
        resolution: Image size in pixels, ``(width, height)``.
        sensor_size: Sensor size in millimeters.
        matrix: Nine values reshaped to 3x3; not read by any method of this class.
        intrinsic: 3x3 matrix built in ``__post_init__`` and applied to row
            vectors (``coor @ intrinsic``); its last row holds the image centre.
    """
    filename: str=''
    position: np.ndarray=field(default_factory=lambda: np.array([0., 0., 0.]))
    quaternion: np.ndarray=field(default_factory=lambda: np.array([1., 0., 0., 0.]))  # angle in quaternion

    resolution: np.ndarray=field(default_factory=lambda: np.array([3840, 2160]))
    sensor_size: np.ndarray=field(default_factory=lambda: np.array([30, 30]))
    matrix: np.ndarray=field(default_factory=lambda: np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]))
    # intrinsic: np.ndarray=np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])
    
    @staticmethod
    def qm_2(quaternion, vector):
        """Apply the 3x3 matrix built from ``quaternion`` to ``vector``.

        Args:
            quaternion: Four components ``(q0, q1, q2, q3)``.
            vector: A 3-vector, or an array whose last axis has length 3.

        Returns:
            ``vector @ matrix.T``. ``world_to_camera`` passes the camera
            quaternion as is; ``pixel_to_ray`` passes its conjugate to map in
            the opposite direction.
        """
        q0, q1, q2, q3 = quaternion
        matrix = np.array([
            [1-2*q2**2-2*q3**2, 2*(q1*q2+q0*q3), 2*(q1*q3-q0*q2)],
            [2*(q1*q2-q0*q3), 1-2*q1**2-2*q3**2, 2*(q2*q3+q0*q1)],
            [2*(q1*q3+q0*q2), 2*(q2*q3-q0*q1), 1-2*q1**2-2*q2**2]
        ])

        return np.matmul(vector, matrix.T)

    @staticmethod
    def _convert_to_ndarray(obj):
        """Return ``obj`` unchanged if it is already a numpy array/scalar, else ``np.array(obj)``."""
        if not isinstance(obj, (np.ndarray, np.generic)):
            return np.array(obj)
        return obj

    def __post_init__(self):
        """Normalise all array fields to ``np.ndarray`` and build ``self.intrinsic``."""
        self.position = self._convert_to_ndarray(self.position)
        self.quaternion = self._convert_to_ndarray(self.quaternion)
        self.resolution = self._convert_to_ndarray(self.resolution)
        self.sensor_size = self._convert_to_ndarray(self.sensor_size)  # measured in millimeters
        self.matrix = self._convert_to_ndarray(self.matrix).reshape((3, 3))

        focal = 20.78461  # measured in millimeters
        # Focal length converted from millimeters to pixels. The y term is
        # negated, so image y grows opposite to camera y.
        # NOTE(review): the y term scales by resolution[0] (width), not
        # resolution[1] -- confirm this is intended (square pixels fitted to width).
        self.intrinsic = np.array([
            [focal*self.resolution[0]/self.sensor_size[0], 0, 0],
            [0, -focal*self.resolution[0]/self.sensor_size[1], 0],
            [self.resolution[0]/2, self.resolution[1]/2, 1],
        ])
    
    def world_to_camera(self, world_coor: List[np.ndarray]):
        """Map world points into the camera frame.

        Each point is shifted by ``-self.position`` and then passed through
        ``qm_2`` with the camera quaternion.

        Returns:
            A list with one transformed point per input point.
        """
        world_coor = [coor - self.position for coor in world_coor]  # translation
        # rotation
        camera_coor = [
            self.qm_2(self.quaternion, coor)
            for coor in world_coor
        ]
        return camera_coor

    def camera_to_pixel(self, camera_coor: List[np.ndarray]) -> List[np.ndarray]:
        """Project camera-frame points to pixel coordinates.

        Each point is multiplied by ``self.intrinsic``, divided by its last
        component and floored; the last component is then dropped.
        """
        ndc_coor = [np.matmul(coor, self.intrinsic) for coor in camera_coor]
        pixel_coor = [np.floor((coor/coor[-1])[:-1]) for coor in ndc_coor]

        return pixel_coor

    def pixel_to_ray(self, pixel_coor):
        """Turn pixel coordinates into lines starting at the camera position.

        Args:
            pixel_coor: Iterable of 2D pixel coordinates.

        Returns:
            One ``Line`` per pixel with ``origin == self.position``. The
            direction is ``[x, y, 1] @ inv(intrinsic)`` passed through ``qm_2``
            with the conjugate quaternion; it is not normalised.
        """

        inv_intrinsic = np.linalg.inv(self.intrinsic)
        ndc_coor = [
            np.matmul(np.array([*coor, 1]), inv_intrinsic)
            for coor in pixel_coor
        ]

        q_ = self.quaternion
        camera_coor = [
            self.qm_2(np.array([q_[0], -q_[1], -q_[2], -q_[3]]), coor)
            for coor in ndc_coor
        ]

        return [Line.from_camera_coor(self.position, coor) for coor in camera_coor]
    
    @classmethod
    def from_capture(cls, f_dir, capture):
        """Build a camera from one capture record.

        Args:
            f_dir: Folder that ``capture['filename']`` is relative to.
            capture: Mapping with keys ``filename``, ``position``, ``rotation``,
                ``dimension`` and ``matrix``.

        The four ``rotation`` values are reordered so that the last stored
        component becomes the first quaternion component.
        """
        q_ = capture['rotation']
        return cls(
            filename=os.path.join(f_dir, capture['filename']),
            position=capture['position'],
            quaternion=np.array([q_[3], q_[0], q_[1], q_[2]]),
            resolution=capture['dimension'],
            matrix=capture['matrix'],
        )
    
@dataclass
class BoundingBox:
    """Box stored as a centre point plus half-extents.

    Attributes:
        instanceId: Identifier copied from the annotation.
        center: Centre of the box.
        size: Half of the box dimension along each axis.
    """
    instanceId: str
    center: np.ndarray
    size: np.ndarray
    
    @classmethod
    def from_anno(cls, info):
        """Create a box from an annotation mapping.

        ``info`` must provide ``instanceId``, ``origin`` and ``dimension``.
        The half-extents are ``dimension / 2`` and the centre is
        ``origin`` shifted by the half-extents.
        """
        half_extent = np.array([side/2 for side in info['dimension']])
        corner = np.array(info['origin'])
        return cls(info['instanceId'], corner + half_extent, half_extent)


@dataclass
class Location:
    """Holder for a single point, stored in ``origin``."""
    origin: np.ndarray

    @classmethod
    def from_info(cls, info, rotation, offset):
        """Build a location from an annotation's ``translation`` entry.

        ``info['translation']`` is passed through ``Camera.qm_2`` with the
        quaternion ``rotation``; ``offset`` is then added to the result.
        """
        translation = np.array(info['translation'])
        rotated = Camera.qm_2(rotation, translation)
        return cls(rotated + offset)


@dataclass
class DeepCamera(Camera):
    """Camera that runs a detector and a feature extractor on its own image.

    All work happens in ``__post_init__``: the detector is run on
    ``self.filename`` and, for every kept detection, a feature vector and a
    viewing ray are stored.

    Attributes:
        object_detector: Model handed to ``inference_detector``.
        resnet: Feature extractor applied to the preprocessed crop on the GPU.
        preprocess: Transform applied to the crop before ``resnet``.
        gt_bboxes: Ground-truth boxes of this image.
        location: Optional ``Location``; not used by this class.
        pred_bboxes: Boxes of the kept detections (same order as ``preds``).
        preds: List of ``(bbox, score, feature, ray)`` tuples, one per kept
            detection. ``feature`` is a CPU tensor, ``ray`` a ``Line`` through
            the box centre.
    """
    object_detector: torch.nn.Module=None
    resnet: torch.nn.Module=None
    preprocess: torch.nn.Module=None

    gt_bboxes: List[BoundingBox]=field(default_factory=list)
    location: Location=None

    pred_bboxes: List[BoundingBox]=field(init=False)

    def __post_init__(self):
        """Run detection and feature extraction and fill ``preds`` / ``pred_bboxes``."""
        super().__post_init__()
        result = inference_detector(self.object_detector, self.filename)

        image = Image.open(self.filename).convert('RGB')
        # First entry of the first result element; each row is read below as
        # (x1, y1, x2, y2, score). Presumably the detections of class 0
        # ("person") -- TODO confirm against the detector's class list.
        human_pred = result[0][0]
        self.preds = []
        # NOTE(review): detections are paired with ground-truth boxes purely by
        # index, which also caps the number of detections at len(gt_bboxes),
        # and the crop below is taken from the ground-truth box rather than
        # from the detection. Kept as is; confirm this is intended.
        for pred, pred_ in zip(human_pred, self.gt_bboxes):
            if pred[-1] < 0.9:
                continue
            center = np.array([(pred[0] + pred[2])/2, (pred[1] + pred[3])/2])
            bbox = BoundingBox(
                instanceId=-1,
                center=center,
                size=np.array([(pred[2]-pred[0])/2, (pred[3]-pred[1])/2])
            )
            tl, br = pred_.center - pred_.size, pred_.center + pred_.size
            image_ = image.crop((*tl, *br))
            # Keep the crop as uint8: the torchvision inference transform
            # rescales integer images to [0, 1] before normalising, but leaves
            # float input untouched, so a float tensor in 0..255 would be
            # normalised on the wrong scale.
            image_tensor = torch.tensor(np.array(image_)).permute((2, 0, 1)).unsqueeze(0)

            image_tensor = self.preprocess(image_tensor)
            with torch.no_grad():
                box_feature = self.resnet(image_tensor.cuda())

            self.preds.append(
                (bbox, pred[-1], box_feature[0].cpu(), self.pixel_to_ray([center])[0])
            )
        # Fill the declared field; leaving it unset makes the generated
        # __repr__ / __eq__ raise AttributeError.
        self.pred_bboxes = [entry[0] for entry in self.preds]
        print('-'*30)
            
    @classmethod
    def from_capture(cls, f_dir, capture, object_detector, resnet, preprocess):
        """Build a ``DeepCamera`` from one capture record.

        Args:
            f_dir: Folder that ``capture['filename']`` is relative to.
            capture: Capture mapping; besides the keys read by
                ``Camera.from_capture`` it must contain ``annotations``.
            object_detector: Detector passed to ``inference_detector``.
            resnet: Feature extractor.
            preprocess: Transform applied before ``resnet``.

        The ground-truth boxes come from the ``values`` of the first
        annotation whose ``id`` contains ``'2D'``; an ``IndexError`` is raised
        when there is no such annotation.
        """
        camera = Camera.from_capture(f_dir, capture)
        annotations = [anno['values'] for anno in capture['annotations'] if '2D' in anno['id']][0]
        # anno_3d = [(anno['instanceId'], Location.from_info(anno, rotation, offset)) for anno in anno_3d if obj_name in anno['labelName']]
        return cls(
            filename=camera.filename,
            position=camera.position,
            quaternion=camera.quaternion,
            resolution=camera.resolution,
            sensor_size=camera.sensor_size,
            matrix=camera.matrix,
            object_detector=object_detector,
            resnet=resnet, preprocess=preprocess,
            gt_bboxes=[BoundingBox.from_anno(anno) for anno in annotations]
        )


class Scene:
    """Set of cameras loaded from one sequence folder, with detection matching.

    Attributes:
        f_dir: Folder the captures were loaded from.
        cameras: The ``DeepCamera`` objects of the scene.
        locations: The ``locations`` argument, stored unchanged.
        object_pairs: List of ``(i, j)`` tuples: detection ``i`` of
            ``cameras[0]`` is most similar to detection ``j`` of ``cameras[1]``.
    """
    offset = np.array([0.00956252, 0, -0.068264])  # not referenced inside this class

    def __init__(self, f_dir, cameras: List[DeepCamera], locations: List[Location]):
        """Store the inputs and immediately run ``_object_pairing``."""
        self.f_dir = f_dir
        self.cameras = cameras
        self.locations = locations
        self.object_pairs = self._object_pairing()

    def _object_pairing(self):
        """Match detections of the first camera to those of the second.

        For each detection of ``cameras[0]`` the detection of ``cameras[1]``
        with the highest cosine similarity between feature vectors is chosen
        and printed. Afterwards the crop of every detection of every camera is
        displayed.

        Returns:
            List of ``(i, j)`` index pairs. It is empty when there are fewer
            than two cameras or either of the first two has no detections.
        """
        pairs = []
        if len(self.cameras) >= 2:
            camera_0 = self.cameras[0].preds
            camera_1 = self.cameras[1].preds

            # torch.stack raises on an empty list, so skip matching in that case.
            if camera_0 and camera_1:
                # One flat feature row per detection of the second camera.
                features_1 = torch.stack([pred[2] for pred in camera_1]).flatten(1)

                for i, pred in enumerate(camera_0):
                    feature = pred[2].flatten().unsqueeze(0)
                    similarity = F.cosine_similarity(feature, features_1)
                    best = int(similarity.argmax(0))
                    print(i, 'and', best)
                    pairs.append((i, best))

        for camera in self.cameras:
            # Context manager so the image file handle is released.
            with Image.open(camera.filename) as img:
                for pred in camera.preds:
                    bbox = pred[0]
                    tl, br = bbox.center - bbox.size, bbox.center + bbox.size
                    crop = img.crop((*tl, *br))
                    plt.imshow(crop)
                    plt.axis('off')
                    plt.show()

            print('-'*30)

        return pairs

    @classmethod
    def from_captures(cls, f_dir, captures, object_detector, resnet, preprocess):
        """Build one ``DeepCamera`` per capture and wrap them in a ``Scene``."""
        cameras = [DeepCamera.from_capture(f_dir, capture, object_detector, resnet, preprocess) for capture in captures]
        
        return cls(
            f_dir=f_dir,
            cameras=cameras,
            locations=[]
        )
    
    def _show_bbox(self, bboxes, filename=None):
        """Draw ``bboxes`` as red rectangles over the image at ``filename``.

        Args:
            bboxes: Boxes with ``center`` and ``size`` (half-extent) attributes.
            filename: Image to draw on. A scene has no image of its own, so
                the camera's file must be supplied.

        Raises:
            ValueError: If ``filename`` is not given.
        """
        if filename is None:
            raise ValueError('filename is required: pass the image path of the camera the boxes belong to')
        fig, ax = plt.subplots()
        ax.imshow(mpimg.imread(filename))
        for bbox in bboxes:
            ax.add_patch(
                patches.Rectangle(
                    bbox.center - bbox.size,
                    2*bbox.size[0], 2*bbox.size[1],
                    linewidth=1, edgecolor='r', facecolor='none'
                )
            )
        plt.axis('off')
        plt.show()
    
    def show_gt_bboxes(self):
        """Show every camera image with its ground-truth boxes."""
        for camera in self.cameras:
            self._show_bbox(camera.gt_bboxes, camera.filename)
    
    def show_pred_bboxes(self):
        """Show every camera image with the boxes of its kept detections."""
        for camera in self.cameras:
            self._show_bbox([pred[0] for pred in camera.preds], camera.filename)


In [None]:
# Root folder holding the `sequence.<i>` sub-folders (absolute, machine-specific).
# Alternatives used earlier: '.../Perception2/solo_6' and './data/solo/'.
folder_dir = 'C:/Users/Leonard/AppData/LocalLow/DefaultCompany/Perception2/solo_16'

# Accumulators that later cells append to.
errs = []
preds, gts = [], []

# Build a Scene for each of the 20 sequences. Constructing the Scene already
# runs detection, feature extraction and pairing, so nothing else is called.
for i in range(20):
    f_dir = f'{folder_dir}/sequence.{i}'
    with open(f'{f_dir}/step0.frame_data.json') as json_file:
        data = json.load(json_file)
    captures = data['captures']
    scene = Scene.from_captures(f_dir, captures, model, resnet, preprocess)

In [None]:


    # NOTE(review): this cell starts indented -- it looks like the body of a
    # `for i in range(20):` loop whose header is no longer in the cell, so it
    # cannot run on its own. It also relies on names that are not defined in
    # this cell or above it in the notebook as shown: `cameras`, `offset`,
    # `_find`, `errs`, `preds`, `gts`.
    #
    # Purpose: build one viewing ray per camera through the centre of the 2D
    # box, triangulate the rays and compare with the position derived from the
    # 3D box annotation.
    rays = []
    colors = ['red', 'purple']
    # fig = plt.figure(figsize=(8, 8))
    # ax = fig.add_subplot(111, projection='3d')
    for j, camera_info in enumerate(cameras):
        # print('camera info', camera_info['position'], camera_info['rotation'])
        q_ = camera_info['rotation']
        # The stored rotation is reordered so its last component comes first.
        camera = Camera(
            position=camera_info['position'],
            quaternion=np.array([q_[3], q_[0], q_[1], q_[2]]),
            # quaternion=np.array([q_[0], q_[1], q_[2], q_[3]]),
            resolution=camera_info['dimension'],
            matrix=camera_info['matrix'],
        )

        # bbox_2d = _find(camera_info['annotations'], 'bounding box')
        bbox_2d = _find(camera_info['annotations'], '2D')
        # Despite its name, `center` holds the box origin; `size` is half the
        # box dimension, so `center+size` is the box centre.
        center, size = bbox_2d['origin'], np.array([bbox/2 for bbox in bbox_2d['dimension']])
        ray = camera.pixel_to_ray([center+size])
        rays.append(ray[0])

        # ax.quiver(
        #     camera.position[0],
        #     camera.position[1],
        #     camera.position[2],
        #     ray[0].direction[0],
        #     ray[0].direction[1],
        #     ray[0].direction[2],
        #     color='teal', length=1.0, arrow_length_ratio=.1, normalize=True
        # )
        bbox_3d = _find(camera_info['annotations'], '3D')
        # print(bbox_3d['translation'])

        # bbox_3d = _find(camera_info['annotations'], 'bounding box 3D')
        bbox_3d = _find(camera_info['annotations'], '3D')
        # the bbox_3d['translation'] is the coordinate relative to the camera.
        # Therefore, we need to convert it to world coordinate
        # also, -0.2 in height as it is floating
        # NOTE(review): no -0.2 is applied below; the second component of `ret`
        # is overwritten with a constant instead.
        q_ = camera.quaternion
        # Conjugate quaternion: the opposite mapping to Camera.world_to_camera.
        ret = Camera.qm_2(
            np.array([q_[0], -q_[1], -q_[2], -q_[3]]),
            np.array(bbox_3d['translation'])
        ) + camera.position + offset
        # print(ret)
        ret[1] = 1.01557275
    # After the loop `ret` holds the value computed for the last camera.
    # ret = np.array(bbox_3d['translation'])
    # print('size', camera_info['size'])
    # print(ret)
    # get prediction
    # find_points takes exactly two lines, so this needs exactly two cameras.
    pred = find_points(*rays) #- np.array(offset)
    # Euclidean distance between the triangulated point and `ret`.
    err = np.sqrt(((pred - ret)**2).sum())
    errs.append(err)
    print(f'pred: {pred}, gt: {ret} {err:.4f}')
    preds.append(pred)
    gts.append(ret)

    # # plot
    # ax.scatter(*pred, color='red')
    # ax.scatter(*ret, color='purple')
    # plt.show()

    # Reproject `ret` (the position derived from the 3D annotation, not the
    # triangulated `pred`) into each camera image and compare the resulting
    # pixel with the centre of the annotated 2D box.
    for j, camera_info in enumerate(cameras):
        q_ = camera_info['rotation']
        camera = Camera(
            position=camera_info['position'],
            quaternion=np.array([q_[3], q_[0], q_[1], q_[2]]),
            resolution=camera_info['dimension'],
            matrix=camera_info['matrix'],
        )
        # bbox_2d = _find(camera_info['annotations'], 'bounding box')
        bbox_2d = _find(camera_info['annotations'], '2D')
        # `center` holds the box origin and `size` half the box dimension.
        center, size = bbox_2d['origin'], np.array([bbox/2 for bbox in bbox_2d['dimension']])
        # center += size
        camera_coor = camera.world_to_camera([ret])
        pixel_coor = camera.camera_to_pixel(camera_coor)
        # Printed: box centre, reprojected pixel, and their L1 distance.
        print(center+size, pixel_coor[0], abs((center+size) - pixel_coor[0]).sum())

    #     fig, ax = plt.subplots()
    #     ax.imshow(mpimg.imread(f'{f_dir}/{camera_info["filename"]}'))
    #     ax.add_patch(
    #         patches.Rectangle(
    #             center,
    #             2*size[0], 2*size[1],
    #             linewidth=1, edgecolor='r', facecolor='none'
    #         )
    #     )
    #     plt.axis('off')
    #     plt.plot(
    #         pixel_coor[0][0], pixel_coor[0][1],
    #         "o", markersize=5, markeredgecolor="purple", markerfacecolor="purple"
    #     )
    #     plt.show()

    print('-'*30)
    # NOTE(review): `break` is only valid inside the enclosing loop whose
    # header is missing from this cell.
    break
# Mean of the collected Euclidean errors.
print(np.mean(errs))

In [None]:
# Two reference points and their ground-truth counterparts (suffix `_gt`).
one = np.array([2.99043751, 1.01557281, -2.931736])
two = np.array([-0.00956255, 1.01557269, 0.068264])
one_gt = np.array([3, 0., -3])
two_gt = np.array([0, 0., 0])

# Requires `bbox_3d` left behind by an earlier cell.
size = bbox_3d['size']
p, gt = [one, two], [one_gt, two_gt]

# Difference "ground truth minus point" for every pair.
offset = []
for a, b in zip(p, gt):
    offset.append(b-a)
    print(offset[-1])

print('-'*30)
# Spread of the differences per axis, then their mean; `offset` is rebound
# from the list of differences to that mean vector.
print(np.var(offset, axis=0))
offset = np.mean(offset, axis=0)
print(offset)
print('-'*30)

# What is left of each difference once the mean offset is removed.
for a, b in zip(p, gt):
    print(b - offset - a)

In [None]:
# Display `size`. When the cells are run in order it was last assigned from
# bbox_3d['size'] in the preceding cell.
size

In [None]:
# Per-axis absolute error of every prediction, and how often each axis error
# stays below the corresponding entry of the 3D box `size`.
# Requires `bbox_3d`, `preds` and `gts` left behind by earlier cells.
size = bbox_3d['size']
errs, bins = [], []
for pred, gt in zip(preds, gts):
    err = np.sqrt((pred - gt)**2)  # element-wise, so one value per axis
    print(err)
    bins.append(err < size)
    errs.append(err)

print('-'*30)
# Fraction of samples within `size` per axis, then the mean error per axis.
print(np.sum(bins, axis=0)/len(bins))
print(np.mean(errs, axis=0))

In [None]:
def _find(l, s):
    for elem in l:
        if elem['id'] == s or elem['id'] == f'{s}_0':
            values = elem['values']
            for i, value in enumerate(values):
                if value['labelName'] == 'Person':
                # if value['labelName'] == 'cleaning_snuggle_henkel':
                    break
            else:
                return None
            return values[i]


# Reprojection check: take the position from the 3D box annotation, project it
# into every camera image and compare with the centre of the annotated 2D box.
# Only sequence 0 is processed because the outer loop ends with `break`.
folder_dir = 'C:/Users/Leonard/AppData/LocalLow/DefaultCompany/Perception2/solo_11'

errs = []
preds, gts = [], []
for i in range(20):
    f_dir = f'{folder_dir}/sequence.{i}'
    with open(f'{f_dir}/step0.frame_data.json') as json_file:
        data = json.load(json_file)
    cameras = data['captures']

    rays = []
    colors = ['red', 'purple']
    # This loop only rebuilds `camera`; after it, `camera` and `camera_info`
    # refer to the last capture.
    for j, camera_info in enumerate(cameras):
        # print('camera info', camera_info['position'], camera_info['rotation'])
        q_ = camera_info['rotation']
        camera = Camera(
            position=camera_info['position'],
            quaternion=np.array([q_[3], q_[0], q_[1], q_[2]]),
            resolution=camera_info['dimension'],
            matrix=camera_info['matrix'],
        )

    # NOTE(review): the statements below sit outside the inner loop, so the
    # position is derived from the last camera's 3D annotation only -- confirm
    # this is intended.
    # bbox_3d = _find(camera_info['annotations'], 'bounding box 3D')
    bbox_3d = _find(camera_info['annotations'], '3D')
    q_ = camera.quaternion
    # Conjugate quaternion: the opposite mapping to Camera.world_to_camera.
    # The result is shifted by the camera position and lowered by 0.1 along
    # the second axis.
    ret = Camera.qm_2(
        np.array([q_[0], -q_[1], -q_[2], -q_[3]]),
        np.array(bbox_3d['translation'])
    ) + camera.position - np.array([0, 0.1, 0])

    # Reproject `ret` (derived from the annotation; nothing is predicted in
    # this cell) into each image.
    for j, camera_info in enumerate(cameras):
        q_ = camera_info['rotation']
        camera = Camera(
            position=camera_info['position'],
            quaternion=np.array([q_[3], q_[0], q_[1], q_[2]]),
            resolution=camera_info['dimension'],
            matrix=camera_info['matrix'],
        )
        # bbox_2d = _find(camera_info['annotations'], 'bounding box')
        bbox_2d = _find(camera_info['annotations'], '2D')
        # `center` holds the box origin and `size` half the box dimension.
        center, size = bbox_2d['origin'], np.array([bbox/2 for bbox in bbox_2d['dimension']])
        # center += size
        camera_coor = camera.world_to_camera([ret])
        pixel_coor = camera.camera_to_pixel(camera_coor)
        # Printed: box centre, reprojected pixel, and their L1 distance.
        print(
            center+size,
            pixel_coor[0],
            abs((center+size) - pixel_coor[0]).sum()
        )

    print('-'*30)
    break

In [None]:
# Display the `translation` entry of the most recently fetched 3D box annotation.
bbox_3d['translation']