# Single Estimation

This notebook shows the full pipeline for estimating the 6D pose of an object given an
RGB image $\mathcal{I}$, its matching depth map $\mathcal{D}$, and an object segmentation mask $\mathcal{M}$.

Major TODOs:
* [ ] How can we pass a single reference image with its depth map and segmentation mask and still recover the pose?
* [ ] Why is an .obj file required, and can we recover the same information from a .ply point cloud?
* [ ]  

To inspect what this point cloud looks like, open it in CloudCompare:
```/usr/bin/CloudCompare```

In [1]:
# Imports
from matplotlib import pyplot as plt
import os
from pathlib import Path
# import numpy as np
import torch
# import torch.utils.model_zoo
import math

# LatentFusion
from latentfusion.recon.inference import Observation
from latentfusion.datasets.realsense import RealsenseDataset
import latentfusion.visualization as viz
from latentfusion.augment import gan_denormalize
from latentfusion import meshutils
from latentfusion import augment
from latentfusion.recon.inference import LatentFusionModel
from latentfusion.three.orientation import evenly_distributed_quats
import latentfusion.pose.estimation as pe



In [2]:
# Global configuration: GPU selection, compute device and dataset location.
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Prefer the visible GPU; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Root directory of the MOPED dataset, relative to the working directory.
MOPED_PATH = Path('datasets') / 'moped'
# Number of reference images to use.
num_ref_views = 8

Using device: cuda:0


In [3]:
# Load the pre-trained LatentFusion checkpoint and build the model from it.
# map_location remaps the stored tensors onto `device`, so loading also works
# on a CPU-only machine or when the checkpoint was saved from another GPU.
# NOTE(review): weights_only=False unpickles arbitrary Python objects -- only
# load checkpoint files that come from a trusted source.
checkpoint = torch.load('weights/latentfusion-release.pth',
                        map_location=device,
                        weights_only=False)
model = LatentFusionModel.from_checkpoint(checkpoint, device)

[2m2024-10-03 22:59.39[0m [[32m[1minfo     [0m] [1mloaded model                  [0m [[0m[1m[34mlatentfusion.recon.inference[0m][0m [36mepoch[0m=[35m200[0m [36mname[0m=[35mshapenet,no_mask_morph,fixed_eqlr,256,mask,depth,in_mask,mask_noise_p=0.25,sm=nearest,fuser=gru-branched_20200509_10h19m10s-branched_20200509_10h42m53s-branched_20200509_10h46m53s-branched_20200509_10h48m49s[0m


In [5]:
# Object to process and the index of the evaluation frame to use.
object_id = 'toy_plane'
frame_idx = 20

# Per-object scene directories.
# TODO check if we can get away without using the ground truth
object_root = MOPED_PATH / object_id
input_scene_dir = object_root / 'reference'    # Ground-truth
target_scene_dir = object_root / 'evaluation'  # Input

# Load the object's mesh file from the reference scene.
# TODO: why are we using the object's point cloud?
pointcloud_path = input_scene_dir / 'integrated_registered_processed.obj'
obj = meshutils.Object3D(pointcloud_path)
pointcloud = torch.tensor(obj.vertices, dtype=torch.float32)  # May not be used anywhere?

# object_scale is the reciprocal of the bounding diameter;
# object_scale_to_meters is the reciprocal of object_scale.
diameter = obj.bounding_diameter
object_scale = 1.0 / diameter
object_scale_to_meters = 1.0 / object_scale

print(f"diameter: {diameter}")
print(f"object_scale: {object_scale}")
print(f"object_scale_to_meters: {object_scale_to_meters}")

diameter: 0.3425245885532891
object_scale: 2.919498434327504
object_scale_to_meters: 0.3425245885532891


In [6]:
# TODO find out why we need both .obj file and how it was made?
# TODO the function of RealsenseDataset class??
# Collect the reference sequence directories. iterdir() yields entries in an
# arbitrary, filesystem-dependent order, so sort them (as is done for the
# evaluation paths) to keep the evenly-sampled reference views reproducible
# across machines and runs.
input_paths = sorted(x for x in input_scene_dir.iterdir() if x.is_dir())
input_dataset = RealsenseDataset(input_paths,
                                 image_scale=1.0,
                                 object_scale=object_scale,
                                 odometry_type='open3d')

[2m2024-10-03 23:01.52[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/reference/02/registration/registration.json[0m
[2m2024-10-03 23:01.52[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/reference/02/registration/registration.json[0m
[2m2024-10-03 23:01.52[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/reference/06/registration/registration.json[0m
[2m2024-10-03 23:01.52[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/reference/06/registration/registration.json[0m
[2m2024-10-03 23:01.52[0m [[32m[1minfo  

In [7]:
# Evaluation sequence directories, in sorted order.
target_paths = sorted(p for p in target_scene_dir.iterdir() if p.is_dir())

# Dataset over the evaluation sequences, scaled with the same object_scale
# that was derived from the object's bounding diameter.
target_dataset = RealsenseDataset(
    target_paths,
    image_scale=1.0,
    object_scale=object_scale,  # Can we just use 1.0 here??
    odometry_type='open3d',
    use_registration=True,
)
                                  

[2m2024-10-03 23:01.55[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/evaluation/00/registration/registration.json[0m
[2m2024-10-03 23:01.55[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/evaluation/00/registration/registration.json[0m
[2m2024-10-03 23:01.55[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/evaluation/01/registration/registration.json[0m
[2m2024-10-03 23:01.55[0m [[32m[1minfo     [0m] [1musing registration            [0m [[0m[1m[34mlatentfusion.datasets.realsense[0m][0m [36mpath[0m=[35mdatasets/moped/toy_plane/evaluation/01/registration/registration.json[0m
[2m2024-10-03 23:01.55[0m [[32m[1min

In [8]:
# Batch input and target observations
# TODO what is Observation.from_dataset doing?
# Reference observations: num_ref_views indices chosen by sample_evenly().
input_obs = Observation.from_dataset(input_dataset, inds=input_dataset.sample_evenly(num_ref_views))

# Target observation: the single evaluation frame selected by frame_idx.
# Indexing a range (instead of slicing a list) resolves negative indices and
# raises on an out-of-range frame_idx, where a slice would silently yield an
# empty index list.
try:
    target_ind = range(len(target_dataset))[frame_idx]
except IndexError:
    raise IndexError(
        f"frame_idx={frame_idx} is out of range for a target dataset "
        f"with {len(target_dataset)} frames") from None
target_obs = Observation.from_dataset(target_dataset, inds=[target_ind])



In [9]:
# Preprocess observations
# TODO what does model.preprocess_observation do?
# The calls run in this order: input_obs, input_obs again, then target_obs.
# NOTE(review): input_obs_pp_gt comes from a second, identical call on
# input_obs -- presumably to keep a separate copy for the reconstruction-error
# comparison; confirm whether the duplicate call is needed.
input_obs_pp, input_obs_pp_gt, target_obs_pp = map(
    model.preprocess_observation,
    (input_obs, input_obs, target_obs),
)

  grids = bboxes_to_grid(boxes, in_size, out_size)


## Step 1: Create latent representation of the new object.

In [10]:
# Build the latent object from the preprocessed reference observations and
# render it back from the same cameras; gradients are not needed here.
with torch.no_grad():
    z_obj = model.build_latent_object(input_obs_pp)

    # Visualize prediction.
    camera = input_obs_pp.camera.clone()
    y, z = model.render_latent_object(z_obj, camera.to(device))

# Reconstruction error: mean absolute difference between the rendered depth
# and the preprocessed reference depth.
# For completely unseen objects, we will not be able to do this.
depth_pred = y['depth'].detach().cpu()
depth_diff = depth_pred - input_obs_pp_gt.depth
recon_error = depth_diff.abs()
print('recon_error', recon_error.mean().item())

recon_error 0.043014854192733765


## Step 2: For the given reference, find a coarse pose estimate

## Step 3: For the coarse estimates, obtain a fine pose estimate