In [2]:
import os
import sys
import time
import json
import torch
import logging
from collections import defaultdict
%matplotlib inline
import datetime
import torch.nn.functional as F


import numpy as np
from nuscenes.nuscenes import NuScenes
from nuscenes.utils import splits

In [15]:
# Camera channels whose sample_data record is read for every sample.
CAMERAS = (
    'CAM_FRONT',
    'CAM_FRONT_LEFT',
    'CAM_FRONT_RIGHT',
    'CAM_BACK',
    'CAM_BACK_LEFT',
    'CAM_BACK_RIGHT',
)

# One independent container per phase. The value expression is re-evaluated for
# every phase, so no list or defaultdict is shared between phases.
dic_jo = {
    phase: dict(X=[], Y=[], names=[], kps=[], boxes_3d=[], K=[],
                clst=defaultdict(lambda: defaultdict(list)))
    for phase in ('train', 'val', 'test')
}

# Per-image record, filled in while annotations are extracted.
dic_names = defaultdict(lambda: defaultdict(list))

# Path to your nuscenes folder
dir_nuscenes = '/data/bonnesoeur-data/data/nuscenes/'
# Type of dataset to preprocess
dataset = "nuscenes_teaser"
# Output directory
dir_out = './json'

In [16]:
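# Optional reset of the output directory (destructive: removes everything in it).
# Uncomment both lines to wipe and recreate it before a fresh run.
#! rm -rf $dir_out
#! mkdir $dir_out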

In [17]:
def extract_from_token(sd_token):
    """
    Collect the annotations of the selected categories for one sample_data token.

    Reads the image path, the boxes and the camera matrix from the module-level
    `nusc` object, keeps the boxes whose two-level category name is returned by
    `select_categories('car')`, and for each kept box computes:
      - the projected keypoints (`project_2d`),
      - the distance `dd` (Euclidean norm of the box center),
      - a 2D box [min_x, min_y, max_x, max_y] spanning the projected keypoints,
      - a 3D box made of the center followed by the wlh values.

    Side effect: appends the 2D box and the distance to `dic_names[name]` and
    stores the camera matrix there under 'K'.

    Returns:
        (name, boxes_gt, boxes_3d, dds, kk, keypoints) where `name` is the image
        file name, `kk` is the camera matrix as nested lists, and the four lists
        are aligned (one entry per kept box).
    """
    # box_vis_level=1: at least one corner of the box is visible
    path_im, boxes_obj, kk = nusc.get_sample_data(sd_token, box_vis_level=1)
    kk = kk.tolist()
    name = os.path.basename(path_im)

    # The set of accepted categories does not depend on the box: compute it once.
    wanted_categories = select_categories('car')

    boxes_gt = []
    dds = []
    boxes_3d = []
    keypoints = []

    for box_obj in boxes_obj:
        if box_obj.name[:6] != 'animal':
            # Keep only the first two levels, e.g. 'vehicle.car'
            parts = box_obj.name.split('.')
            general_name = parts[0] + '.' + parts[1]
        else:
            general_name = 'animal'

        if general_name not in wanted_categories:
            continue

        # Corners and center of the box in 2D image coordinates
        keypoint = project_2d(box_obj, kk)
        dd = np.linalg.norm(box_obj.center)
        bound = prepare_kps([keypoint])[0]
        box_gt = [min(bound[0]), min(bound[1]), max(bound[0]), max(bound[1])]
        box_3d = box_obj.center.tolist() + box_obj.wlh.tolist()

        dds.append(dd)
        boxes_3d.append(box_3d)
        keypoints.append(keypoint)
        boxes_gt.append(box_gt)

        dic_names[name]['boxes'].append(box_gt)
        dic_names[name]['dds'].append(dd)
        dic_names[name]['K'] = kk

    return name, boxes_gt, boxes_3d, dds, kk, keypoints

In [18]:
def process_keypoints(box):
    """
    Return the 3D keypoints of a box as three coordinate lists.

    The x, y and z lists come from `box.corners().tolist()`; the matching
    coordinate of `box.center` is added at the end of each list.
    """
    center_x, center_y, center_z = box.center
    _width, _length, _height = box.wlh  # unpacked as in the original, not used
    xs, ys, zs = box.corners().tolist()

    return [xs + [center_x], ys + [center_y], zs + [center_z]]

In [19]:
def project_2d(box, kk, im_width=1600, im_height=900):
    """
    Project a 3D bounding box into the 2D image plane using the corners and the center of the box.

    Args:
        box: object exposing `center` (three values) and `corners()`, where
            `corners().tolist()` unpacks into the x, y and z coordinate lists.
        kk: 3x3 camera matrix (nested lists or array) applied with `np.dot`.
        im_width: image width in pixels used to clamp the u coordinate.
        im_height: image height in pixels used to clamp the v coordinate.
            The defaults reproduce the previously hard-coded 1600x900 image.

    Returns:
        Flat list [u0, v0, u1, v1, ...] with the corners first and the center last.
        Negative coordinates become 0 and coordinates >= the image size become
        size - 1; everything else is returned unchanged.
    """
    def _clamp(value, size):
        # Same rule as the original inline expression, with the size as a parameter
        return 0 if value < 0 else size - 1 if value >= size else value

    xs, ys, zs = box.corners().tolist()
    xc, yc, zc = box.center

    # One (x, y, z) row per point: the corners followed by the center
    points_3d = np.array(list(zip(xs + [xc], ys + [yc], zs + [zc])))

    box_2d = []
    # Project the points and convert them into pixel coordinates
    for xyz in points_3d:
        xx, yy, zz = np.dot(kk, xyz)
        box_2d.append(_clamp(xx / zz, im_width))
        box_2d.append(_clamp(yy / zz, im_height))
    return box_2d

In [20]:
def append_cluster(dic_jo, phase, xx, dd, kps):
    """
    Append the annotation to the distance cluster it belongs to.

    Clusters are keyed by their upper bound: '10' for dd <= 10, '20' for
    10 < dd <= 20, '30' for 20 < dd <= 30 and '>30' for everything else.
    Each annotation is stored in exactly one cluster.

    Args:
        dic_jo: container indexed as dic_jo[phase]['clst'][cluster][field].
        phase: key of the phase to update.
        xx: network input of the annotation, stored under 'X'.
        dd: distance of the annotation, stored under 'Y' wrapped in a list.
        kps: keypoints of the annotation, stored under 'kps'.
    """
    if dd <= 10:
        key = '10'
    elif dd <= 20:
        # This branch was missing: close annotations were stored twice ('10' and
        # '20') and the 10-20 range fell through to the '30' cluster.
        key = '20'
    elif dd <= 30:
        key = '30'
    else:
        key = '>30'

    cluster = dic_jo[phase]['clst'][key]
    cluster['kps'].append(kps)
    cluster['X'].append(xx)
    cluster['Y'].append([dd])

In [21]:
def select_categories(cat):
    """
    Return the list of category names to extract annotations from.

    `cat` must be one of 'person', 'all', 'car' or 'cyclist'; any other value
    trips the assertion. A new list is returned on every call.
    """
    assert cat in ['person', 'all', 'car', 'cyclist']

    # Rebuilt on each call so callers never share a list
    categories_by_choice = {
        'person': ['human.pedestrian'],
        'all': ['human.pedestrian', 'vehicle.bicycle', 'vehicle.motorcycle'],
        'cyclist': ['vehicle.bicycle'],
        'car': ['vehicle.car', 'vehicle.truck'],
    }
    return categories_by_choice[cat]

In [22]:
def run():
    """
    Prepare arrays for training and write them to disk.

    Walks every scene in the module-level `scenes`, assigns it to the 'train' or
    'val' phase from `split_train` / `split_val` (other scenes are skipped), and
    for every sample and every camera in `CAMERAS` extracts the annotations with
    `extract_from_token`. Each annotation whose network input has 18 values is
    appended to `dic_jo[phase]` and to its distance cluster.

    When all scenes are processed, `dic_jo` is dumped to `path_joints` and
    `dic_names` to `path_names` as JSON. The parent directories of both paths are
    created up front, so a missing output folder cannot make the final write fail
    after the whole dataset has been processed.
    """
    # Make sure the output locations exist before doing any expensive work
    for out_path in (path_joints, path_names):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    cnt_scenes = cnt_samples = cnt_sd = cnt_ann = 0
    start = time.time()
    for ii, scene in enumerate(scenes):
        end_scene = time.time()
        current_token = scene['first_sample_token']
        cnt_scenes += 1
        # Estimate = duration of the previous scene times the scenes still to do.
        # start_scene is only bound after the first iteration, hence the ii guard.
        time_left = str((end_scene - start_scene) / 60 * (len(scenes) - ii))[:4] if ii != 0 else "NaN"

        sys.stdout.write('\r' + 'Elaborating scene {}, remaining time {} minutes'
                         .format(cnt_scenes, time_left) + '\t\n')
        start_scene = time.time()
        if scene['name'] in split_train:
            phase = 'train'
        elif scene['name'] in split_val:
            phase = 'val'
        else:
            print("phase name not in training or validation split")
            continue

        # Follow the 'next' links until the last sample of the scene
        while current_token != "":
            sample_dic = nusc.get('sample', current_token)
            cnt_samples += 1

            # Extract all the sample_data tokens for each sample
            for cam in CAMERAS:
                sd_token = sample_dic['data'][cam]
                cnt_sd += 1

                # Extract the annotations; kk is the camera matrix as nested lists
                name, boxes_gt, boxes_3d, dds, kk, keypoints = extract_from_token(sd_token)

                if not keypoints:
                    continue

                # Flat list of 18 values -> [xs, ys, ones] with 9 values each
                keypoints = prepare_kps(keypoints)
                inputs = preprocess_monoloco(keypoints, kk).tolist()
                for _box_gt, box_3d, keypoint, dd, my_input in zip(boxes_gt, boxes_3d, keypoints, dds, inputs):
                    # Filter first: appending 'kps' before this check left it
                    # misaligned with 'X', 'Y' and 'names' for skipped annotations.
                    if len(my_input) != 18:
                        continue

                    dic_jo[phase]['kps'].append(keypoint)
                    dic_jo[phase]['X'].append(my_input)
                    dic_jo[phase]['Y'].append(dd)  # scalar here; the cluster lists store [dd]
                    dic_jo[phase]['names'].append(name)  # One image name for each annotation
                    dic_jo[phase]['boxes_3d'].append(box_3d)
                    dic_jo[phase]['K'].append(kk)
                    append_cluster(dic_jo, phase, my_input, dd, keypoint)
                    cnt_ann += 1
                    sys.stdout.write('\r' + 'Saved annotations {}'.format(cnt_ann) + '\t')

            current_token = sample_dic['next']

    with open(path_joints, 'w') as f:
        json.dump(dic_jo, f)
    with open(path_names, 'w') as f:
        json.dump(dic_names, f)
    end = time.time()

    print("\nSaved {} annotations for {} samples in {} scenes. Total time: {:.1f} minutes"
          .format(cnt_ann, cnt_samples, cnt_scenes, (end-start)/60))
    print("\nOutput files:\n{}\n{}\n".format(path_names, path_joints))

In [23]:
def preprocess_monoloco(keypoints, kk):
    """
    Preprocess a batch of keypoints into zero-centered network inputs.

    Args:
        keypoints: tensor, array or nested list of shape (m, 3, n), or a single
            instance of shape (3, n). Rows 0 and 1 hold the pixel coordinates;
            the third row is ignored here.
        kk: 3x3 camera matrix as a tensor or nested list.

    Returns:
        Tensor of shape (m, 2 * n). Every keypoint is back-projected with
        `pixel_to_camera` at a fixed depth of 10, the back-projected center of
        the keypoints' bounding box is subtracted, and the x, y components are
        flattened per instance. A single (3, n) input yields shape (1, 2 * n).
    """
    if isinstance(keypoints, (list, np.ndarray)):
        keypoints = torch.tensor(keypoints)
    if isinstance(kk, list):
        kk = torch.tensor(kk)
    if len(keypoints.shape) == 2:
        # A single instance needs a batch dimension for the 3-index slicing below
        keypoints = keypoints.unsqueeze(0)

    # Projection in normalized image coordinates and zero-center with the center of the bounding box
    uv_center = get_keypoints(keypoints, mode='center')
    xy1_center = pixel_to_camera(uv_center, kk, 10)
    xy1_all = pixel_to_camera(keypoints[:, 0:2, :], kk, 10)
    kps_norm = xy1_all - xy1_center.unsqueeze(1)  # (m, n, 3) - (m, 1, 3)
    # reshape rather than view: the slice is not contiguous
    kps_out = kps_norm[:, :, 0:2].reshape(kps_norm.size()[0], -1)
    return kps_out

def prepare_kps(kps_in):
    """
    Split every flat [x0, y0, x1, y1, ...] sequence into [xs, ys, ones].

    For the flat lists of 18 values used here this gives three lists of 9. The
    third list is filled with 1 and is as long as the y list. A sequence with an
    odd number of values trips the assertion.
    """
    prepared = []
    for flat in kps_in:
        assert len(flat) % 2 == 0, "keypoints expected as a multiple of 2"
        xs = flat[0::2]   # even positions
        ys = flat[1::2]   # odd positions
        prepared.append([xs, ys, [1] * len(ys)])
    return prepared


def pixel_to_camera(uv_tensor, kk, z_met):
    """
    Back-project pixel coordinates to camera coordinates at depth `z_met`.

    `uv_tensor` may be a list, a numpy array or a torch tensor whose last
    dimension is 2, i.e. (m, 2) or (m, x, 2). A 3D input with another last
    dimension is permuted with (0, 2, 1) first and must then end in 2.
    `kk` is the camera matrix (tensor or nested list).

    Each point is padded with a trailing 1, multiplied by the transposed inverse
    of `kk` and scaled by `z_met`, so the result has 3 as its last dimension.
    """
    if isinstance(uv_tensor, (list, np.ndarray)):
        uv_tensor = torch.tensor(uv_tensor)
    if isinstance(kk, list):
        kk = torch.tensor(kk)

    if uv_tensor.shape[-1] != 2:
        # Bring the coordinate axis last so that it is the one being padded
        uv_tensor = uv_tensor.permute(0, 2, 1)
        assert uv_tensor.shape[-1] == 2, "Tensor size not recognized"

    # (u, v) -> (u, v, 1): pad only the end of the last dimension
    homogeneous = F.pad(uv_tensor, pad=(0, 1), mode="constant", value=1)

    inverse_kk_t = torch.inverse(kk).t()
    return torch.matmul(homogeneous, inverse_kk_t) * z_met


def get_keypoints(keypoints, mode):
    """
    Reduce each set of keypoints to one 2D reference point.

    Input  --> list or torch/numpy tensor of shape (m, 3, n) or (3, n)
    Output --> torch.tensor of shape (m, 2)

    Modes:
        'center'   midpoint between the min and the max of the x and y rows
        'bottom'   x as for 'center', y taken from the max of the y row
        'head', 'shoulder', 'hip', 'ankle'
                   mean over keypoint indices 0:5, 5:7, 11:13 and 15:17
    """
    if isinstance(keypoints, (list, np.ndarray)):
        keypoints = torch.tensor(keypoints)
    if keypoints.dim() == 2:  # add batch dim
        keypoints = keypoints.unsqueeze(0)

    assert keypoints.dim() == 3 and keypoints.size(1) == 3, "tensor dimensions not recognized"
    assert mode in ['center', 'bottom', 'head', 'shoulder', 'hip', 'ankle']

    coords = keypoints[:, 0:2, :]  # x and y rows only

    if mode in ('center', 'bottom'):
        # max/min return (values, indices); only the values are needed
        upper = coords.max(2)[0]
        lower = coords.min(2)[0]
        if mode == 'center':
            return (upper - lower) / 2 + lower  # (m, 2) as keepdims is False
        # bottom center for kitti evaluation
        mid_x = (upper[:, 0:1] - lower[:, 0:1]) / 2 + lower[:, 0:1]
        return torch.cat((mid_x, upper[:, 1:2]), -1)

    index_ranges = {
        'head': slice(0, 5),
        'shoulder': slice(5, 7),
        'hip': slice(11, 13),
        'ankle': slice(15, 17),
    }
    return coords[:, :, index_ranges[mode]].mean(2)  # (m, 2)


In [24]:
def factory(dataset, dir_nuscenes):
    """
    Define dataset type and split training and validation.

    Args:
        dataset: one of 'nuscenes', 'nuscenes_mini' or 'nuscenes_teaser'.
            'nuscenes_mini' loads version 'v1.0-mini'; the others load
            'v1.0-trainval'.
        dir_nuscenes: dataset root passed to `NuScenes` as `dataroot`.

    Returns:
        (nusc, scenes, split_train, split_val). For 'nuscenes_teaser', `scenes`
        is restricted to the tokens listed in ./splits/nuscenes_teaser_scenes.txt
        and the two splits are the names of the scenes whose token appears under
        'train' / 'val' in ./splits/split_nuscenes_teaser.json. Otherwise the
        splits come from `splits.create_splits_scenes()`.
    """
    assert dataset in ['nuscenes', 'nuscenes_mini', 'nuscenes_teaser']
    version = 'v1.0-mini' if dataset == 'nuscenes_mini' else 'v1.0-trainval'

    nusc = NuScenes(version=version, dataroot=dir_nuscenes, verbose=True)
    scenes = nusc.scene

    if dataset == 'nuscenes_teaser':
        # Sets give constant-time membership tests inside the comprehensions below
        with open("./splits/nuscenes_teaser_scenes.txt", "r") as scenes_file:
            teaser_tokens = set(scenes_file.read().splitlines())
        scenes = [scene for scene in scenes if scene['token'] in teaser_tokens]

        with open("./splits/split_nuscenes_teaser.json", "r") as split_file:
            dic_split = json.load(split_file)
        train_tokens = set(dic_split['train'])
        val_tokens = set(dic_split['val'])
        split_train = [scene['name'] for scene in scenes if scene['token'] in train_tokens]
        split_val = [scene['name'] for scene in scenes if scene['token'] in val_tokens]
    else:
        split_scenes = splits.create_splits_scenes()
        split_train, split_val = split_scenes['train'], split_scenes['val']

    return nusc, scenes, split_train, split_val

In [25]:
# Time stamp with a two-digit year, shared by both output file names
now = datetime.datetime.now()
now_time = now.strftime("%y%m%d-%H%M")

# <dir_out>/joints-<dataset>-<stamp>.json and <dir_out>/names-<dataset>-<stamp>.json
path_joints = os.path.join(dir_out, 'joints-{}-{}.json'.format(dataset, now_time))
path_names = os.path.join(dir_out, 'names-{}-{}.json'.format(dataset, now_time))

# Load the dataset tables and the train/val scene names
nusc, scenes, split_train, split_val = factory(dataset, dir_nuscenes)

Loading NuScenes tables for version v1.0-trainval...
23 category,
8 attribute,
4 visibility,
64386 instance,
12 sensor,
10200 calibrated_sensor,
2631083 ego_pose,
68 log,
850 scene,
34149 sample,
2631083 sample_data,
1166187 sample_annotation,
4 map,
Done loading in 35.2 seconds.
Reverse indexing ...
Done reverse indexing in 11.6 seconds.


In [26]:
# Process every scene and write the joints and names JSON files
# (uses the globals set up in the cells above: nusc, scenes, splits, paths).
run()

Elaborating scene 1, remaining time NaN minutes	
Elaborating scene 2, remaining time 1.71 minutes	
Elaborating scene 3, remaining time 3.56 minutes	
Elaborating scene 4, remaining time 9.32 minutes	
Elaborating scene 5, remaining time 2.99 minutes	
Elaborating scene 6, remaining time 5.45 minutes	
Elaborating scene 7, remaining time 2.40 minutes	
Elaborating scene 8, remaining time 1.86 minutes	
Elaborating scene 9, remaining time 1.17 minutes	
Elaborating scene 10, remaining time 1.16 minutes	
Elaborating scene 11, remaining time 2.27 minutes	
Elaborating scene 12, remaining time 0.33 minutes	
Elaborating scene 13, remaining time 1.30 minutes	
phase name not in training or validation split
Elaborating scene 14, remaining time 4.00 minutes	
phase name not in training or validation split
Elaborating scene 15, remaining time 2.54 minutes	
Elaborating scene 16, remaining time 2.42 minutes	
Elaborating scene 17, remaining time 5.96 minutes	
Elaborating scene 18, remaining time 7.43 minutes