# Perform inference

This notebook performs inference with CNN-ViT models. It can run on both hosted and local runtimes.

# Preliminaries

Install required packages.

In [None]:
!pip install git+https://github.com/Microsatellites-and-Space-Microsystems/pose_estimation_domain_gap --quiet

Provide access to Google Drive.

In [None]:
# google.colab is only importable on hosted Colab runtimes. On a local runtime
# the mount is skipped: point the paths in the next cell to local folders.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available: skipping the Google Drive mount.')
else:
    drive.mount('/content/gdrive')

Set the paths to the network weights, the test images folder, and the output JSON file.

In [None]:
# Weights file restored into the network further below.
weight_path = '/content/gdrive/MyDrive/.../my_first_CNN_ViT.h5'

# Test domain to process: 'sunlamp' or 'lightbox'.
inference = 'sunlamp'
# Folder holding the test images of the selected domain.
images_folder = f'/content/gdrive/MyDrive/SPEC21_test_images/{inference}'
# Destination of the results (written in JSON format).
json_dest_file = f'/content/gdrive/MyDrive/my_first_cnnvit_inference_{inference}.json'

# Initialize model

Initialize the encoder (EfficientNet backbone + ViT).

In [None]:
from models_and_layers.efficientnet_lite import EfficientNetLiteB4
from models_and_layers.vit_layers import AddPositionEmbs, TransformerBlock
import tensorflow as tf

#Code adapted from https://github.com/faustomorales/vit-keras
#Licensed under Apache 2.0 license
#Removed classToken

def build_encoder(
    input_shape=(320, 512, 3),
    patch_size=4,
    num_layers=6,
    hidden_size=256,
    num_heads=8,
    mlp_dim=2048,
    dropout=0.1
):
    """Build the encoder: EfficientNetLiteB4 features followed by transformer blocks.

    No class token is used; the token sequence is averaged at the end.

    Args:
        input_shape: Size of the input images. Only the first two entries
            (height, width) are read; the channel count is fixed to 3.
        patch_size: Kernel size and stride of the "embedding" convolution
            applied to the backbone feature map.
        num_layers: Number of transformer blocks to stack.
        hidden_size: Number of filters of the "embedding" convolution.
        num_heads: Number of heads passed to each transformer block.
        mlp_dim: MLP dimension passed to each transformer block.
        dropout: Dropout value passed to each transformer block.

    Returns:
        A tf.keras Model mapping the image input to the pooled encoder output.
    """
    height, width = input_shape[0], input_shape[1]
    image_input = tf.keras.layers.Input(shape=(height, width, 3))

    # Convolutional backbone, created without pretrained weights and without top.
    backbone = EfficientNetLiteB4(
        weights=None,
        input_shape=(height, width, 3),
        include_top=False,
    )
    feature_map = backbone(image_input)

    # Project the feature map to hidden_size channels, one output per patch.
    patch_embedding = tf.keras.layers.Conv2D(
        filters=hidden_size,
        kernel_size=patch_size,
        strides=patch_size,
        padding="valid",
        name="embedding",
        kernel_initializer=tf.keras.initializers.GlorotUniform(seed=1116),
    )
    tokens = patch_embedding(feature_map)

    # Flatten the two spatial axes into a single sequence axis.
    sequence_length = tokens.shape[1] * tokens.shape[2]
    tokens = tf.keras.layers.Reshape((sequence_length, hidden_size))(tokens)

    tokens = AddPositionEmbs(name="Transformer/posembed_input")(tokens)
    for n in range(num_layers):
        block = TransformerBlock(
            num_heads=num_heads,
            mlp_dim=mlp_dim,
            dropout=dropout,
            name=f"Transformer/encoderblock_{n}",
        )
        # The second value returned by the block is not needed here.
        tokens, _ = block(tokens)

    final_norm = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, name="Transformer/encoder_norm"
    )
    tokens = final_norm(tokens)

    # Average over the sequence axis: one feature vector per image.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(tokens)

    return tf.keras.models.Model(inputs=image_input, outputs=pooled)

Initialize regression head.

In [None]:
class kpts_regressor(tf.keras.Model):
    """Regression head that maps encoder features to keypoint coordinates.

    The head is a Sequential stack: dropout, dense (gelu), dropout,
    dense (gelu) with half the width, and a final dense layer named 'kpts'.

    Args:
        hidden_dim: Width of the first dense layer; the second one uses
            half of it (rounded down).
        num_keypoints: Number of keypoints; the output layer has two units
            per keypoint.
    """

    def __init__(self, hidden_dim, num_keypoints):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_keypoints = num_keypoints
        self.basic_layers = tf.keras.Sequential(
            [
                tf.keras.layers.Dropout(0.1, seed=43),
                tf.keras.layers.Dense(
                    self.hidden_dim,
                    activation='gelu',
                    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=9001),
                ),
                tf.keras.layers.Dropout(0.1, seed=819),
                # Dense expects an integer number of units: `/` would pass a
                # float, so use floor division and cast explicitly.
                tf.keras.layers.Dense(
                    int(self.hidden_dim // 2),
                    activation='gelu',
                    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=901),
                ),
                # Two outputs per keypoint (22 for the 11 keypoints used in this notebook).
                tf.keras.layers.Dense(
                    2 * self.num_keypoints,
                    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=976),
                    name='kpts',
                ),
            ]
        )

    def call(self, x):
        """Apply the regression stack to the input features."""
        return self.basic_layers(x)

Build the model and restore weights.

In [None]:
# Hyper-parameters (labelled "ViT tiny" by the original author).
# hidden_dim is used both as the encoder hidden_size and as the regressor width.
hidden_dim=192
num_keypoints = 11
input_shape=[320, 512, 3]
# Input placeholder of the full network: height and width from input_shape, 3 channels.
inputlayer=tf.keras.layers.Input(shape=(input_shape[0], input_shape[1], 3))

# Build the encoder and apply it to the input placeholder.
# NOTE(review): `test` is bound to the same output as `encoder` and is not
# used again in the visible cells.
encoder=test=build_encoder(input_shape=(320, 512, 3),
    patch_size=1,
    num_layers=1,
    hidden_size=hidden_dim,
    num_heads=3,
    mlp_dim=hidden_dim*3,
    dropout=0.1
  )(inputlayer)
# Wrap the encoder output in a Model, then stack the keypoint regression head on top.
encoder=tf.keras.models.Model([inputlayer], [encoder])
regressor_kpts = kpts_regressor(hidden_dim,num_keypoints)(encoder.output)
network=tf.keras.models.Model([encoder.input], [regressor_kpts])

# Restore the weights
# NOTE(review): presumably the layer structure built above has to match the one
# used when the weights file was saved — confirm before restructuring this cell.

network.load_weights(weight_path)

# Tango model and camera matrix

In [None]:
# SPEED 3D model and camera parameters

import numpy as np

# Camera matrix (SPEED+ values), later passed to OpenCV as `cameraMatrix`.
cameraMatrix = np.array(
    [
        [2988.579516381556, 0, 960],
        [0, 2988.340115917612, 600],
        [0, 0, 1],
    ]
)

# Distortion coefficients, ordered as (k1, k2, p1, p2[, k3]).
distCoeffs = np.array(
    [
        -0.223830166065107,
        0.514097970891064,
        -6.649961199834066e-04,
        -2.140477166748459e-04,
        -0.131242274290774,
    ]
)

# Keypoint coordinates in Tango's frame.
# TODO: define "objectPoints" below as a NumPy array of size num_keypoints x 3
# (x, y, z coordinates) holding the satellite 3D model (keypoint coordinates).
# Until it is defined, the reshape on the last line raises a NameError.

# objectPoints = ...
objectPoints = objectPoints.reshape(11, 3)


# Load images

In [None]:
import os

# Collect the full paths of the .jpg files found directly inside images_folder.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# the processing order (and the exported results) identical on every run.
candidate_paths = (
    os.path.join(images_folder, entry) for entry in os.listdir(images_folder)
)
img = sorted(
    candidate
    for candidate in candidate_paths
    if os.path.isfile(candidate) and os.path.splitext(candidate)[1] == '.jpg'
)

print(len(img))

# Run inference

In [None]:
import time
from PIL import Image
import cv2
from scipy.spatial.transform import Rotation as Rot

# Image size in pixels, used to rescale the network outputs to pixel coordinates.
image_width = 1920
image_height = 1200

np.set_printoptions(threshold=60000)

num_keypoints = 11

# Per-image buffers read by the export cell (row i belongs to images_names[i]).
# Rows are pre-filled with zeros and overwritten as each image is processed.
images_names = []

image_overall_time = np.zeros((len(img), 1))
export_keypoints = np.zeros((len(img), num_keypoints * 2))
network_inference_time = np.zeros((len(img), 1))

load_image_time = np.zeros((len(img), 1))

export_PnP_success = np.zeros((len(img), 1))
export_position = np.zeros((len(img), 3))
export_quat = np.zeros((len(img), 4))

export_inliers_nr = np.zeros((len(img), 1))

export_position_LM = np.zeros((len(img), 3))
export_quat_LM = np.zeros((len(img), 4))
PnP_time = np.zeros((len(img), 1))
LM_time = np.zeros((len(img), 1))

# NOTE(review): not used in the visible cells; kept in case another cell reads it.
inliers_indices = np.zeros((1, 11))

for i, image_path in enumerate(img):
    print(i)

    images_names.append(os.path.basename(image_path))
    image_time_start = time.time()

    # Read the pixels, then release the file handle right away.
    with Image.open(image_path) as image_pil:
        image = np.asarray(image_pil)
    # Add a trailing channel axis (assumes single-channel images, as required
    # by grayscale_to_rgb below).
    image = np.expand_dims(image, -1)

    load_image_time[i, :] = time.time() - image_time_start

    # Pre-processing: resize to the network input size, normalise, replicate
    # the single channel three times and add the batch axis.
    image = tf.image.resize(
        image,
        [320, 512],
        method=tf.image.ResizeMethod.BILINEAR,
        antialias=False,
    )
    image = (image - 127.00) / 128.00
    image = tf.image.grayscale_to_rgb(image)
    image = np.expand_dims(image, 0)

    start = time.time()
    output = network(image)
    network_inference_time[i, :] = time.time() - start

    # One (x, y) row per keypoint, rescaled to pixel coordinates.
    keypoints = output.numpy().reshape(num_keypoints, 2)
    keypoints[:, 0] *= image_width
    keypoints[:, 1] *= image_height

    export_keypoints[i, :] = keypoints.reshape(1, num_keypoints * 2)

    # Pose from 2D-3D correspondences: EPnP inside a RANSAC loop.
    start = time.time()
    success, R_vec, t_vec, inliers = cv2.solvePnPRansac(
        objectPoints,
        keypoints,
        cameraMatrix,
        distCoeffs,
        flags=cv2.SOLVEPNP_EPNP,
        reprojectionError=5,
    )
    PnP_time[i, :] = time.time() - start

    if not success or inliers is None or len(inliers) == 0:
        # RANSAC did not produce a valid pose: there are no inliers to index
        # with and the returned vectors are not a usable estimate. Leave the
        # pose rows of this image at zero (PnP_success stays 0) and move on.
        image_overall_time[i, :] = time.time() - image_time_start
        continue

    export_PnP_success[i, :] = success
    export_inliers_nr[i, :] = len(inliers)

    # Rotation vector -> rotation matrix -> quaternion (as returned by SciPy's as_quat).
    Rotation_matrix, _ = cv2.Rodrigues(R_vec)
    quat = Rot.from_matrix(Rotation_matrix).as_quat()

    export_position[i, :] = t_vec.transpose()
    export_quat[i, :] = quat

    # Levenberg-Marquardt refinement of the pose, using the RANSAC inliers only.
    start = time.time()
    R_vec, t_vec = cv2.solvePnPRefineLM(
        objectPoints[inliers, :],
        keypoints[inliers, :],
        cameraMatrix,
        distCoeffs,
        R_vec,
        t_vec,
    )
    LM_time[i, :] = time.time() - start

    Rotation_matrix_LM, _ = cv2.Rodrigues(R_vec)
    quat_LM = Rot.from_matrix(Rotation_matrix_LM).as_quat()
    export_position_LM[i, :] = t_vec.transpose()
    export_quat_LM[i, :] = quat_LM

    image_overall_time[i, :] = time.time() - image_time_start

In [None]:
import json

# Arrays filled by the inference loop, keyed by the name used in the JSON file.
# Row i of every array belongs to images_names[i]; the insertion order below is
# the order of the keys in each exported record.
per_image_fields = {
    'network_inference_time': network_inference_time,
    'keypoints': export_keypoints,
    'PnP_success': export_PnP_success,
    'PnP_inliers_nr': export_inliers_nr,
    'PnP_time': PnP_time,
    'LM_time': LM_time,
    'Load_time': load_image_time,
    'position': export_position,
    'quaternions': export_quat,
    'position_LM': export_position_LM,
    'quaternions_LM': export_quat_LM,
    'overall_image_time': image_overall_time,
}

# One record per processed image. Starting from an empty list means that a run
# with no images writes an empty JSON list instead of failing on an undefined
# (or stale) data_all.
data_all = []
for i, image_name in enumerate(images_names):
    record = {'image': image_name}
    for field_name, values in per_image_fields.items():
        record[field_name] = values[i].tolist()
    data_all.append(record)

parsed = json.dumps(data_all, indent=4)

with open(json_dest_file, 'w') as f:
    f.write(parsed)