In [1]:
%load_ext autoreload
%autoreload 2

# 1. Patch descriptors register in 3D for 642 templates

In [2]:
import glob
import os

# Load templates
template_path = "datasets/bop23_challenge/datasets/templates_pyrender/icbin_642/obj_000001"
template_files = sorted(glob.glob(os.path.join(template_path, "*.png")), key=os.path.getmtime)

In [3]:
from PIL import Image
import numpy as np

templates = [np.array(Image.open(template_file).convert("RGB").resize((420,420)))[:,:,:3] for template_file in template_files] # This image has 4 channels- the last one is not crucial - maybe about opacity

In [4]:
import torchvision.transforms as T
import torch.nn.functional as F
import torch

def patches_feature_extraction(template_patches, dinov2_vitl14, device):
    # crop_rgb: numpy array
    # temps = np.transpose(np.array(template_patches), (0,2,3,1))
    rgb_normalize = T.Compose(
        [
            T.ToTensor(),
            T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ]
    )
    normalized_patches = torch.stack([rgb_normalize(patch) for patch in template_patches])
    layers_list = list(range(18))
    torch.cuda.empty_cache()
    with torch.no_grad(): 
        feature_patches= dinov2_vitl14.module.get_intermediate_layers(normalized_patches.to(device), n=layers_list, reshape=True)
    return feature_patches

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
dinov2_vitl14.patch_size = 14
if torch.cuda.is_available():
    dinov2_vitl14 = torch.nn.DataParallel(dinov2_vitl14).to(device)  # Use DataParallel for multiple GPUs

Using cache found in /home/cuong.vandam/.cache/torch/hub/facebookresearch_dinov2_main


In [6]:
batch_size = 100
template_batches = [templates[i:i+batch_size] for i in range(0, len(templates), batch_size)]
patch_features= list()

for batch in template_batches:
    torch.cuda.empty_cache()
    batch_feature = patches_feature_extraction(batch, dinov2_vitl14, device)
    patch_features.append(batch_feature[0].to('cpu'))
    del batch_feature
    
patch_features = torch.cat(patch_features).permute(0,2,3,1).view(-1,30*30,1024)

In [7]:
del dinov2_vitl14

In [8]:
patch_features.shape

torch.Size([642, 900, 1024])

In [9]:
# Given only 30 first patches are valid
valid_patch_features = patch_features[:, :30,].reshape(-1,1024)
valid_patch_features.shape

torch.Size([19260, 1024])

In [20]:
# Number of valid patches on each template - here given 30 for each templates
num_valid_patches = [30]*642
print(len(num_valid_patches), num_valid_patches)

642 [30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,

In [10]:
import numpy as np
from sklearn.decomposition import PCA

# Apply PCA with the 256 components to reduce dimensionality of the features
pca = PCA(n_components=256)
pca_patches_descriptors = pca.fit_transform(np.array(valid_patch_features.cpu()))
pca_patches_descriptors.shape

(19260, 256)

# 2. Perform Kmean clustering for all patch descriptors from templates (2048 clusters)

In [19]:
# https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
import faiss
ncentroids = 2048
niter = 20
verbose = True
d = pca_patches_descriptors.shape[1]
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, gpu=True)
kmeans.train(pca_patches_descriptors)

Clustering 19260 points in 256D to 2048 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.00 s
  Iteration 19 (0.13 s, search 0.08 s): objective=4683.79 imbalance=1.247 nsplit=0       




4683.7861328125

In [25]:
# Assign labels to the data points
labels = kmeans.index.search(pca_patches_descriptors, 1)[1]
labels.shape

(19260, 1)

In [33]:
templates_labels = list()
start_idx = 0
for num in num_valid_patches:
    end_idx = start_idx + num
    template_labels = labels[start_idx:end_idx].reshape(-1)
    templates_labels.append(template_labels)
    start_idx = end_idx

In [41]:
templates_labels

[array([1403,   37, 1605, 1024, 1414, 1060, 1109,  507,  638, 1282, 1186,
        1527, 1851,  579,   74, 1105, 1586,  297, 1290, 1500, 1797,  181,
        1839, 1413, 1928, 1103,  402, 1245, 1892, 1962]),
 array([1115, 1396, 1380, 1719, 1171,  549, 1314, 1422, 1081,  270,   62,
         816, 1971, 1794, 1185,  718,  909,  522, 1915,  550,  989, 1356,
         516, 1924,  511,  418, 1227,  241, 1613, 1988]),
 array([ 204, 1396, 1605,  858, 1414, 1665,  997, 1807,  939, 1236,  609,
        1079, 1910, 1113, 1698, 1238,  821,  297, 1328,  680, 1891, 1443,
        1817, 1413,  613, 1672,  402, 1838,  154, 1962]),
 array([1403,   37, 1605,  465, 1592, 1060, 1109,  340,  638,  690, 1186,
        1527, 1851, 1792,   74, 1105, 1586,  297,  784, 1500, 1797,  244,
        1817, 1413, 1928, 1103,  402, 1245, 1892,  677]),
 array([1115, 1396, 1380, 1719, 1171,  549, 1314, 1422, 1669,  270,   62,
         932, 1971, 1794,  266,  718,  768,  385, 1915,  550,  989,  883,
         516, 1924,  264,  4

In [43]:
import math

def calculate_templates_vector(templates_labels, num_clusters = 2048):
    # Calculate bag-of-words descriptors of the templates

    templates_vector = list()
    all_occurrences = [np.bincount(templates_label, minlength=2048) for templates_label in templates_labels]
    ni_array = np.sum(np.array(all_occurrences), axis = 0)
    N = len(templates_labels) # Number of templates
    for t in range(len(templates_labels)):
        template_vector = list()
        occurrences = np.bincount(templates_labels[t], minlength=2048)
        for i in range(num_clusters):
            n_it = occurrences[i]
            nt = len(templates_labels[t])
            ni = ni_array[i]
            bi = n_it / nt * math.log(N / ni)
            template_vector.append(bi)
        templates_vector.append(np.array(template_vector))
    return templates_vector
templates_vector = calculate_templates_vector(templates_labels = templates_labels, num_clusters = 2048)

In [53]:
np.max(templates_vector[0])

0.19238137077100054

# 3. Retrieving similar templates

In [54]:
# Load image crop 
crop_rgb = np.array(Image.open("cnos_analysis/crop_proposals/crop1.png").convert("RGB").resize((420,420))) # (124, 157, 3)