# Product Recognition on Store Shelves

### Marco Scaramuzzi 
- Student ID: 0001057167
- email: marco.scaramuzzi@studio.unibo.it

## Task
Develop a computer vision system that, given a reference image for each product, is able to identify boxes of cereals of different brands from one picture of a store shelf. For each type of product displayed in the shelf the system should report:

1. Number of instances.
2. Dimension of each instance (width and height of the bounding box that enclose them in pixel).

#### Train on model image: {0.png, 1.png, 11.png, 19.png, 24.png, 25.png, 26.png}
#### Test on scene image: {e1.png, e2.png, e3.png, e4.png, e5.png}


In [3]:
#Import required modules
import numpy as np
from matplotlib import pyplot as plt
import cv2
import os
from typing import Tuple
from collections import defaultdict
from glob import glob

# Only for jupyter notebook visualization
%matplotlib inline


from utils import * 
from image_loading import *

%load_ext autoreload
%autoreload 2

# Enviromental variables

## Models

Loading model image

In [4]:
model_names = ["0.jpg", "1.jpg", "11.jpg", "19.jpg", "24.jpg", "25.jpg", "26.jpg"]
model_files = {name: cv2.imread(f"models/{name}") for name in model_names}


# Caricamento immagini modello
models_rgb, models_gray = load_model_images(model_files)

# Separazione dei canali (R, G, B)
red_models, green_models, blue_models = split_channels(models_rgb)

In [5]:
# show_images(models_rgb)

Computing model keypoints for each channel with SIFT

In [6]:
# 🟢 1️⃣ Inizializzazione di FLANN
def initialize_flann():
    """
    Inizializza il matcher FLANN con parametri predefiniti.

    Returns:
        cv2.FlannBasedMatcher: Istanza di FLANN Matcher.
    """
    index_params = dict(algorithm=1, trees=5)  # FLANN_INDEX_KDTREE = 1
    search_params = dict(checks=50)
    return cv2.FlannBasedMatcher(index_params, search_params)

def extract_sift_features(image_channels):
    """
    Estrae keypoints e descrittori SIFT per ogni canale dell'immagine.

    Parameters:
        image_channels (list): Lista di immagini (uno per ogni canale R, G, B).

    Returns:
        tuple: (Lista keypoints, Lista descrittori)
    """
    sift = cv2.SIFT_create(contrastThreshold=0.02, edgeThreshold=10)  # 🔹 Più keypoints

    keypoints, descriptors = [], []
    
    for channel in image_channels:
        kp, des = sift.detectAndCompute(channel, None)
        keypoints.append(kp)
        descriptors.append(des if des is not None else np.array([]))  # 🔹 Evita None

    return keypoints, descriptors


In [7]:
# Estrazione feature SIFT per ciascun canale

kp_model_red, des_model_red = extract_sift_features(red_models)
kp_model_green, des_model_green = extract_sift_features(green_models)
kp_model_blue, des_model_blue = extract_sift_features(blue_models)
# Calcolo colori di riferimento
# reference_colors = compute_reference_colors(models_rgb)

### Scene

In [8]:
scene_rgb, scene_gray = load_scene("m5", scene_dir="scenes/step_B/") # image m5.png

# Separazione canali scena
red_scene, green_scene, blue_scene = split_scene_channels(scene_rgb)

In [9]:
kp_scene_red, des_scene_red = extract_sift_features([red_scene])
kp_scene_green, des_scene_green = extract_sift_features([green_scene])
kp_scene_blue, des_scene_blue = extract_sift_features([blue_scene])


### Matching step

In [10]:
# Inizializzazione FLANN
flann = initialize_flann()

In [15]:
def match_keypoints(flann, model_descriptors, scene_descriptors):
    """
    Esegue il matching dei keypoints tra modello e scena per ciascun canale (R, G, B).

    Parameters:
        flann (cv2.FlannBasedMatcher): Matcher FLANN.
        model_descriptors (tuple): Tuple con descrittori dei modelli per (R, G, B).
        scene_descriptors (tuple): Tuple con i descrittori della scena per (R, G, B).

    Returns:
        list: Lista dei match combinati per ogni modello.
    """
    num_models = len(model_descriptors[0])  
    matches = [[] for _ in range(num_models)]

    for i in range(num_models):
        matches_red = []
        matches_green = []
        matches_blue = []

        for c, matches_channel in zip(range(3), [matches_red, matches_green, matches_blue]):
            d_model = model_descriptors[c][i]
            d_scene = scene_descriptors[c]

            # Controlliamo che i descrittori non siano None o vuoti
            if d_model is None or d_scene is None or len(d_model) == 0 or len(d_scene) == 0:
                print(f"⚠️ Warning: descrittori assenti per modello {i}, canale {c}. Skipping.")
                matches_channel.append([])
                continue

            # Conversione in float32 per OpenCV
            d_model = np.asarray(d_model, dtype=np.float32)
            d_scene = np.asarray(d_scene, dtype=np.float32)

            # Matching con FLANN
            matches_channel.extend(flann.knnMatch(d_model, d_scene, k=3))

        matches[i] = matches_red + matches_green + matches_blue  # Unione dei match dei tre canali

    return matches



def extract_matched_keypoints(matches, model_keypoints, scene_keypoints):
    """
    Estrae i keypoints corrispondenti dal modello e dalla scena.

    Parameters:
        matches (list): Lista dei match tra modello e scena.
        model_keypoints (tuple): Tuple con i keypoints del modello per (R, G, B).
        scene_keypoints (tuple): Tuple con i keypoints della scena per (R, G, B).

    Returns:
        tuple: (Keypoints del modello, Keypoints della scena)
    """
    num_models = len(matches)
    matched_model_pts = [[] for _ in range(num_models)]
    matched_scene_pts = [[] for _ in range(num_models)]

    for i in range(num_models):
        total_matches = [m for triplet in matches[i] for m in triplet]  # Flatten dei match dei 3 canali
        
        # Estrazione dei keypoints del modello (prendiamo 1 match ogni 3 per evitare duplicazioni)
        matched_model_pts[i] = np.array([(model_keypoints[0][i][match.queryIdx].pt,
                                          model_keypoints[0][i][match.queryIdx].size)
                                         for match in total_matches[::3]])  # Prende solo ogni 3 match

        # Estrazione dei keypoints della scena (prendiamo tutti i match)
        matched_scene_pts[i] = np.array([(scene_keypoints[match.trainIdx].pt,
                                          scene_keypoints[match.trainIdx].size)
                                         for match in total_matches])

    return matched_model_pts, matched_scene_pts


In [16]:

# 🟢 Esecuzione del codice
num_models = len(des_model_red)  # Ora il codice è scalabile!

# 1️⃣ Matching tra modello e scena
matches_list = match_keypoints(flann, 
                               (des_model_red, des_model_green, des_model_blue), 
                               (des_scene_red, des_scene_green, des_scene_blue))

# 2️⃣ Estrazione dei keypoints corrispondenti
src_pts_matches, dst_pts_matches = extract_matched_keypoints(matches_list, 
                                                             (kp_model_red, kp_model_green, kp_model_blue), 
                                                             (kp_scene_red, kp_scene_green, kp_scene_blue))

# 3️⃣ Stampa del numero di keypoints trovati per ogni modello
for i in range(num_models):
    print(f"Modello {i}: {len(src_pts_matches[i])} keypoints modello, {len(dst_pts_matches[i])} keypoints scena")


error: OpenCV(4.8.0) D:\a\opencv-python\opencv-python\opencv\modules\flann\src\miniflann.cpp:336: error: (-210:Unsupported format or combination of formats) in function 'cv::flann::buildIndex_'
> type=1021
> 

In [None]:

# 🟢 Esecuzione del codice
num_models = len(des_model_red)  # Ora il codice è scalabile!

# 1️⃣ Matching tra modello e scena
matches_list = match_keypoints(flann, 
                               (des_model_red, des_model_green, des_model_blue), 
                               (des_scene_red, des_scene_green, des_scene_blue))

# 2️⃣ Estrazione dei keypoints corrispondenti
src_pts_matches, dst_pts_matches = extract_matched_keypoints(matches_list, 
                                                             (kp_model_red, kp_model_green, kp_model_blue), 
                                                             (kp_scene_red, kp_scene_green, kp_scene_blue))

# 3️⃣ Stampa del numero di keypoints trovati per ogni modello
for i in range(num_models):
    print(f"Modello {i}: {len(src_pts_matches[i])} keypoints modello, {len(dst_pts_matches[i])} keypoints scena")


error: OpenCV(4.8.0) D:\a\opencv-python\opencv-python\opencv\modules\flann\src\miniflann.cpp:336: error: (-210:Unsupported format or combination of formats) in function 'cv::flann::buildIndex_'
> type=1021
> 

## GHT Step

In [None]:
def build_r_table(center, source_vectors):
    """
    Generates an R-Table for Generalized Hough Transform.
    
    Parameters:
        center (tuple): (x, y) coordinates of the centroid.
        source_vectors (numpy.ndarray): Array of shape (N, 2) containing keypoint positions of the models.

    Returns:
        defaultdict(list): R-Table with distances from keypoints to the centroid.
    """
    r_table = defaultdict(list)

    # Convert source_vectors to NumPy array for efficiency
    source_vectors = np.array(source_vectors)

    # Compute distance vectors in one vectorized operation
    delta_x = center[0] - source_vectors[:, 0]
    delta_y = center[1] - source_vectors[:, 1]

    # Stack results efficiently
    distances = np.column_stack((delta_x, delta_y, np.ones_like(delta_x)))  # Assume size = 1 for now

    # Store each keypoint's distances in the R-table
    for index, (dx, dy, size) in enumerate(distances):
        r_table[index].append((dx, dy, size))

    return r_table


def accumulate_votes(r_table, shelf_image, scene_keypoints):
    """
    Casts votes for barycentre position using the R-Table.
    
    Parameters:
        r_table (defaultdict(list)): The R-Table storing model keypoints' vectors.
        shelf_image (numpy.ndarray): The image of the shelf (used to get accumulator size).
        scene_keypoints (numpy.ndarray): Array of shape (N, 2) with detected keypoints in the scene.
    
    Returns:
        numpy.ndarray: The accumulator matrix where the most voted position is likely the barycentre.
    """
    # Initialize accumulator
    accumulator = np.zeros(shelf_image.shape[:2], dtype=np.int32)

    # Ensure keypoints are a NumPy array
    scene_keypoints = np.array(scene_keypoints)

    # Loop through each keypoint in the scene
    for idx, (scene_pos, scene_size) in enumerate(scene_keypoints):
        x_scene, y_scene = scene_pos

        # Find the corresponding model keypoint index
        model_index = idx // 3  # Assuming each model keypoint has 3 associated scene keypoints

        if model_index not in r_table:
            continue  # Skip if the model keypoint is not in the R-Table

        # Get the precomputed vectors from the R-Table
        for dx, dy, model_size in r_table[model_index]:
            scale_ratio = scene_size / model_size  # Scale factor
            x_accum = int(round(x_scene + scale_ratio * dx))
            y_accum = int(round(y_scene + scale_ratio * dy))

            # Ensure we don't go out of bounds
            if 0 <= x_accum < accumulator.shape[1] and 0 <= y_accum < accumulator.shape[0]:
                accumulator[y_accum, x_accum] += 1  # Increment vote at calculated barycentre position

    return accumulator



def plot_accumulator(accumulator, shelf_image):
    """
    Plots the accumulator on a white image and finds barycentre candidates.
    
    Parameters:
        accumulator (numpy.ndarray): The voting accumulator matrix.
        shelf_image (numpy.ndarray): The shelf image (used to get the shape).
    
    Returns:
        list: List of (x, y) barycentre candidate coordinates.
    """
    # Copy shelf image and fill with white
    height, width = accumulator.shape
    white_image = np.full_like(shelf_image, 255)  # Faster than .copy() + .fill(255)
    
    # Find all candidate barycentres (votes > 2)
    barycentres = np.argwhere(accumulator > 2)
    
    # Draw black points where votes > 0
    white_image[accumulator > 0] = 0
    
    # Draw diagonal cross around each barycentre
    radius = 10
    for y, x in barycentres:
        # Define safe bounding box using np.clip
        y_start, y_end = np.clip([y - radius, y + radius], 0, height - 1)
        x_start, x_end = np.clip([x - radius, x + radius], 0, width - 1)
        
        # Draw diagonal cross using slicing
        white_image[y_start:y_end, x_start:x_end] = 0
        white_image[y_start:y_end, x_end:x_start:-1] = 0  # Reverse diagonal
        
    # Plot the image
    plt.figure(figsize=(20, 10))
    plt.imshow(white_image, cmap='gray', vmin=0, vmax=255)
    plt.show()
    
    return barycentres.tolist()



In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict




In [None]:


# Costruzione della R-Table e accumulatore
r_table_list = [build_r_table((model.shape[0] // 2, model.shape[1] // 2), src_pts_matches[0][i])
                for i, model in enumerate(red_models)]

accumulator_list = [accumulate_kp(r_table_list[i], scene_gray, dst_pts_matches[0][i]) 
                    for i in range(len(red_models))]

# Identificazione dei baricentri
barycentres_list = detect_barycentres(accumulator_list, scene_gray)

print("Identificati baricentri:", barycentres_list)
