# C1 - Content Based Image Retrieval
### Team 8 - Week 3

In [31]:
import numpy as np
import cv2
import glob
import tqdm
import pickle
import os
import math
import matplotlib.pyplot as plt
import pytesseract
import re
from Levenshtein import distance as levenshtein_distance
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\Luis\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

In [2]:
name_bag = set()
for folder in ['BBDD', 'qsd1_w3', 'qsd2_w3']:
    for text_file in glob.glob(f'data/{folder}/*.txt'):
        # read text file
        with open(text_file, 'r') as f:
            line = f.readlines()
        
        for l in line:
            if re.search(r"\('([^']+)'", l.split(',')[0]):
                author = re.search(r"\('([^']+)'", l.split(',')[0]).group(1)
                name_bag.add(author)
            else:
                name_bag.add('Unknown')

In [43]:
class DataLoader():
    def __init__(self, folder_path):
        self.folder_path = folder_path

    # Obtain the painting image removing the background. 
    # It returns the mask where 1 means painting image and 0 background.
    def get_mask(self, img, mode, threshold_area=71000):

        # Transforming color image to grayscale depending on the color space
        if mode == 'rgb':
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        elif mode == 'hsv':
            gray = img[:,:,2].copy()
        else:
            gray = img[:,:,0].copy()

        # Empty mask definition
        mask = np.zeros(gray.shape, dtype=np.uint8)

        # Applying gaussian blurring and define an intelligent gradient threshold depending on 13x13 boxes
        blur = cv2.GaussianBlur(gray, (13,13), 0)
        # Threshold based on local pixel neighborhood (11x11 block size)
        thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

        # Two pass dilate with horizontal and vertical kernel
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,5))
        dilate = cv2.dilate(thresh, horizontal_kernel, iterations=2)
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,1))
        dilate = cv2.dilate(dilate, vertical_kernel, iterations=2)

        # Find contours, filter using contour threshold area, and draw rectangle
        cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
        '''
        cv2.RETR_EXTERNAL retrieves only the extreme outer contours.
        cv2.CHAIN_APPROX_SIMPLE compresses horizontal, vertical, and diagonal segments and leaves only their end points.
        Follows Satoshi Suzuki's algorithm:
            1- The algorithm works by border following, which means it traces the boundary of connected components in the image.
            2- It starts from the top-left corner of the image and looks for the first white pixel. Once found, it begins to follow the contour border.
            3- While following the contour, the algorithm keeps track of the direction in which it is moving to ensure it stays on the boundary.
            4- Once the entire contour is followed and the start point is reached again, the algorithm continues scanning the image for the next contour.
        '''

        #  Filtering the found contours by size
        counter = 0
        areas = []
        coordinates = []
        for c in cnts:
            # Shoelace formula for convex shapes
            area = cv2.contourArea(c) 
            if area > threshold_area:
                x,y,w,h = cv2.boundingRect(c) 
                areas.append((area, (x,y,w,h)))
                counter += 1

        # Sort areas and positions by area
        areas = sorted(areas, key=lambda x: x[0], reverse=True)

        # Draw bounding box on mask
        for i in range(min(len(areas), 2)):
            x,y,w,h = areas[i][1]
            coordinates.append((x,y,w,h))
            mask[y:y+h, x:x+w] = 255
        
        # Catching the 0 contours error
        if counter == 0:
            print('Error! No paintings in this image!')
            plt.imshow(img)
            plt.show()
            plt.imshow(mask, cmap='gray')
            plt.show()

        return mask, coordinates
    
    # Obtain the painting image removing the text. 
    # It returns the mask where 1 means painting image and 0 text.
    def get_mask_text(self, img, return_text=False):

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Applying opening, closing and dilation morphological operations with a 9x9 and 13x13 kernel
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
        opening = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
        closing = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)

        x = closing-opening
        x = (x>125).astype(np.uint8) #thresholding to get (hopefully) only the text

        kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (13,13))
        dilated = cv2.dilate(x, kernel2, iterations=2)

        # Find contours 
        ctns = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Find rectangular shapes and get the biggest one
        areas = []
        for c in ctns[0]:
            x,y,w,h = cv2.boundingRect(c)
            if w > h and w/h < 12 and (w*h)/(img.shape[0]*img.shape[1]) < 0.35:
                # Shoelace formula for convex shapes
                areas.append((cv2.contourArea(c), (x,y,w,h)))
        areas = sorted(areas, key=lambda x: x[0], reverse=True)
        x, y, w, h = areas[0][1]


        # If there is a shape on the right or left of the biggest shape found, we join it to the mask
        for _, shape in areas:
            if y > shape[1]-10 and y < shape[1]+10:
                if shape[0] < x:
                    w = (x+w) - shape[0]
                    x = shape[0]
                else:
                    w = (shape[0]+shape[2]) - x

        text = None
        if return_text:
            binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
            text = pytesseract.image_to_string(binary[y:y+h, x:x+w])
            text = re.sub(r'[0-9\n¥“«!|]', '', text)

            min_dist = 1000000
            for name in name_bag:
                dist = levenshtein_distance(text, name)
                if dist < min_dist:
                    min_dist = dist
                    min_word = name

            text = min_word
            # print(text, min_dist)
        
        return [x, y, x+w, y+h, text]

    # Divide the image into blocks
    def create_blocks_array(self, image, blockNumber):
    
        # Set number of slices per axis
        axisSlice = int(math.sqrt(blockNumber))

        blocksArray = []
        # Split the image into vertical blocks
        split_h = np.array_split(image, axisSlice, axis = 0)
        
        for i in range(axisSlice):
            for j in range(axisSlice):
                # Split vertical blocks into square blocks
                split_hv = np.array_split(split_h[i], axisSlice, axis = 1)
                blocksArray.append(split_hv[j])
        return blocksArray

    # Compute the histogram of the image
    def create_histogram(self, block, mask, d_hist, mode, bins):
        channels = cv2.split(block)

        if mode == 'lab' and d_hist < 3: channels = channels[1:]
        
        range_a, range_b = 256, 256
        if mode == 'hsv': range_a = 180

        if d_hist == 1:
            if mask is None:
                # Compute 1D histograms for each channel separately
                hist = [cv2.calcHist([chan], [0], None, [bins], [0, range_a if i == 0 else range_b]) for i,chan in enumerate(channels)]
            else:
                # Compute 1D histograms for each channel separately
                hist = [cv2.calcHist([chan[mask!=0]], [0], None, [bins], [0, range_a if i == 0 else range_b]) for i,chan in enumerate(channels)]

        elif d_hist == 2:
            if mask is None:
                # Compute 2D joint histograms for each pair of channels
                hist = [cv2.calcHist([channels[i], channels[j]], [0, 1], None, [bins, bins], [0, range_a if i == 0 else range_b, 0, range_b])
                            for i in range(len(channels)) for j in range(i+1, len(channels))]
            else:
                # Compute 2D joint histograms for each pair of channels
                hist = [cv2.calcHist([channels[i][mask!=0], channels[j][mask!=0]], [0, 1], None, [bins, bins], [0, range_a if i == 0 else range_b, 0, range_b])
                            for i in range(len(channels)) for j in range(i+1, len(channels))]

        else:
            if mask is None:
                # Compute 3D joint histogram for all three channels
                hist, _ = np.histogramdd([c.flatten() for c in channels], bins=(bins, bins, bins), range=[(0, range_a), (0, range_b), (0, range_b)])
            else:
                # Compute 3D joint histogram for all three channels
                hist, _ = np.histogramdd([c[mask != 0] for c in channels], bins=(bins, bins, bins), range=[(0, range_a), (0, range_b), (0, range_b)])

        return hist
    
    # Compute the histogram of the image by blocks
    def get_features_by_blocks(self, image, level, mode, d_hist, bins, mask_text):

        # Get blocks using multi-level resolution
        blocksArray = []
        for lvl in range(level+1):
            for b in self.create_blocks_array(image, (2**lvl)*(2**lvl)):
                blocksArray.append(b)

        if mask_text is not None:
            blocksMasks = []

            # We create a mask image blocking the bbox of the text
            # That image will be used to compute the histogram of the image without the text
            mask_text_image = np.ones(image.shape[:2], dtype=np.uint8)
            assert len(mask_text_image.shape) == 2, 'Mask must be a grayscale image'

            mask_text_image[mask_text[1]:mask_text[3], mask_text[0]:mask_text[2]] = 0

            # It is necessary to create the blocks of the mask image too
            for lvl in range(level+1):
                for b in self.create_blocks_array(mask_text_image, (2**lvl)*(2**lvl)):
                    blocksMasks.append(b)
        else:
            blocksMasks = [None]*len(blocksArray)

        histograms = []
        for block, mask_text_block in zip(blocksArray, blocksMasks):
            # Compute the histogram of the channel and append it to the list
            hist = self.create_histogram(block, mask_text_block, d_hist, mode, bins)
            if isinstance(hist, list):
                for h in hist:
                    histograms.append(h.flatten() / (block.shape[0]*block.shape[1]))
            else:
                histograms.append(hist.flatten()  / (block.shape[0]*block.shape[1]))
            
        # Concatenate all histograms into a single feature vector
        return np.concatenate(histograms)

    def clean_noise(self, image, k):
        return cv2.medianBlur(image, k)
    
    # Load data, calculate background and text masks (if necessary) and compute features
    def load_data(self, level = 3, d_hist = 1, bins = 64, remove_background=False, remove_text=False, return_text=False, features_mode='color_features'):
        # Get a list of all image file names in the folder
        image_files = sorted(glob.glob(self.folder_path+'/*.jpg'))

        # Initialize an empty list to store the processed images and masks
        processed_features_rgb = dict()
        processed_features_hsv = dict()
        processed_features_lab = dict()
        masks, masks_text = [], []

        # Iterate over each image file
        for f in tqdm.tqdm(image_files):
            
            # Get the image id from the file name. Depending on the OS, the path separator is different
            try:
                img_id = int(f.split('\\')[-1].split('.')[0].split('_')[-1])
            except:
                img_id = int(f.split('/')[-1].split('.')[0].split('_')[-1])

            # Load the image
            image = cv2.imread(f)
            # Convert the image from BGR to lab color space
            image_lab = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
            # Convert the image from BGR to HSV color space
            image_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

            image, image_lab, image_hsv = self.clean_noise(image, k=3), self.clean_noise(image_lab, k=3), self.clean_noise(image_hsv, k=3)

            # Remove background (there can be 2 paintings in the same image)
            if remove_background:
                mask_image, coordinates = self.get_mask(image, 'rgb')
               
                # Remove the text from each image
                if remove_text:
                    coordinates = sorted(coordinates, key=lambda x: (x[0], x[1]))
                    masks_text_i = [[None]]*len(coordinates)

                    # coordinates contains the coordinates of the paintings mask in the image
                    # We iterate over each masked painting and get the text mask for each one
                    # We hace to recover the original coordinates for the text mask
                    for i, (x,y,w,h) in enumerate(coordinates):
                        x_text, y_text, w_text, h_text, text = self.get_mask_text(image[y:y+h, x:x+w], return_text=return_text)
                        masks_text_i[i] = [x+x_text, y+y_text, x+x_text+w_text, y+y_text+h_text]
                    masks_text.append(masks_text_i)
                else:
                    for i in range(len(coordinates)):
                        masks_text.append([None])

            else:
                mask_image = None

                # if there is no background, the mask is the whole image
                coordinates = [[0,0,image.shape[1],image.shape[0]]] 
                if remove_text:
                    x_text, y_text, w_text, h_text, text = self.get_mask_text(image, return_text=return_text)
                    masks_text.append([[x_text, y_text, w_text, h_text]])
                else:
                    masks_text.append([None])

            masks.append(mask_image)
            
            features, features_hsv, features_lab = [], [], []
            for i in range(len(coordinates)):
                x,y,w,h = coordinates[i]

                relative_mask_text = None
                if masks_text[-1][i] is not None:
                    relative_mask_text = [masks_text[-1][i][0]-x, masks_text[-1][i][1]-y, masks_text[-1][i][2]-x, masks_text[-1][i][3]-y]

                if features_mode == 'color_features':    
                    # Get the features of every masked image
                    f = self.get_features_by_blocks(image[y:y+h, x:x+w], level, 'rgb', d_hist, bins, mask_text=relative_mask_text)
                    f_hsv = self.get_features_by_blocks(image_hsv[y:y+h, x:x+w], level, 'hsv', d_hist, bins, mask_text=relative_mask_text)
                    f_lab = self.get_features_by_blocks(image_lab[y:y+h, x:x+w], level, 'lab', d_hist, bins, mask_text=relative_mask_text)

                elif features_mode == 'texture_features':
                    ...
                
                elif features_mode == 'text_features':
                    f, f_hsv, f_lab = text, text, text

                features.append(f)
                features_hsv.append(f_hsv)
                features_lab.append(f_lab)

            # Append the features to the dict
            processed_features_rgb[img_id] = features
            processed_features_hsv[img_id] = features_hsv
            processed_features_lab[img_id] = features_lab
            
        return processed_features_rgb, processed_features_hsv, processed_features_lab, masks, masks_text
        

In [4]:
# Copied from https://github.com/benhamner/Metrics -> Metrics.Python.ml_metrics.average_precision.py
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

# Copied from https://github.com/benhamner/Metrics -> Metrics.Python.ml_metrics.average_precision.py
def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    result = []
    for a,p in zip(actual, predicted):
        for a_i, p_i in zip(a,p):
            result.append(apk([a_i],p_i,k))
    return np.mean(result)

# compute the histogram intersection between two feature vectors
def histogram_intersection(hist1, hist2):
    return np.sum(np.minimum(hist1, hist2))

# compute the euclidian distance between two feature vectors
def euclidian_distance(hist1, hist2):
    return np.sqrt(np.sum(np.square(hist1 - hist2)))

# compute the chi-squared distance between two feature vectors
def chi_squared_distance(hist1, hist2):
    return np.sum(np.square(hist1 - hist2) / (hist1 + hist2 + 1e-10))

# compute the bhattacharyya distance between two feature vectors
def bhattacharyya_distance(hist1, hist2):
    # Ensure that both histograms have the same shape
    assert hist1.shape == hist2.shape, "Histograms must have the same shape"
    # Calculate the Bhattacharyya coefficient
    bhattacharyya_coeff = np.sum(np.sqrt(hist1 * hist2))
    # Calculate the Bhattacharyya distance
    bhattacharyya_distance = -np.log(bhattacharyya_coeff)
    return bhattacharyya_distance

# compute the Helling distance (Hellinger kernel) between two feature vectors
def hellinger_kernel(hist1, hist2):
    return np.sum(np.sqrt(hist1*hist2))

# based on the solution in https://stackoverflow.com/questions/25349178/calculating-percentage-of-bounding-box-overlap-for-image-detector-evaluation by @Martin Thoma
def get_iou(bb1, bb2):
    """
    Calculate the Intersection over Union (IoU) of two bounding boxes.

    Parameters
    ----------
    bb1 : list ['x1', 'x2', 'y1', 'y2']
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner
    bb2 : list ['x1', 'x2', 'y1', 'y2']
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner

    Returns
    -------
    float
        in [0, 1]
    """

    # determine the coordinates of the intersection rectangle
    x_left = max(bb1[0], bb2[0])
    y_top = max(bb1[1], bb2[1])
    x_right = min(bb1[2], bb2[2])
    y_bottom = min(bb1[3], bb2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The intersection of two axis-aligned bounding boxes is always an
    # axis-aligned bounding box
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    # compute the area of both AABBs
    bb1_area = (bb1[2] - bb1[0]) * (bb1[3] - bb1[1])
    bb2_area = (bb2[2] - bb2[0]) * (bb2[3] - bb2[1])

    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
    assert iou >= 0.0
    assert iou <= 1.0
    return iou

In [48]:
# Obtain the closest k DDBB image for query images determined by the similarity function. 
# The features have been previously calculated from the developed method.
# It returns a list of lists with the k closest images for each query image. 
def compare_images(query_features, bbdd_features, k, sim_func):
    
    result = []
    for id1,f1 in query_features.items():
        result_i = []
        for i,f_i in enumerate(f1):
            distances = []
            for id2,f2 in bbdd_features.items():
                distances.append((id2, sim_func(f_i,f2)))
                #get k smallest values from distances
                
            if sim_func in [euclidian_distance, chi_squared_distance, bhattacharyya_distance, levenshtein_distance]:
                k_smallest = sorted(distances, reverse=False, key=lambda x: x[1])[:k]
            else:
                k_smallest = sorted(distances, reverse=True, key=lambda x: x[1])[:k]
            result_i.append((id1, k_smallest))
            
        result.append(result_i)
        
    result2 = []
    for x in result:
        result2_i = []
        for y in x:
            result2_i.append([z[0] for z in y[1]])
        result2.append(result2_i)
    
    return result2

# Calculate Intersection over Union (IoU) for each query image and plot the results (if necessary).
def calculate_iou(gt_bboxes, pred_bboxes, do_plot=False):
    plot_rows, plot_cols = 4, 4

    # Plot config
    if do_plot:
        fig, ax = plt.subplots(nrows=plot_rows, ncols=plot_cols, figsize=(20, 20))

    # Get mean IoU score
    results = []
    for i, (q_gt, q_pred) in enumerate(zip(gt_bboxes, pred_bboxes)):
        for bbox_gt, bbox_pred in zip(q_gt, q_pred):
            bbox_gt_new = [bbox_gt[0][0], bbox_gt[0][1], bbox_gt[2][0], bbox_gt[2][1]]
            results.append(get_iou(bbox_pred, bbox_gt_new))

        if i < plot_rows*plot_cols and do_plot:
            im = cv2.imread(f"data/qsd1_w2/{str(i).zfill(5)}.jpg")
            plt_idx = np.unravel_index(i, (plot_rows, plot_cols))
            ax[plt_idx].imshow(im)
            ax[plt_idx].add_patch(plt.Rectangle(
                (bbox_gt_new[0], bbox_gt_new[1]), 
                bbox_gt_new[2]- bbox_gt_new[0],
                bbox_gt_new[3]- bbox_gt_new[1],
                edgecolor="green", facecolor="none", lw=2))
            ax[plt_idx].add_patch(plt.Rectangle(
                (bbox_pred[0], bbox_pred[1]), 
                bbox_pred[2]- bbox_pred[0],
                bbox_pred[3]- bbox_pred[1],
                edgecolor="red", facecolor="none", lw=2))
    if do_plot: plt.show()

    return np.mean(results)

In [44]:
# Create DataLoader objects for both the database and the queries
data_loader = DataLoader('data/BBDD')
data_loader_qsd1_w3 = DataLoader('data/qsd1_w3')
data_loader_qsd2_w3 = DataLoader('data/qsd2_w3')

# Load ground truth files for each query
with open('data/qsd1_w3/gt_corresps.pkl', 'rb') as f:
    gt_w3_1 = pickle.load(f)

with open('data/qsd2_w3/gt_corresps.pkl', 'rb') as f:
    gt_w3_2 = pickle.load(f)

# Load ground truths for the text bboxes in qsd1_w3
with open('data/qsd1_w3/text_boxes.pkl', 'rb') as f:
    bboxes_gt_d1w3 = pickle.load(f)

# Load ground truths for the text bboxes in qsd2_w3
with open('data/qsd2_w3/text_boxes.pkl', 'rb') as f:
    bboxes_gt_d2w3 = pickle.load(f)

## Validation results

### Task 2:

In [51]:
# Define best hyperparameters configuration
features_mode = 'text_features'
k = 5

# Calculate and store the features
bbdd_text = dict()
for i, text_file in enumerate(glob.glob(f'data/BBDD/*.txt')):
    # read text file
    with open(text_file, 'r') as f:
        line = f.readlines()
    
    for l in line:
        if re.search(r"\('([^']+)'", l.split(',')[0]):
            author = re.search(r"\('([^']+)'", l.split(',')[0]).group(1)
            bbdd_text[i] = author

query_text, _ , _, _, _ = data_loader_qsd1_w3.load_data(remove_text=True, return_text=True, features_mode=features_mode)

results = compare_images(query_text, bbdd_text, k, levenshtein_distance)
mapk_1 = mapk(gt_w3_1, results, k)
print(f'MAP@{k} for qsd1_w3: {mapk_1}')    
        

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:16<00:00,  1.80it/s]

MAP@5 for qsd1_w3: 0.46222222222222226





In [79]:
# THE SYSTEM IS ALMOST PERFECT BUT ONLY WITH THE AUTHOR NAME THERE IS AMBIGUITY
lines = []
for i in gt_w3_1:
    with open(f'data/BBDD/bbdd_{str(i[0]).zfill(5)}.txt', 'r') as f:
        line = f.readlines()
        lines.append(line[0].split(',')[0])

for w1,w2 in zip(lines, query_text.values()):
    print(w2, '->', w1)

['Modest Cuixart'] -> ('Modest Cuixart'
['Joan Ponc'] -> ('Joan Ponc'
['Per Krohg'] -> ('Per Krohg'
['Pere Santilari'] -> ('Pere Santilari'
['Unknown'] -> ('Antoni Clave'
['Agusti Puig'] -> ('Agusti Puig'
['Edvard Munch'] -> ('Edvard Munch'
['Jose M. Codina'] -> ('Jose M. Codina'
['Qi Hao'] -> ('Joan Hernandez Pijuan'
['Alfred Figueras'] -> ('Alfred Figueras'
['Francesc Artigau'] -> ('Francesc Artigau'
['Gerard Sala'] -> ('Gerard Sala'
['Antoni Llena'] -> ('Antoni Llena'
['Pere Santilari'] -> ('Pere Santilari'
['Joan Ponc'] -> ('Joan Ponc'
['Josep Guinovart'] -> ('Josep Guinovart'
['Joan Ponc'] -> ('Joan Ponc'
['Anders Svarstad'] -> ('Anders Svarstad'
['Qi Hao'] -> ('Per Krohg'
['Joan Pere Viladecans'] -> ('Joan Pere Viladecans'
['Qi Hao'] -> ('Leticia Feduchi'
['Qi Hao'] -> ('Sergi Barnils'
['Xevi Vilaro'] -> ('Xevi Vilaro'
['Yago Hortal'] -> ('Yago Hortal'
['Sergi Barnils'] -> ('Sergi Barnils'
['Joan Pere Viladecans'] -> ('Joan Pere Viladecans'
['Josep Cisquella'] -> ('Josep Cisquell

In [81]:
k = 5
features_mode = 'color_features'
# Compute features for the database and the query images
features_rgb, features_hsv, features_lab, _, _ = data_loader.load_data(features_mode=features_mode, remove_background=False, level=2, d_hist=2, bins=8)
features_rgb_q1_w2, features_hsv_q1_w2, features_lab_q1_w2, _, _ = data_loader_qsd1_w3.load_data(features_mode=features_mode, remove_background=False, remove_text=True, level=2, d_hist=2, bins=8)

# Query 1: Results and mAP@k
for sim_func in [chi_squared_distance, histogram_intersection]:

    results_rgb_q1_w3 = compare_images(features_rgb_q1_w2, features_rgb, k, sim_func)
    results_hsv_q1_w3 = compare_images(features_hsv_q1_w2, features_hsv, k, sim_func)
    results_lab_q1_w3 = compare_images(features_lab_q1_w2, features_lab, k, sim_func)

    mapk_rgb_1 = mapk(gt_w3_1, results_rgb_q1_w3, k)
    mapk_hsv_1 = mapk(gt_w3_1, results_hsv_q1_w3, k)
    mapk_lab_1 = mapk(gt_w3_1, results_lab_q1_w3, k)

    print(f'RGB, {sim_func.__name__} = \tmAP@1: {mapk_rgb_1}')
    print(f'HSV, {sim_func.__name__} = \tmAP@1: {mapk_hsv_1}')
    print(f'LAB, {sim_func.__name__} = \tmAP@1: {mapk_lab_1}')

RGB, chi_squared_distance = 	mAP@1: 0.7777777777777778
HSV, chi_squared_distance = 	mAP@1: 0.7833333333333333
LAB, chi_squared_distance = 	mAP@1: 0.525
RGB, histogram_intersection = 	mAP@1: 0.7583333333333333
HSV, histogram_intersection = 	mAP@1: 0.7916666666666666
LAB, histogram_intersection = 	mAP@1: 0.4888888888888889


### Task 3:

- Grayscale image
- [Closing - opening] filter, both using 9x9 structuring element
- Binarization 
- Two dilations using 13x13 structuring element
- Bounding boxes extraction using _findContours_
- Biggest contour represents one of the text words. We add the bounding boxes with similar height located on the right or left to the main one (if necessary).

In [8]:
# Create text bounding boxes
_, _, _, _, masks_text = data_loader_qsd1_w2.load_data(remove_background=False, remove_text=True, level=level, d_hist=d_hist, bins=bins)

100%|██████████| 30/30 [00:01<00:00, 17.18it/s]


### Task 4:

In [9]:
iou = calculate_iou(bboxes_gt_d1w2, masks_text)
        
print("Mean IoU:", iou)

Mean IoU: 0.45547641682825424


### Task 5:

In [10]:
# Compute features for the database and the query images
features_rgb, features_hsv, features_lab, _, _ = data_loader.load_data(remove_background=False, level=level, d_hist=d_hist, bins=bins)
features_rgb_q1_w2, features_hsv_q1_w2, features_lab_q1_w2, _, _ = data_loader_qsd1_w2.load_data(remove_background=False, remove_text=True, level=level, d_hist=d_hist, bins=bins)

# Query 1: Results and mAP@k
for sim_func in [chi_squared_distance, histogram_intersection]:

    results_rgb_q1_w2 = compare_images(features_rgb_q1_w2, features_rgb, k, sim_func)
    results_hsv_q1_w2 = compare_images(features_hsv_q1_w2, features_hsv, k, sim_func)
    results_lab_q1_w2 = compare_images(features_lab_q1_w2, features_lab, k, sim_func)

    mapk_rgb_1 = mapk(gt_w2_1, results_rgb_q1_w2, k)
    mapk_hsv_1 = mapk(gt_w2_1, results_hsv_q1_w2, k)
    mapk_lab_1 = mapk(gt_w2_1, results_lab_q1_w2, k)

    print(f'RGB, {sim_func.__name__} = \tmAP@1: {mapk_rgb_1}')
    print(f'HSV, {sim_func.__name__} = \tmAP@1: {mapk_hsv_1}')
    print(f'LAB, {sim_func.__name__} = \tmAP@1: {mapk_lab_1}')



100%|██████████| 287/287 [00:28<00:00, 10.11it/s]
100%|██████████| 30/30 [00:01<00:00, 16.81it/s]


RGB, chi_squared_distance = 	mAP@1: 0.4666666666666667
HSV, chi_squared_distance = 	mAP@1: 0.5
LAB, chi_squared_distance = 	mAP@1: 0.4
RGB, histogram_intersection = 	mAP@1: 0.4666666666666667
HSV, histogram_intersection = 	mAP@1: 0.4666666666666667
LAB, histogram_intersection = 	mAP@1: 0.36666666666666664


### Task 6:

- Grayscale image
- Apply gaussian blur
- Binarization using intelligent threshold based on local pixel neighborhood (11x11 block size)
- Twice vertical and horizontal dilations, 5x1 and 1x5 structuring element respectively
- Bounding boxes straction using _findContours_

In [13]:
# Compute features for the database and the query images
features_rgb, features_hsv, features_lab, _, _ = data_loader.load_data(remove_background=False, level=level, d_hist=d_hist, bins=bins)
features_rgb_q2_w2, features_hsv_q2_w2, features_lab_q2_w2, _, _ = data_loader_qsd2_w2.load_data(remove_background=True, remove_text=True, level=level, d_hist=d_hist, bins=bins)

# Query 1: Results and mAP@k
for sim_func in [chi_squared_distance, histogram_intersection]:

    results_rgb_q1_w2 = compare_images(features_rgb_q2_w2, features_rgb, k, chi_squared_distance)
    results_hsv_q1_w2 = compare_images(features_hsv_q2_w2, features_hsv, k, chi_squared_distance)
    results_lab_q1_w2 = compare_images(features_lab_q2_w2, features_lab, k, chi_squared_distance)

    mapk_rgb_1 = mapk(gt_w2_2, results_rgb_q1_w2, k)
    mapk_hsv_1 = mapk(gt_w2_2, results_hsv_q1_w2, k)
    mapk_lab_1 = mapk(gt_w2_2, results_lab_q1_w2, k)

    print(f'RGB, {sim_func.__name__} = \tmAP@1: {mapk_rgb_1}')
    print(f'HSV, {sim_func.__name__} = \tmAP@1: {mapk_hsv_1}')
    print(f'LAB, {sim_func.__name__} = \tmAP@1: {mapk_lab_1}')

100%|██████████| 287/287 [00:28<00:00, 10.00it/s]
100%|██████████| 30/30 [00:05<00:00,  5.80it/s]


RGB, chi_squared_distance = 	mAP@1: 0.5740740740740741
HSV, chi_squared_distance = 	mAP@1: 0.5740740740740741
LAB, chi_squared_distance = 	mAP@1: 0.4074074074074074
RGB, histogram_intersection = 	mAP@1: 0.5740740740740741
HSV, histogram_intersection = 	mAP@1: 0.5740740740740741
LAB, histogram_intersection = 	mAP@1: 0.4074074074074074
