# C1 W2 Group 8

In [None]:
from matplotlib import pyplot as plt
import numpy as np
from PIL import Image, ImageDraw
import pandas as pd

from src.data import GT_QSD1_W1_LIST
from src.paths import BBDD_PATH, WEEK_2_RESULTS_PATH, QSD1_W1_PATH
from src.descriptors import GreyScaleHistogramDescriptor1D, ColorHistogramDescriptor1D, MultiColorSpaceHistogramDescriptor1D, ColorHistogramDescriptor3D
from src.similarities import MSE, L1Distance, ChiSquaredDistance, HistogramIntersection, HellingerKernel, Bhattacharyya
from src.metrics import MeanAveragePrecisionAtK
from tqdm import tqdm
import datetime


## Task 1 - Implement 3D / 2D, block-based and hierarchical histograms

### Data processing: image loading, partitioning & descriptor computation

First, we compute the partitions of the images and the corresponding descriptors.

In [None]:
# Open every database image once, in sorted path order, so that a position in
# this list can serve as the image id in the rest of the notebook.
database_image_PIL_list = list(map(Image.open, sorted(BBDD_PATH.glob("*.jpg"))))

# Sanity check: the file opened at position `position` must be named "...<position>.jpg".
for position, opened_image in enumerate(database_image_PIL_list):
    expected_suffix = f"{position}.jpg"
    assert opened_image.filename.endswith(expected_suffix)

In [None]:
# Open every query image once, in sorted path order; the list position is the query id.
query_image_PIL_list = list(map(Image.open, sorted(QSD1_W1_PATH.glob("*.jpg"))))

# Sanity check: the file opened at position `position` must be named "...<position>.jpg".
for position, opened_image in enumerate(query_image_PIL_list):
    expected_suffix = f"{position}.jpg"
    assert opened_image.filename.endswith(expected_suffix)

In [None]:
def partition_image(image: "Image.Image", N: int) -> "list[Image.Image]":
    """Split ``image`` into an ``N x N`` grid of blocks.

    Blocks are returned in row-major order (left to right, then top to
    bottom), so the result always holds ``N * N`` crops.

    When a side of the image is not an exact multiple of ``N``, the last
    column / last row is stretched to the image border so that no pixel is
    left out of the partition. For sizes divisible by ``N`` every block has
    the same ``(width // N, height // N)`` size.

    Args:
        image: Image to split; must expose ``size`` as ``(width, height)``
            and ``crop(box)``.
        N: Number of blocks per side (not the total number of blocks).
            ``N = 1`` yields a single block covering the whole image.

    Returns:
        The ``N * N`` cropped blocks.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    blocks_per_side = int(N)
    if blocks_per_side < 1:
        raise ValueError(f"N must be >= 1, got {N!r}")

    img_width, img_height = image.size
    part_width = img_width // blocks_per_side
    part_height = img_height // blocks_per_side
    last = blocks_per_side - 1

    partitions = []
    for row in range(blocks_per_side):
        top = row * part_height
        # The last row absorbs the pixels lost by the integer division.
        bottom = img_height if row == last else top + part_height
        for col in range(blocks_per_side):
            left = col * part_width
            # Same for the last column.
            right = img_width if col == last else left + part_width
            partitions.append(image.crop((left, top, right, bottom)))

    return partitions

def plot_partitions(image, N):
    """Return a copy of ``image`` with the ``N x N`` partition grid drawn on it.

    The grid is drawn on a copy: ``ImageDraw`` paints in place, and the images
    passed here are the same objects the descriptors are computed from, so
    drawing on them directly would add white lines to the data itself.

    Args:
        image: PIL image to visualise.
        N: Number of blocks per side (not the total number of blocks).

    Returns:
        A new image with ``N - 1`` vertical and ``N - 1`` horizontal white
        lines, 5 pixels wide, at the block boundaries.
    """
    blocks_per_side = int(N)
    img_width, img_height = image.size
    part_width = img_width // blocks_per_side
    part_height = img_height // blocks_per_side

    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)

    # Vertical partition lines
    for i in range(1, blocks_per_side):
        x = i * part_width
        draw.line([(x, 0), (x, img_height)], fill="white", width=5)

    # Horizontal partition lines
    for i in range(1, blocks_per_side):
        y = i * part_height
        draw.line([(0, y), (img_width, y)], fill="white", width=5)

    return annotated

In [None]:
# Partition levels to evaluate; each level is passed to `partition_image` as N
# (level 1 keeps the whole image as a single block).
partition_levels = [1, 2, 3]

# partition level -> one list of blocks per database image.
partitioned_images_db: dict = {}

To make the execution faster we persist the partitions of the images for the next runs of the notebook.

In [None]:
for partition_level in partition_levels:
    partition_level_dir = WEEK_2_RESULTS_PATH / f"partitioned_db_images_level_{partition_level}"

    # `partition_image` produces an N x N grid and level 1 stores the whole image as
    # one block, so a complete cache holds exactly N * N blocks for every image.
    expected_blocks = partition_level ** 2
    block_paths_per_image = [
        [partition_level_dir / f"img_{img_idx}_block_{block_idx}.jpg" for block_idx in range(expected_blocks)]
        for img_idx in range(len(database_image_PIL_list))
    ]

    # Only trust the cache when every expected block is present. Checking just the
    # directory would accept a cache left half-written by an interrupted run and
    # yield images with missing (or zero) blocks.
    cache_is_complete = partition_level_dir.exists() and all(
        block_path.exists()
        for block_paths in block_paths_per_image
        for block_path in block_paths
    )

    if cache_is_complete:
        partitioned_images_db[partition_level] = [
            [Image.open(block_path) for block_path in block_paths]
            for block_paths in tqdm(block_paths_per_image, desc=f"Loading images at level {partition_level}")
        ]
        continue

    # Otherwise we compute and persist the partitions for next executions
    # (this also overwrites whatever a partial cache contained).
    partition_level_dir.mkdir(parents=True, exist_ok=True)

    if partition_level == 1:
        print("Partitioning at level 1")
        partitioned_images_db[partition_level] = [[img] for img in database_image_PIL_list]
    else:
        partitioned_images_db[partition_level] = [
            partition_image(img, partition_level)
            for img in tqdm(database_image_PIL_list, desc=f"Partitioning at level {partition_level}")
        ]

    # NOTE(review): blocks are cached as JPEG, so blocks reloaded on later runs have been
    # re-compressed and may differ slightly from the in-memory crops used on the first run.
    for img_idx, partitions in tqdm(enumerate(partitioned_images_db[partition_level]),
                                    total=len(partitioned_images_db[partition_level]),
                                    desc=f"Saving images at level {partition_level}"):
        for block_idx, block_img in enumerate(partitions):
            block_img.save(partition_level_dir / f"img_{img_idx}_block_{block_idx}.jpg")


In [None]:
# Descriptors evaluated in this run. The commented-out entries are alternative
# 3D colour histograms (RGB, YCbCr, LAB) kept as experiment toggles.
# NOTE(review): the positional `9` is presumably the number of bins per channel —
# confirm against src.descriptors.
descriptors = [
    MultiColorSpaceHistogramDescriptor1D(['HSV', 'LAB', 'YCbCr'], histogram_type='log-chromatic'),
    #ColorHistogramDescriptor3D("RGB", 9, histogram_type='log-chromatic'),
    ColorHistogramDescriptor3D("HSV", 9, histogram_type='log-chromatic'),
    #ColorHistogramDescriptor3D("YCbCr", 9, histogram_type='log-chromatic'),
    #ColorHistogramDescriptor3D("LAB", 9, histogram_type='log-chromatic'),
]

# Filled by the next cell: descriptor name -> partition level -> one concatenated
# histogram per database image.
partitioned_histograms_db = {}

In [None]:

for descriptor in descriptors:
    print("Descriptor: ", descriptor.name)
    histograms_per_level = {}
    partitioned_histograms_db[descriptor.name] = histograms_per_level

    for partition_level in partition_levels:
        print("Partition Level: ", partition_level)
        # One vector per database image: the histograms of its blocks, joined in block order.
        histograms_per_level[partition_level] = [
            np.concatenate([descriptor.compute(block) for block in image_blocks], axis=0)
            for image_blocks in tqdm(partitioned_images_db[partition_level])
        ]


Now we perform a similar process for the queries.

In [None]:
# partition level -> one list of blocks per query image.
partitioned_images_query = {}

for partition_level in partition_levels:
    partition_level_dir = WEEK_2_RESULTS_PATH / f"partitioned_query_images_level_{partition_level}"

    # `partition_image` produces an N x N grid and level 1 stores the whole image as
    # one block, so a complete cache holds exactly N * N blocks for every image.
    expected_blocks = partition_level ** 2
    block_paths_per_image = [
        [partition_level_dir / f"img_{img_idx}_block_{block_idx}.jpg" for block_idx in range(expected_blocks)]
        for img_idx in range(len(query_image_PIL_list))
    ]

    # Only trust the cache when every expected block is present. Checking just the
    # directory would accept a cache left half-written by an interrupted run and
    # yield images with missing (or zero) blocks.
    cache_is_complete = partition_level_dir.exists() and all(
        block_path.exists()
        for block_paths in block_paths_per_image
        for block_path in block_paths
    )

    if cache_is_complete:
        partitioned_images_query[partition_level] = [
            [Image.open(block_path) for block_path in block_paths]
            for block_paths in tqdm(block_paths_per_image, desc=f"Loading images at level {partition_level}")
        ]
        continue

    # Otherwise we compute and persist the partitions for next executions
    # (this also overwrites whatever a partial cache contained).
    partition_level_dir.mkdir(parents=True, exist_ok=True)

    if partition_level == 1:
        print("Partitioning at level 1")
        partitioned_images_query[partition_level] = [[img] for img in query_image_PIL_list]
    else:
        partitioned_images_query[partition_level] = [
            partition_image(img, partition_level)
            for img in tqdm(query_image_PIL_list, desc=f"Partitioning at level {partition_level}")
        ]

    # NOTE(review): blocks are cached as JPEG, so blocks reloaded on later runs have been
    # re-compressed and may differ slightly from the in-memory crops used on the first run.
    for img_idx, partitions in tqdm(enumerate(partitioned_images_query[partition_level]),
                                    total=len(partitioned_images_query[partition_level]),
                                    desc=f"Saving images at level {partition_level}"):
        for block_idx, block_img in enumerate(partitions):
            block_img.save(partition_level_dir / f"img_{img_idx}_block_{block_idx}.jpg")

In [None]:
# descriptor name -> partition level -> one concatenated histogram per query image.
partitioned_histograms_query = {}

for descriptor in descriptors:
    print("Descriptor: ", descriptor.name)
    histograms_per_level = {}
    partitioned_histograms_query[descriptor.name] = histograms_per_level

    for partition_level in partition_levels:
        print("Partition Level: ", partition_level)
        # One vector per query image: the histograms of its blocks, joined in block order.
        histograms_per_level[partition_level] = [
            np.concatenate([descriptor.compute(block) for block in image_blocks], axis=0)
            for image_blocks in tqdm(partitioned_images_query[partition_level])
        ]

### Concatenation strategies

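Now we build the descriptors for both strategies. For the block-based (BB) strategy we keep, for each partition level, the concatenation of the block histograms of an image. For the spatial pyramid representation (SPR) we concatenate those per-level histograms across all levels. We only do that if the histogram has one dimension.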

In [None]:
# SPR: descriptor name -> one vector per image (all partition levels concatenated).
histograms_SPR_DB, histograms_SPR_Q = {}, {}
# BB: descriptor name -> partition level -> one vector per image.
histograms_BB_DB, histograms_BB_Q = {}, {}

for descriptor_db, descriptor_q in zip(partitioned_histograms_db, partitioned_histograms_query):
    # Both dictionaries must list the descriptors in the same order.
    assert descriptor_db == descriptor_q

    levels_db = partitioned_histograms_db[descriptor_db]
    levels_q = partitioned_histograms_query[descriptor_db]

    # Block-based: the per-level histograms are used as they are.
    histograms_BB_DB[descriptor_db] = levels_db
    histograms_BB_Q[descriptor_db] = levels_q

    # Spatial pyramid: for each image, join its histograms from every level, in level order.
    histograms_SPR_DB[descriptor_db] = [
        np.concatenate([levels_db[level][img_idx] for level in partition_levels], axis=0)
        for img_idx in range(len(database_image_PIL_list))
    ]
    histograms_SPR_Q[descriptor_db] = [
        np.concatenate([levels_q[level][img_idx] for level in partition_levels], axis=0)
        for img_idx in range(len(query_image_PIL_list))
    ]


### Similarity computations

In [None]:
# Similarity measures evaluated in this run; each one is later called as
# `similarity.compute(query_descriptors, database_descriptors)`.
# The commented-out entries are alternatives left disabled.
similarity_classes = [
    #MSE(),
    #L1Distance(),
    #ChiSquaredDistance(),
    HistogramIntersection(),
    HellingerKernel(),
    Bhattacharyya()
]

In [None]:
# similarity name -> descriptor name -> {"BB": {level: scores}, "SPR": scores}
query_descriptor_distances_to_db_list = {}

for similarity in similarity_classes:
    similarity_name = similarity.__class__.__name__
    scores_per_descriptor = {}
    query_descriptor_distances_to_db_list[similarity_name] = scores_per_descriptor

    for descriptor in descriptors:
        descriptor_name = descriptor.name
        print(f"- {similarity_name} & {descriptor_name}")

        # SPR: a single comparison on the pyramid vectors.
        spr_scores = similarity.compute(
            np.array(histograms_SPR_Q[descriptor_name]),
            np.array(histograms_SPR_DB[descriptor_name]),
        )

        # BB: one comparison per partition level.
        bb_scores = {}
        for partition_level in partition_levels:
            level_db = np.array(histograms_BB_DB[descriptor_name][partition_level])
            level_query = np.array(histograms_BB_Q[descriptor_name][partition_level])
            bb_scores[partition_level] = similarity.compute(level_query, level_db)

        scores_per_descriptor[descriptor_name] = {"BB": bb_scores, "SPR": spr_scores}

In [None]:
def plot_distance_matrix(distance_matrix, title):
    """Display ``distance_matrix`` as a heat map with a colour bar.

    Rows are labelled as query images and columns as database images.

    Args:
        distance_matrix: 2-D array-like of scores to display.
        title: Title shown above the plot.
    """
    plt.figure(figsize=(10, 8))
    heatmap = plt.imshow(distance_matrix, cmap='viridis', aspect='auto')
    plt.colorbar(heatmap)
    plt.xlabel('Database Images')
    plt.ylabel('Query Images')
    plt.title(title)
    plt.show()

In [None]:
for similarity_name, descriptors_dict in query_descriptor_distances_to_db_list.items():
    for descriptor_name, data_dict in descriptors_dict.items():
        title_prefix = f"{similarity_name} - {descriptor_name}"

        # Spatial pyramid matrix first ...
        plot_distance_matrix(data_dict["SPR"], f"{title_prefix} - SPR Distance Matrix")

        # ... then one block-based matrix per partition level.
        for partition_level, distances in data_dict["BB"].items():
            plot_distance_matrix(
                distances,
                f"{title_prefix} - BB Distance Matrix (Partition Level {partition_level})",
            )

### Retrieval

In [None]:
def get_topk_distances(query_distances_to_bbdd: np.ndarray, k: int = 1) -> tuple[list[list], list[list]]:
    """Return, for every query, the ``k`` database entries with the lowest score.

    Scores are ranked in ascending order, i.e. they are treated as distances.
    NOTE(review): confirm that every similarity class used with this function
    returns distance-like values (lower = more similar); a raw similarity
    would need the opposite ordering.

    Ties are broken by the lowest database index (stable sort), so the
    ranking is deterministic.

    Args:
        query_distances_to_bbdd: 2-D array-like with one row per query and
            one column per database image. Nested lists are accepted.
        k: Number of results to keep per query. Values larger than the number
            of columns return every column.

    Returns:
        ``(indices, scores)`` as nested lists, both with one row per query:
        the database indices of the top-``k`` entries and their scores.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # `take_along_axis` needs a real ndarray, so convert array-likes up front.
    distances = np.asarray(query_distances_to_bbdd)

    retrieved_bbdd_indices = np.argsort(distances, axis=1, kind="stable")[:, :k]
    retrieved_bbdd_similarity = np.take_along_axis(distances, retrieved_bbdd_indices, axis=1)

    return retrieved_bbdd_indices.tolist(), retrieved_bbdd_similarity.tolist()

In [None]:
# Define k (number of top results to retrieve)
# Retrieval depth: number of top results kept per query.
k = 5

# similarity name -> descriptor name -> {"SPR": {...}, "BB": {level: {...}}},
# where each leaf holds the retrieved "indexes" and their "similarities".
retrieved_db = {}

for similarity_name, descriptors_dict in query_descriptor_distances_to_db_list.items():
    retrieved_db[similarity_name] = {}

    for descriptor_name, data_dict in descriptors_dict.items():
        print(similarity_name, descriptor_name)

        # Spatial pyramid ranking
        spr_indexes, spr_scores = get_topk_distances(data_dict["SPR"], k)
        print(f"Top-{k} for {similarity_name} - {descriptor_name} (SPR):")
        print(f"Indices: {spr_indexes}\n\n")

        # Block-based ranking, one per partition level
        bb_rankings = {}
        for partition_level, distances in data_dict["BB"].items():
            bb_indexes, bb_scores = get_topk_distances(distances, k)
            bb_rankings[partition_level] = {
                "indexes": bb_indexes,
                "similarities": bb_scores,
            }
            print(f"Top-{k} for {similarity_name} - {descriptor_name} (BB Level {partition_level}):")
            print(f"Indices: {bb_indexes}\n\n")

        retrieved_db[similarity_name][descriptor_name] = {
            "SPR": {"indexes": spr_indexes, "similarities": spr_scores},
            "BB": bb_rankings,
        }


In [None]:
# Evaluation metrics applied to the retrieved rankings.
metrics = [MeanAveragePrecisionAtK()]
# Cut-offs passed as the last argument of `metric.compute` in the evaluation cell.
# NOTE(review): the rankings above were retrieved with k = 5, so a cut-off larger
# than 5 would presumably not see any additional results — confirm before extending.
K = [1,5]

In [None]:
# One row per (cut-off, metric, similarity, descriptor, method).
results = []

# NOTE: the loop variable `k` rebinds the notebook-level `k` set in the retrieval cell.
for k in K:
    for metric in metrics:
        metric_name = metric.__class__.__name__
        for similarity in similarity_classes:
            similarity_name = similarity.__class__.__name__
            for descriptor in descriptors:
                descriptor_name = descriptor.name
                retrieved = retrieved_db[similarity_name][descriptor_name]

                # The SPR ranking first, then one BB ranking per partition level.
                rankings = [("SPR", retrieved["SPR"]["indexes"])]
                rankings.extend(
                    (f"BB at level {level}", retrieved["BB"][level]["indexes"])
                    for level in partition_levels
                )

                for method_name, indexes_retrieved in rankings:
                    score = metric.compute(GT_QSD1_W1_LIST, indexes_retrieved, k)
                    results.append({
                        "K": k,
                        "Metric": metric_name,
                        "Descriptor": descriptor_name,
                        "Similarity": similarity_name,
                        "Method": method_name,
                        "Result": round(score, 2),
                        "Indices": indexes_retrieved,
                    })

# Tabulate the results for easy analysis
results_df = pd.DataFrame(results)

# Drop the bulky / auxiliary columns; names that are not present are ignored
results_df_cleaned = results_df.drop(columns=["Indices", "Descriptor_id", "Similarity_id"], errors='ignore')

# Display the table as the cell output
results_df_cleaned

In [None]:
# Timestamp the file so repeated runs do not overwrite each other. The stamp is
# formatted explicitly because str(datetime) contains spaces and colons, and colons
# are not valid in Windows file names.
run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
results_df_cleaned.to_csv(f"out_{run_stamp}.csv")