In [1]:
from qdrant_client import QdrantClient, models
from qdrant_client.models import PointStruct
from sklearn.metrics import ndcg_score
import re
import numpy as np
import pandas as pd
import json
import uuid
import itertools
import time
import os
import psutil

## Loading embeddings

In [2]:
EMBEDDINGS_FILE = "dino_small_embeddings.txt"

In [3]:
def read_embeddings_txt(path: str, outfits: bool = True):
    """
    Reads embeddings from a .txt file into a Python list of dictionaries.
    There are two modes for parsing the data: one for outfit embeddings
    (which include `outfit_id` and `cloth_id`), and another for golden_set
    embeddings (which only have `cloth_id`). This separation is necessary
    due to differences in the file format's first column (identifier).

    Args:
        path (str): The file path to the embeddings `.txt` file.
                    Expected format for `outfits=True`: "cloth<outfit_id>_<cloth_id> <embedding_values>..."
                    Expected format for `outfits=False`: "<cloth_id>.jpg <embedding_values>..."
        outfits (bool): If True, the function expects and parses the 'outfits' format,
                        extracting both `outfit_id` and `cloth_id`.
                        If False, it expects the 'golden_set' format, extracting only `cloth_id`.

    Returns:
        list[dict]: A list of dictionaries, where each dictionary represents an embedding entry.
                    Each dictionary will contain:
                    - "outfit_id" (str): Extracted outfit ID (only if `outfits` is True).
                    - "cloth_id" (str): Extracted cloth ID.
                    - "embeddings" (list[float]): The numerical embedding vector.
    """
    embeddings_list = [] # Initialize an empty list to store the parsed embedding dictionaries

    if outfits:
        # Mode for reading outfit embeddings (expected format: "cloth<outfit_id>_<cloth_id> ...")
        with open(path, "r") as f:
            lines = f.readlines() # Read all lines from the file
            # Strip whitespace from each line and split by space.
            # This separates the identifier string from the embedding values.
            lines = [line.strip().split(" ") for line in lines]
            
            # Iterate through each processed line (which is now a list of strings)
            for line in lines:
                # Extract outfit_id using regex: looks for digits after "cloth"
                # Example: "cloth123_abc" -> outfit_id "123"
                outfit_id_match = re.search(r"cloth(\d+)", line[0])
                outfit_id = outfit_id_match[1] if outfit_id_match else None

                # Extract cloth_id using regex: looks for alphanumeric characters after "cloth<digits>_"
                # Example: "cloth123_abc" -> cloth_id "abc"
                cloth_id_match = re.search(r"cloth\d+_([\d\w]+)", line[0])
                cloth_id = cloth_id_match[1] if cloth_id_match else None
                
                # Append the parsed data as a dictionary to the list
                embeddings_list.append({
                    "outfit_id" : outfit_id,
                    "cloth_id" : cloth_id,
                    "embeddings" : [float(val) for val in line[1:]] # Convert all subsequent values to floats for the embedding vector
                })
        return embeddings_list
    else:
        # Mode for reading golden_set embeddings (expected format: "<cloth_id>.jpg ...")
        with open(path, "r") as f:
            lines = f.readlines() # Read all lines from the file
            # Strip whitespace from each line and split by space.
            lines = [line.strip().split(" ") for line in lines]
        
            # Iterate through each processed line
            for line in lines:
                # Extract the name (which serves as cloth_id) by removing the ".jpg" extension
                # Example: "image123.jpg" -> name "image123"
                name = line[0][:-4] 
                
                # Append the parsed data as a dictionary to the list
                embeddings_list.append({
                    "cloth_id" : name,
                    "embeddings" : [float(val) for val in line[1:]] # Convert embedding values to floats
                })
        return embeddings_list

In [4]:
embeddings_dict = read_embeddings_txt(EMBEDDINGS_FILE)

In [5]:
vector_size = len(embeddings_dict[0]["embeddings"])
vector_size

384

# Connecting to Qdrant Client & Initial Configuration

In [6]:
QDRANT_CLOUD_URL = "https://b0da717c-1acf-4983-8bda-2c5214327161.eu-west-2-0.aws.cloud.qdrant.io:6333"
QDRANT_API_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.aaqOk2k8NhxTemd3DPe8jwFaeYv4Xb6CX3CH2IZ40ts"
COLLECTION_NAME = "outfit"

In [7]:
client = QdrantClient(
    url="http://localhost:6333"
)

In [8]:
hnsw_params = models.HnswConfigDiff(
    m = 4,
    ef_construct = 20
)

In [9]:
if client.collection_exists(collection_name = COLLECTION_NAME):
    client.delete_collection(collection_name = COLLECTION_NAME)

In [10]:
client.create_collection(
    collection_name = COLLECTION_NAME,
        vectors_config = models.VectorParams(
            size = vector_size,
            distance = models.Distance.COSINE
        ),
        hnsw_config = models.HnswConfigDiff(
            m = 8,
            ef_construct = 32
        ),
        quantization_config=models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                quantile=0.95,
                always_ram=True,
            ),
        ),
    )

True

In [11]:
client.get_collection(collection_name=COLLECTION_NAME)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=0, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=8, ef_construct=32, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantiz

In [12]:
def upload_embeddings_to_database(embeddings_dict, max_items: int = None, batch: int = 128):
    """
    Uploads a list of embeddings (points) to a specified Qdrant collection in batches.
    It provides real-time progress and estimated time remaining during the upload process.

    Args:
        embeddings_dict (list[dict]): A list of dictionaries, where each dictionary
                                     contains at least "embeddings" (the vector)
                                     and "outfit_id" (for the payload).
                                     Expected format: [{"outfit_id": "...", "embeddings": [...]}, ...]
        max_items (int, optional): The maximum number of embeddings to upload from the list.
                                   If None, all embeddings in `embeddings_dict` will be uploaded.
                                   Defaults to None.
        batch (int, optional): The number of embeddings to include in each batch upload request
                               to Qdrant. Larger batches can be more efficient but consume
                               more memory temporarily. Defaults to 128.

    Returns:
        None: The function performs the upload operation and prints progress.
              It does not return any value.
    """
    # Determine the total number of items to upload.
    # If max_items is not specified, upload all available embeddings.
    if max_items is None:
        max_items = len(embeddings_dict)
    
    # Initialize a list to store the time taken for each upload cycle (batch).
    # This is used to calculate the estimated time remaining.
    time_for_one_cycle = []
    
    # Calculate the total number of upload cycles (batches) needed.
    # Note: Integer division might truncate, but the range loop handles this.
    number_of_cycles = max_items / batch 
    
    # Iterate through the embeddings_dict in steps of 'batch' size.
    # 'i' represents the starting index of the current batch.
    for i in range(0, max_items, batch):
        t1 = time.time() # Record the start time of the current batch upload
        
        # Prepare the list of PointStruct objects for the current batch.
        # Each PointStruct requires a unique 'id', the 'vector' (embedding),
        # and an optional 'payload' (metadata).
        points_to_upsert = [
            PointStruct(
                # Generate a unique UUID for each point's ID.
                # Converting to string is necessary as Qdrant IDs are strings or integers.
                id=str(uuid.uuid4()), 
                # Assign the embedding vector to the 'vector' field.
                vector=item["embeddings"],
                # Store relevant metadata in the 'payload'.
                # Here, 'outfit_id' is stored for later retrieval/filtering.
                payload={
                    "outfit_id": item["outfit_id"]
                }
            )
            # Slice the embeddings_dict to get the current batch of items.
            # enumerate is used here but 'i' is the loop variable, consider if this 'i' is truly needed.
            # The enumerate `i` in the list comprehension is actually local to the list comprehension and unused.
            # It should just be `for item in embeddings_dict[i:i + batch]`.
            for item in embeddings_dict[i:i + batch] 
        ]
        
        # Send the batch of points to Qdrant for upsertion (insert or update).
        # 'COLLECTION_NAME' must be defined and point to your target Qdrant collection.
        # 'wait=True' ensures that the function call blocks until Qdrant confirms the operation
        # has been completed for this batch, which is useful for reliable uploads and timing.
        operation_info = client.upsert(
            collection_name=COLLECTION_NAME,
            wait=True,
            points=points_to_upsert
        )
        
        t2 = time.time() # Record the end time of the current batch upload
        dt = round(t2 - t1, 2) # Calculate the elapsed time for this batch
        time_for_one_cycle.append(dt) # Store the time for calculating estimated remaining time
        
        # Print progress and estimated time remaining.
        # The '\r' at the end ensures the line is overwritten in the console,
        # creating a dynamic progress bar.
        # Calculation for estimated time: (remaining_items / batch_size) * average_time_per_batch
        print(f"Progress: {100*(i / max_items):.3f}%;   "
              f"Estimated time: {((max_items - i) / batch) * np.mean(time_for_one_cycle):.3f} seconds\t",
              end="\r")

In [13]:
upload_embeddings_to_database(embeddings_dict, batch = 128)


Progress: 99.856%;   Estimated time: 0.008 seconds	

In [14]:
client.create_payload_index(
    collection_name= COLLECTION_NAME,
    field_name = "outfit_id",
    field_schema = models.PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=490, status=<UpdateStatus.COMPLETED: 'completed'>)

In [15]:
while True:
    collection_info = client.get_collection(collection_name = COLLECTION_NAME)
    if collection_info.status == models.CollectionStatus.GREEN:
        # Collection status is green, which means the indexing is finished
        print('Indexing finished')
        break

Indexing finished


# Search similar Outfits Function

In [16]:
def search_similar(golden_set_embeddings_path: str,
                   client: QdrantClient,
                   search_params: models.SearchParams,
                   score_threshold: float = 0.7,
                   collection_name: str = COLLECTION_NAME,
                   num_of_outfits_to_return: int = 5):
    """
    Performs a complex similarity search to find the most relevant "outfits"
    from a Qdrant collection based on a "golden set" of wardrobe embeddings.
    
    The function operates in several stages:
    1. Loads the "golden set" (wardrobe) embeddings.
    2. For each item in the wardrobe, queries Qdrant to find similar individual clothing items.
    3. Aggregates the `outfit_id`s from these similar individual items to identify candidate outfits.
    4. For each candidate outfit, retrieves all its associated clothing items from Qdrant.
    5. Calculates a "total_score" for each candidate outfit by finding the best match
       between its items and the wardrobe items, summing their similarity scores.
    6. Filters out outlier clothing items within an outfit that might skew scores.
    7. Returns the top `num_of_outfits_to_return` ranked outfits.

    Args:
        golden_set_embeddings_path (str): Path to the `.txt` file containing the
                                          embeddings of the "golden set" (wardrobe items).
        client (qdrant_client.QdrantClient): An initialized Qdrant client instance.
        search_params (models.SearchParams): Parameters for the Qdrant vector search,
                                           e.g., `hnsw_ef` for HNSW algorithm.
                                           For the "control" search, this should typically
                                           be set with `exact=True` to represent ground truth.
        score_threshold (float, optional): A minimum similarity score for individual
                                           clothing items to be considered "similar"
                                           when querying Qdrant. Defaults to 0.7.
        collection_name (str, optional): The name of the Qdrant collection where outfit
                                         clothing items are stored. Defaults to `COLLECTION_NAME`.
        num_of_outfits_to_return (int, optional): The maximum number of top-ranked outfits
                                                  to return. Defaults to 5.

    Returns:
        list[dict]: A list of dictionaries, where each dictionary represents a ranked outfit.
                    Each outfit dictionary contains:
                    - "outfit_id" (str): The unique identifier of the outfit.
                    - "total_score" (float): The aggregated similarity score for the outfit.
                    - "matches" (list[dict]): Details of the best matching wardrobe-to-outfit-item pairs.
                    Returns an empty list if no candidate outfits are found or if an error occurs.
    """
    # Load the "golden set" (wardrobe) embeddings from the specified text file.
    # 'outfits=False' tells read_embeddings_txt to parse it in the golden_set format.
    wardrobe_embeddings = read_embeddings_txt(golden_set_embeddings_path, outfits=False)
    
    # Extract just the embedding vectors into a NumPy array for efficient matrix operations later.
    wardrobe_embeddings_vectors = np.array([item["embeddings"] for item in wardrobe_embeddings])

    # Initialize a set to store unique outfit IDs found during the initial broad search.
    # A set is used to automatically handle duplicates.
    candidate_outfits_ids = set()

    # Step 1: Initial broad search to find candidate outfits.
    # For each embedding in the wardrobe (golden set):
    for i in range(len(wardrobe_embeddings)):
        # Use the current wardrobe item's embedding as the query vector.
        query_vector = wardrobe_embeddings[i]["embeddings"]
        
        # Query Qdrant to find clothing items similar to the current wardrobe item.
        # 'limit=50' retrieves up to 50 similar points for each wardrobe item query.
        # 'search_params' (e.g., hnsw_ef) controls the search algorithm's behavior.
        # 'score_threshold' filters out very dissimilar individual clothing items.
        similar_points = client.query_points(
            collection_name=collection_name, # Query the collection of all individual clothing items
            query=query_vector,
            limit=50,
            search_params=search_params, 
            score_threshold=score_threshold
        ).points
        
        # From the similar individual clothing items found, extract their associated outfit IDs.
        for point in similar_points:
            # Ensure the payload contains 'outfit_id' before attempting to access it.
            if "outfit_id" in point.payload:
                # Add the outfit_id to the set of candidates.
                candidate_outfits_ids.add(point.payload["outfit_id"])
                
    # If no candidate outfits were identified after iterating through all wardrobe items,
    # print a message and return an empty list.
    if not candidate_outfits_ids:
        print("No candidate outfits were found.")
        return []
    
    # Initialize a list to store dictionaries of ranked outfits, including their scores and matches.
    ranked_outfits = []
    
    # Step 2: Refine and score each candidate outfit.
    # For each unique outfit_id identified in the previous step:
    for outfit_id in candidate_outfits_ids:
        # Retrieve all individual clothing items that belong to the current outfit.
        # 'scroll' is used to efficiently retrieve points that match a specific filter.
        # The filter is set to find all points where 'outfit_id' matches the current outfit_id.
        records, next_offset = client.scroll(
            collection_name=collection_name,
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="outfit_id",
                        match=models.MatchValue(value=outfit_id)
                    )
                ]
            ),
            limit=100, # Retrieve up to 100 items per outfit. Adjust if outfits can be larger.
            with_payload=True, # Ensure payload (metadata) is returned with each record.
            with_vectors=True # Ensure embedding vectors are returned with each record.
        )
        
        # If no records are found for an outfit ID (shouldn't happen if outfit_id came from a valid point),
        # or if there's an issue, print an error and return.
        if not records:
            print(f"No clothes for outfit_id '{outfit_id}' were found, some error occurred.")
            return []
        
        # Extract embeddings and IDs of the clothing items belonging to the current outfit.
        outfit_item_embeddings = np.array([record.vector for record in records])
        outfit_item_ids = [record.id for record in records]

        # An outfit must consist of at least two items to be considered valid.
        # This prevents single-item "outfits" from being scored.
        if len(outfit_item_ids) < 2:
            continue # Skip this outfit if it's too small.
            
        # Calculate the similarity matrix between all wardrobe item embeddings and all
        # items in the current candidate outfit.
        # The result is a matrix where rows are wardrobe items and columns are outfit items.
        # Each cell (i, j) contains the dot product (similarity) between wardrobe_embeddings_vectors[i]
        # and outfit_item_embeddings[j].
        similarity_matrix = np.dot(wardrobe_embeddings_vectors, outfit_item_embeddings.T)
        
        # For each outfit item, find the wardrobe item that matches it best.
        # `best_matches_indices` stores the index of the best-matching wardrobe item for each outfit item.
        # `best_matches_scores` stores the similarity score of that best match.
        best_matches_indices = np.argmax(similarity_matrix, axis=0)
        best_matches_scores = np.max(similarity_matrix, axis=0)
        
        # Initialize a list to store details of the matched items within the outfit.
        matches = []
        
        # Calculate the mean of the best match scores for the current outfit.
        best_matches_mean = np.mean(best_matches_scores)
        
        # Iterate through each clothing item in the current outfit to process its best match.
        for i, outfit_item_id in enumerate(outfit_item_ids):
            # IMPORTANT CONSIDERATION: Outlier Detection based on similarity score.
            # If an outfit item's best match score is significantly lower than the average
            # best match score for the entire outfit, it's considered an outlier.
            # This helps to filter out items that might be irrelevant or poorly embedded,
            # preventing them from negatively impacting the total outfit score.
            # The threshold (0.75 * mean) is heuristic and can be tuned.
            if best_matches_scores[i] < best_matches_mean * 0.75:
                # "if some item has very low similarity score compared to others items
                # then it means it is outlier, and probably will disturb the prediction"
                # This logic is based on the observation that certain item types (like boots)
                # might inherently have lower scores, and this threshold aims to filter
                # out items that are too dissimilar within the context of the current outfit's matches.
                # "boots usually have low score" - this note explains a specific domain observation
                # justifying the need for such an outlier filter.
                continue # Skip this outlier item.
            
            # Get the index of the best-matching wardrobe item.
            wardrobe_idx = int(best_matches_indices[i])
            
            # Add details of the matched pair to the 'matches' list.
            matches.append({
                "wardrobe_image_index": wardrobe_idx,
                "wardrobe_image_id": wardrobe_embeddings[wardrobe_idx]["cloth_id"],
                "outfit_item_id": str(outfit_item_id),
                "score": float(best_matches_scores[i])
            })
        
        # Calculate the total score for the current outfit.
        # This is the sum of the best match scores (after potential outlier filtering).
        # "if we take np.mean(), for some reason it will ignore boots in the wardrobe
        # so that it will search only for outfits with tshirt and pants
        # given the fact that boots always have lower scores (24 vs 35 for other)
        # we will always ignore outfits with boots"
        # This note explains why `np.sum` is used instead of `np.mean` for `total_score`.
        # Using mean might inadvertently penalize outfits containing items (like boots)
        # that naturally have lower, but still acceptable, similarity scores.
        # Summing avoids this by favoring outfits with more relevant overall matches.
        total_score = np.sum(best_matches_scores)
        
        # Append the scored outfit to the list of ranked outfits.
        ranked_outfits.append({
            "outfit_id": outfit_id,
            "total_score": total_score,
            "matches": matches # Include the details of the best matches
        })
        
    # Sort the outfits by their 'total_score' in descending order
    # to get the top-ranked outfits.
    ranked_outfits.sort(key=lambda x: x["total_score"], reverse=True)
    
    # Return only the top 'num_of_outfits_to_return' outfits.
    return ranked_outfits[:num_of_outfits_to_return]

In [17]:
def get_process_metrics():
    process = psutil.Process(os.getpid())
    cpu_percent = process.cpu_percent(interval = None)
    ram_info = process.memory_info()
    ram_used_mb = ram_info.rss / (1024 * 1024)
    return {"cpu_percent": cpu_percent, "ram_used_mb": ram_used_mb}

In [18]:
def get_overall_memory_usage(client: QdrantClient,
                             collection_name: str = COLLECTION_NAME):
    number_of_vectors = client.get_collection(collection_name = collection_name).points_count
    vector_size_vector = client.get_collection(collection_name = collection_name).config.params.vectors.size
    memory_size_vector = number_of_vectors * vector_size_vector * 4 * 1.5 
    memory_size_mb_vector = memory_size_vector / (1024 * 1024)
    
    payload_size = number_of_vectors * 56 * 1.5
    # 56 stands for bytes occupied by the "outfit_id" payload. This is actually approximate
    payload_size_mb = payload_size / (1024 * 1024)
    
    return {
        "vectors_size_mb": memory_size_mb_vector,
        "payload_size_mb": payload_size_mb,
        "overall_memory_usage_mb": memory_size_mb_vector + payload_size_mb
    }

In [19]:
def get_exact_search_metrics(client: QdrantClient,
                             golden_set_embeddings_path: str,
                             collection_name: str = COLLECTION_NAME,
                             k: int = 5):
    client.update_collection(
        collection_name = collection_name,
        hnsw_config = models.HnswConfigDiff(
            m = 0
        ),
    )

    perfmetrics_control_local_begin = get_process_metrics()
    search_time_control_begin = time.time()
    search_results_control = search_similar(
            golden_set_embeddings_path=golden_set_embeddings_path,
            client=client,
            search_params=models.SearchParams(exact = True, quantization = models.QuantizationSearchParams(ignore = True)), 
            num_of_outfits_to_return=k
    )
    search_time_control_end = time.time()
    perfmetrics_control_local_end = get_process_metrics()
    cached_results = {
            "search_results_control": search_results_control,
            "perfmetrics_control_local_begin": perfmetrics_control_local_begin,
            "search_time_control_begin": search_time_control_begin,
            "search_time_control_end": search_time_control_end,
            "perfmetrics_control_local_end": perfmetrics_control_local_end
    }
        
        
    return cached_results
        

In [20]:
def calculate_metrics_for_configuration(client: QdrantClient,
                                        golden_set_embeddings_path: str,
                                        cached_results,
                                        search_params_control: models.SearchParams,
                                        search_params_test: models.SearchParams,
                                        k: int = 5,
                                        ):
    """
    Measures search latency, accuracy metrics (Precision, Recall, NDCG), and local 
    resource usage for two different search configurations.

    Args:
        client (QdrantClient): The Qdrant client instance.
        golden_set_embeddings_path (str): Path to the embeddings for the queries.
        search_params_control (models.SearchParams): Search parameters for the ground truth search (should be exact=True).
        search_params_test (models.SearchParams): Search parameters for the HNSW test search.
        k (int): The number of top results to consider (k in @k).

    Returns:
        dict: A dictionary containing all calculated performance metrics.
    """
    
    
    # --- Execute Control Search (Ground Truth) ---
    perfmetrics_control_local_begin = cached_results["perfmetrics_control_local_begin"]
    search_time_control_begin = cached_results["search_time_control_begin"]
    search_results_control = cached_results["search_results_control"]
    search_time_control_end = cached_results["search_time_control_end"]
    perfmetrics_control_local_end = cached_results["perfmetrics_control_local_end"]

    
    # --- Execute Test Search (HNSW) ---
    perfmetrics_test_local_begin = get_process_metrics()
    search_time_test_begin = time.time()
    search_results_test = search_similar(
        golden_set_embeddings_path=golden_set_embeddings_path,
        client=client,
        search_params=search_params_test,
        num_of_outfits_to_return=k
    )
    search_time_test_end = time.time()
    perfmetrics_test_local_end = get_process_metrics()
    
    # --- Extract relevant data for metric calculation ---
    # IDs from the ground truth search results
    ground_truth_outfit_ids = np.array([result["outfit_id"] for result in search_results_control])
    # IDs from the HNSW test search results
    test_outfit_ids = np.array([result["outfit_id"] for result in search_results_test])
    # Scores from the HNSW test search results
    test_scores = np.array([result["total_score"] for result in search_results_test])
    
    # --- Calculate Accuracy Metrics ---
    
    # Create a binary relevance vector for the test results based on the ground truth.
    # 1 if the test result ID is in the ground truth set, 0 otherwise.
    y_true_relevance = np.array([1 if outfit_id in ground_truth_outfit_ids else 0 for outfit_id in test_outfit_ids])
    y_true_relevance_2d = y_true_relevance.reshape(1, -1)
    
    # Reshape the test scores for scikit-learn
    test_scores_2d = test_scores.reshape(1, -1)
    
    # Calculate Precision@k: fraction of retrieved documents that are relevant
    precision_at_k = np.sum(y_true_relevance) / k
    
    # Calculate Recall@k: fraction of relevant documents that are retrieved
    # Denominator is the size of the ground truth set (k in this case)
    recall_at_k = np.sum(y_true_relevance) / len(ground_truth_outfit_ids)
    
    # Calculate NDCG@k: quality of the ranked list, penalizing lower-ranked relevant items
    ndcg_at_k = ndcg_score(y_true=y_true_relevance_2d, y_score=test_scores_2d, k=k)
    
    # --- Calculate Time and Resource Metrics ---
    time_elapsed_for_control = search_time_control_end - search_time_control_begin
    time_elapsed_for_test = search_time_test_end - search_time_test_begin
    
    avg_cpu_perc_control = np.mean([perfmetrics_control_local_begin["cpu_percent"], perfmetrics_control_local_end["cpu_percent"]])
    avg_cpu_perc_test = np.mean([perfmetrics_test_local_begin["cpu_percent"], perfmetrics_test_local_end["cpu_percent"]])
    
    avg_ram_control = np.mean([perfmetrics_control_local_begin["ram_used_mb"], perfmetrics_control_local_end["ram_used_mb"]])
    avg_ram_test = np.mean([perfmetrics_test_local_begin["ram_used_mb"], perfmetrics_test_local_end["ram_used_mb"]])
    

    # --- Return all metrics in a clear dictionary ---
    return {
        "time_elapsed_control_s": time_elapsed_for_control,
        "time_elapsed_test_s": time_elapsed_for_test,
        "local_cpu_perc_control_avg": avg_cpu_perc_control,
        "local_cpu_perc_test_avg": avg_cpu_perc_test,
        "local_ram_control_mb_avg": avg_ram_control,
        "local_ram_test_mb_avg": avg_ram_test,
        "precision_at_k": precision_at_k,
        "recall_at_k": recall_at_k,
        "ndcg_at_k": ndcg_at_k
    }


In [21]:
cached_metrics = get_exact_search_metrics(client, "dino_small_embeddings_men_casual.txt", COLLECTION_NAME)

In [22]:
def grid_search_parameters(
                           client: QdrantClient,
                           golden_set_embeddings_path: str,
                           df_list,
                           df_types: str,
                           collection_name: str = COLLECTION_NAME,
                           cached_metrics = cached_metrics
                           ):
    dict_params_list = [df.to_dict(orient='records') for df in df_list]
    total_number_of_combinations = sum(df.shape[0] for df in df_list)
    all_metrics = []
    count_passed = 0
    times_went = [0]
    print(f"Starting grid search, total number of combinations: {total_number_of_combinations}")
    for dict_params, df_type in zip(dict_params_list, df_types):
        for param in dict_params:
            print(f"<------------------ ITERATION {count_passed + 1} ------------------>")
            t1 = time.time()
            print(f"Progress: {100 * count_passed / total_number_of_combinations:.3f}%; Estimated time: {(total_number_of_combinations - count_passed) * np.mean(times_went):.3f} seconds")
            print(f"Going through the following parameters: {param}")
            if df_type == 'binary':
                client.update_collection(
                    collection_name = collection_name,
                    hnsw_config = models.HnswConfigDiff(
                        m = param["m"],
                        ef_construct = param["ef_construct"]
                    ),
                )
                
                while True:
                    time.sleep(3)
                    
                    collection_info = client.get_collection(collection_name = COLLECTION_NAME)
                    
                    if collection_info.status == models.CollectionStatus.GREEN:
                        print("Indexing finished for these params")
                        break
                    
                metrics = calculate_metrics_for_configuration(client = client, 
                        golden_set_embeddings_path = golden_set_embeddings_path,
                        cached_results = cached_metrics,
                        search_params_control = models.SearchParams(hnsw_ef = param["hnsw_ef"], exact = True,
                                    quantization=models.QuantizationSearchParams(
                                    ignore = False, rescore = param["rescore"], oversampling = param["oversampling"])),
                        search_params_test = models.SearchParams(hnsw_ef = param["hnsw_ef"], exact = False,
                                    quantization=models.QuantizationSearchParams(
                                    ignore = False, rescore = param["rescore"], oversampling = param["oversampling"])))                     

            elif df_type == 'product':
                
                if param["compression"] == 16:
                    client.update_collection(
                        collection_name = collection_name,
                        hnsw_config = models.HnswConfigDiff(
                            m = param["m"], 
                            ef_construct = param["ef_construct"]
                        ),
                        quantization_config=models.ProductQuantization(
                            product=models.ProductQuantizationConfig(
                                compression=models.CompressionRatio.X16,
                                always_ram=True,
                            ),
                        )
                    )
                elif param["compression"] == 32:
                    client.update_collection(
                        collection_name = collection_name,
                        hnsw_config = models.HnswConfigDiff(
                            m = param["m"], 
                            ef_construct = param["ef_construct"]
                        ),
                        quantization_config=models.ProductQuantization(
                            product=models.ProductQuantizationConfig(
                                compression=models.CompressionRatio.X32,
                                always_ram=True,
                            ),
                        )
                    )
                elif param["compression"] == 64: 
                    client.update_collection(
                        collection_name = collection_name,
                        hnsw_config = models.HnswConfigDiff(
                            m = param["m"], 
                            ef_construct = param["ef_construct"]
                        ),
                        quantization_config=models.ProductQuantization(
                            product=models.ProductQuantizationConfig(
                                compression=models.CompressionRatio.X64,
                                always_ram=True,
                            ),
                        )
                    )
                
                while True:
                    time.sleep(3)
                    collection_info = client.get_collection(collection_name = COLLECTION_NAME)
                    if collection_info.status == models.CollectionStatus.GREEN:
                        print("Indexing finished for these params")
                        break
                    
                metrics = calculate_metrics_for_configuration(client = client, 
                        golden_set_embeddings_path = golden_set_embeddings_path,
                        cached_results = cached_metrics,
                        search_params_control = models.SearchParams(hnsw_ef = param["hnsw_ef"], exact = True,
                                    quantization=models.QuantizationSearchParams(
                                    ignore = False, rescore = param["rescore"], 
                                    oversampling = param["oversampling"])),
                        search_params_test = models.SearchParams(hnsw_ef = param["hnsw_ef"], exact = False,
                                    quantization=models.QuantizationSearchParams(
                                    ignore = False, rescore = param["rescore"], 
                                    oversampling = param["oversampling"])))   
                
            elif df_type == 'scalar':
                    
                client.update_collection(
                    collection_name = collection_name,
                    hnsw_config = models.HnswConfigDiff(
                            m = param["m"],
                            ef_construct = param["ef_construct"]
                    ),
                    quantization_config=models.ScalarQuantization(
                        scalar=models.ScalarQuantizationConfig(
                            type=models.ScalarType.INT8,
                            quantile=param["quantile"],
                            always_ram=True,
                        ),
                    ),
                )
                
                while True:
                    time.sleep(3)
                    collection_info = client.get_collection(collection_name = COLLECTION_NAME)
                    if collection_info.status == models.CollectionStatus.GREEN:
                        print("Indexing finished for these params")
                        break
                    
                metrics = calculate_metrics_for_configuration(client = client, 
                        golden_set_embeddings_path = golden_set_embeddings_path,
                        cached_results = cached_metrics,
                        search_params_control = models.SearchParams(hnsw_ef = param["hnsw_ef"], exact = True,
                                    quantization=models.QuantizationSearchParams(
                                    ignore = False, rescore = param["rescore"], oversampling = param["oversampling"])),
                        search_params_test = models.SearchParams(hnsw_ef = param["hnsw_ef"], exact = False,
                                    quantization=models.QuantizationSearchParams(
                                    ignore = False, rescore = param["rescore"], oversampling = param["oversampling"])))    
            
            count_passed += 1
            all_metrics.append({**param, **metrics})
            t2 = time.time()
            if count_passed == 1:
                    times_went = []
            dt = t2 - t1
            times_went.append(dt) 
    
    
    
    df = pd.DataFrame(all_metrics)
    return df
    

In [23]:
results = grid_search_parameters(client=client,
                                 golden_set_embeddings_path="dino_small_embeddings_men_casual.txt",
                                 df_list = [
                                     pd.read_csv('combinations_product.csv').drop(columns=["Unnamed: 0"]),
                                     pd.read_csv('combinations_binary.csv').drop(columns=["Unnamed: 0"]),
                                     pd.read_csv('combinations_scalar.csv').drop(columns=["Unnamed: 0"]),
                                 ],
                                 df_types= ['product', 'binary', 'scalar'])

Starting grid search, total number of combinations: 160
<------------------ ITERATION 1 ------------------>
Progress: 0.000%; Estimated time: 0.000 seconds
Going through the following parameters: {'m': 8, 'hnsw_ef': 16, 'ef_construct': 16, 'oversampling': 1, 'rescore': False, 'compression': 32}
Indexing finished for these params
<------------------ ITERATION 2 ------------------>
Progress: 0.625%; Estimated time: 570.084 seconds
Going through the following parameters: {'m': 8, 'hnsw_ef': 16, 'ef_construct': 16, 'oversampling': 1, 'rescore': False, 'compression': 64}
Indexing finished for these params
<------------------ ITERATION 3 ------------------>
Progress: 1.250%; Estimated time: 590.250 seconds
Going through the following parameters: {'m': 8, 'hnsw_ef': 16, 'ef_construct': 16, 'oversampling': 1, 'rescore': True, 'compression': 32}
Indexing finished for these params
<------------------ ITERATION 4 ------------------>
Progress: 1.875%; Estimated time: 592.219 seconds
Going through 

In [66]:
results.to_csv(f"results_{EMBEDDINGS_FILE}.csv")

In [24]:
results_suitable = results[(results.precision_at_k >= 1.0) & (results.recall_at_k >= 1.0)]

In [25]:
results_suitable_sorted = results_suitable.sort_values(by='time_elapsed_test_s', ascending=True)
results_suitable_sorted

Unnamed: 0,m,hnsw_ef,ef_construct,oversampling,rescore,compression,time_elapsed_control_s,time_elapsed_test_s,local_cpu_perc_control_avg,local_cpu_perc_test_avg,local_ram_control_mb_avg,local_ram_test_mb_avg,precision_at_k,recall_at_k,ndcg_at_k,quantile
35,16,16,16,1,True,64.0,0.464636,0.450544,0.0,0.0,1141.484375,109.929688,1.0,1.0,1.0,
99,8,16,16,1,True,,0.464636,0.531488,0.0,0.0,1141.484375,108.695312,1.0,1.0,1.0,0.99
113,8,64,16,1,False,,0.464636,0.551282,0.0,0.0,1141.484375,105.000000,1.0,1.0,1.0,0.99
0,8,16,16,1,False,32.0,0.464636,0.571450,0.0,0.0,1141.484375,1156.101562,1.0,1.0,1.0,
41,16,16,32,1,False,64.0,0.464636,0.614550,0.0,0.0,1141.484375,109.148438,1.0,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,16,64,16,1,False,,0.464636,2.858412,0.0,0.0,1141.484375,109.414062,1.0,1.0,1.0,0.95
150,16,64,16,4,True,,0.464636,2.886774,0.0,0.0,1141.484375,111.500000,1.0,1.0,1.0,0.95
128,16,16,16,1,False,,0.464636,2.895220,0.0,0.0,1141.484375,1215.570312,1.0,1.0,1.0,0.95
109,8,16,32,4,False,,0.464636,2.933762,0.0,0.0,1141.484375,108.773438,1.0,1.0,1.0,0.99


In [70]:
client.update_collection(
    collection_name = COLLECTION_NAME,
        hnsw_config = models.HnswConfigDiff(
            m = 8,
            ef_construct = 32
        ),
        quantization_config=models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                quantile=0.95,
                always_ram=True,
            ),
        ),
    )

True

In [94]:
client.get_collection(collection_name = COLLECTION_NAME)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=55680, points_count=62554, segments_count=6, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=8, ef_construct=32, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0),

In [101]:
results_for_demo = search_similar("dino_base_embeddings_men_casual.txt",
               client,
               search_params = models.SearchParams(hnsw_ef = 16, exact = False,
                                    quantization=models.QuantizationSearchParams(
                                    ignore = False, rescore = True, oversampling = 1)),
               collection_name = COLLECTION_NAME)

In [102]:
results_for_demo

[{'outfit_id': '9078',
  'total_score': np.float64(105.59075819358918),
  'matches': [{'wardrobe_image_index': 2,
    'wardrobe_image_id': 'casual_men_tshirt_0',
    'outfit_item_id': '22568b09-a5b5-4a3e-86ec-8369bd6d9982',
    'score': 35.53486351614366},
   {'wardrobe_image_index': 9,
    'wardrobe_image_id': 'casual_men_boots_0',
    'outfit_item_id': '90fa6f56-7eef-4664-857f-3832cf0dad74',
    'score': 24.67156793179209},
   {'wardrobe_image_index': 3,
    'wardrobe_image_id': 'casual_men_pants_1',
    'outfit_item_id': 'd97371b5-60cf-4105-b5fa-9ccebdebcd01',
    'score': 32.09491485595089}]},
 {'outfit_id': '3610',
  'total_score': np.float64(94.36473049896864),
  'matches': [{'wardrobe_image_index': 9,
    'wardrobe_image_id': 'casual_men_boots_0',
    'outfit_item_id': '0b090248-8999-4638-8241-4048f32d640c',
    'score': 17.81999637426704},
   {'wardrobe_image_index': 3,
    'wardrobe_image_id': 'casual_men_pants_1',
    'outfit_item_id': '706d6730-a77e-4fd5-a993-2f898ad8203a',
