# Benchmarks y Comparaciones de Algoritmos en CUVS

Este notebook compara los algoritmos de búsqueda aproximada de vecinos de cuVS (IVF-Flat, IVF-PQ y CAGRA) sobre el dataset real SIFT-128, midiendo recall y consultas por segundo (QPS) y visualizando los resultados.

## 1. Instalar RAPIDS y CUVS

Instalamos cuVS (el paquete `cuvs-cu12` de RAPIDS) y `h5py` en Colab.

In [None]:
!pip install cuvs-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install h5py

## 2. Importar Librerías Requeridas

Importamos librerías necesarias.

In [None]:
import numpy as np
import cupy as cp
from cuvs.common import Resources
from cuvs.neighbors import ivf_flat, ivf_pq, cagra
import time
import matplotlib.pyplot as plt
import h5py
import urllib.request
import tempfile
import os

## 3. Cargar y Preparar Dataset

Cargamos un subconjunto del dataset SIFT-128 de ANN-Benchmarks: los primeros 50 000 vectores de entrenamiento y las primeras 1 000 consultas.

In [None]:
def load_sift():
    """Open the SIFT-128 ANN-Benchmarks file, downloading it on first use.

    The HDF5 file is cached as ``sift.hdf5`` in the system temporary
    directory and is fetched only when that cached copy does not exist.

    Returns:
        h5py.File: the file opened read-only. The handle is left open so the
        caller can keep slicing its datasets; closing it is the caller's
        responsibility.

    Raises:
        urllib.error.URLError: if the download fails.
        OSError: if the cached file cannot be written or opened.
    """
    url = "http://ann-benchmarks.com/sift-128-euclidean.hdf5"
    file_path = os.path.join(tempfile.gettempdir(), "sift.hdf5")
    if not os.path.exists(file_path):
        # Download under a temporary name and rename only on success. An
        # interrupted transfer must not leave a truncated file at file_path:
        # the existence check above would then skip the download on every
        # later run and the cached copy could never be opened.
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return h5py.File(file_path, 'r')

def _exact_neighbors(base, query, n_neighbors):
    """Return exact Euclidean nearest neighbours of each `query` row in `base`.

    Args:
        base: 2-D CuPy array of indexed vectors, one per row.
        query: 2-D CuPy array of query vectors with the same row width.
        n_neighbors: number of neighbours to keep per query.

    Returns:
        A ``(distances, indices)`` pair of CuPy arrays, each with one row per
        query and `n_neighbors` columns ordered from nearest to farthest.
        Indices are row positions in `base`.
    """
    # Work in float64: the expanded form |q|^2 - 2 q.b + |b|^2 subtracts
    # large, nearly equal numbers, and float32 rounding could reorder
    # near-ties in what is supposed to be the reference answer.
    base64 = base.astype(cp.float64)
    query64 = query.astype(cp.float64)
    sq_dist = (
        (query64 * query64).sum(axis=1)[:, None]
        - 2.0 * (query64 @ base64.T)
        + (base64 * base64).sum(axis=1)[None, :]
    )
    # Rounding can push an exact zero slightly negative; clamp before sqrt.
    cp.maximum(sq_dist, 0.0, out=sq_dist)
    order = cp.ascontiguousarray(cp.argsort(sq_dist, axis=1)[:, :n_neighbors])
    return cp.sqrt(cp.take_along_axis(sq_dist, order, axis=1)), order


f = load_sift()
dataset = cp.array(f['train'][:50000])  # Subset for demo
queries = cp.array(f['test'][:1000])
# The indices the indexes return are row positions inside `dataset`, so the
# ground truth must be exact neighbours within that same 50,000-row subset.
# The file's precomputed 'neighbors'/'distances' are not restricted to this
# subset, so recall measured against them would be meaningless; recompute
# the ground truth here, keeping the file's number of columns per query.
gt_distances, gt_neighbors = _exact_neighbors(
    dataset, queries, f['neighbors'].shape[1]
)
metric = f.attrs['distance']

print(f"Dataset shape: {dataset.shape}, Metric: {metric}")

## 4. Construir Índice CUVS

Construimos índices para IVF-Flat, IVF-PQ y CAGRA.

In [None]:
# One shared handle so that every build below can be awaited with a single
# resources.sync() at the end of the cell.
resources = Resources()

# IVF-Flat
build_params_flat = ivf_flat.IndexParams(n_lists=1024, metric=metric)
index_flat = ivf_flat.build(build_params_flat, dataset, resources=resources)

# IVF-PQ
pq_dim = 64
build_params_pq = ivf_pq.IndexParams(n_lists=1024, metric=metric, pq_dim=pq_dim)
index_pq = ivf_pq.build(build_params_pq, dataset, resources=resources)

# CAGRA
# CAGRA documents "sqeuclidean" and "inner_product" as its metric values,
# while the dataset attribute may be "euclidean". Squared L2 ranks
# neighbours exactly like L2, so the returned neighbour ids (and therefore
# recall) are unaffected by this substitution.
cagra_metric = "sqeuclidean" if metric == "euclidean" else metric
build_params_cagra = cagra.IndexParams(metric=cagra_metric)
index_cagra = cagra.build(build_params_cagra, dataset, resources=resources)

# Wait for all three builds before reporting completion.
resources.sync()
print("Indices built")

## 5. Realizar Búsqueda de Vectores

Ejecutamos búsquedas en todos los índices.

In [None]:
# Number of neighbours requested per query by every search below.
k = 10

# Search-time settings, one object per algorithm. Both IVF variants probe
# 30 lists per query; CAGRA runs with its default parameters.
search_params_flat = ivf_flat.SearchParams(n_probes=30)
search_params_pq = ivf_pq.SearchParams(n_probes=30)
search_params_cagra = cagra.SearchParams()

# Run the same query batch through each index, in the order
# IVF-Flat, IVF-PQ, CAGRA. Each call returns (distances, neighbour ids).
dist_flat, neigh_flat = ivf_flat.search(
    search_params_flat, index_flat, queries, k=k, resources=resources
)
dist_pq, neigh_pq = ivf_pq.search(
    search_params_pq, index_pq, queries, k=k, resources=resources
)
dist_cagra, neigh_cagra = cagra.search(
    search_params_cagra, index_cagra, queries, k=k, resources=resources
)

# Wait for all three searches before reporting completion.
resources.sync()
print("Searches completed")

## 6. Evaluar Precisión de Búsqueda

Calculamos recall@10 para cada algoritmo.

In [None]:
def recall_at_k(pred, true, k):
    """Return the mean recall@k of predicted neighbours against ground truth.

    For each query, recall is the fraction of its first `k` ground-truth
    neighbours that appear among its first `k` predicted neighbours.

    Args:
        pred: sequence of per-query predicted neighbour ids (rows).
        true: sequence of per-query ground-truth neighbour ids, ordered from
            nearest to farthest. Rows may hold more than `k` entries; only
            the first `k` are used.
        k: number of neighbours to evaluate; must be positive.

    Returns:
        float: mean recall over all queries, or 0.0 when `pred` is empty.

    Raises:
        ValueError: if `k` is not positive or `true` has fewer rows than
            `pred`.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    n_queries = len(pred)
    if n_queries == 0:
        return 0.0
    if len(true) < n_queries:
        raise ValueError("ground truth has fewer rows than predictions")
    # Truncate both sides to k. Ground-truth files commonly store more than k
    # neighbours per query, and intersecting against all of them would count
    # hits outside the true top-k and inflate the score.
    hits = sum(
        len(set(found[:k]) & set(expected[:k]))
        for found, expected in zip(pred, true)
    )
    return hits / (k * n_queries)

# recall_at_k works on host data; copy the ground truth off the device once
# and reuse it for all three algorithms instead of transferring it each time.
gt_host = cp.asnumpy(gt_neighbors)

recall_flat = recall_at_k(cp.asnumpy(neigh_flat), gt_host, k)
recall_pq = recall_at_k(cp.asnumpy(neigh_pq), gt_host, k)
recall_cagra = recall_at_k(cp.asnumpy(neigh_cagra), gt_host, k)

print(f"IVF-Flat Recall@{k}: {recall_flat:.4f}")
print(f"IVF-PQ Recall@{k}: {recall_pq:.4f}")
print(f"CAGRA Recall@{k}: {recall_cagra:.4f}")

## 7. Benchmark de Métricas de Rendimiento

Medimos QPS para cada algoritmo.

In [None]:
# Measure QPS
n_reps = 3


def _time_search(run_search, n_reps):
    """Return the wall-clock seconds of `n_reps` synchronized search runs.

    Args:
        run_search: zero-argument callable that launches one search.
        n_reps: number of timed repetitions.

    Returns:
        list[float]: one duration in seconds per repetition.
    """
    # Drain any pending work first so it is not billed to the first run.
    resources.sync()
    durations = []
    for _ in range(n_reps):
        # perf_counter is the monotonic, high-resolution clock intended for
        # measuring short intervals; time.time() can jump with clock changes.
        start = time.perf_counter()
        run_search()
        # Wait for the search to finish so the timing covers the work itself.
        resources.sync()
        durations.append(time.perf_counter() - start)
    return durations


# IVF-Flat
times_flat = _time_search(
    lambda: ivf_flat.search(search_params_flat, index_flat, queries, k=k, resources=resources),
    n_reps,
)
qps_flat = len(queries) / np.mean(times_flat)

# IVF-PQ
times_pq = _time_search(
    lambda: ivf_pq.search(search_params_pq, index_pq, queries, k=k, resources=resources),
    n_reps,
)
qps_pq = len(queries) / np.mean(times_pq)

# CAGRA
times_cagra = _time_search(
    lambda: cagra.search(search_params_cagra, index_cagra, queries, k=k, resources=resources),
    n_reps,
)
qps_cagra = len(queries) / np.mean(times_cagra)

print(f"IVF-Flat QPS: {qps_flat:.0f}")
print(f"IVF-PQ QPS: {qps_pq:.0f}")
print(f"CAGRA QPS: {qps_cagra:.0f}")

## 8. Comparar con Alternativas Basadas en CPU

Comparamos con FAISS en CPU, usando como referencia un índice exacto de fuerza bruta (`IndexFlatL2`).

In [None]:
!pip install faiss-cpu
import faiss

# FAISS IndexFlat
index_faiss_flat = faiss.IndexFlatL2(dataset.shape[1])
index_faiss_flat.add(cp.asnumpy(dataset))

start = time.time()
dist_faiss, neigh_faiss = index_faiss_flat.search(cp.asnumpy(queries), k)
time_faiss = time.time() - start

recall_faiss = recall_at_k(neigh_faiss, cp.asnumpy(gt_neighbors), k)
qps_faiss = len(queries) / time_faiss

print(f"FAISS Flat Recall: {recall_faiss:.4f}, QPS: {qps_faiss:.0f}")

## 9. Optimizar Parámetros del Índice

Experimentamos con n_probes para IVF-Flat.

In [None]:
# Sweep n_probes for IVF-Flat and record recall and throughput at each value.
n_probes_list = [1, 5, 10, 20, 50, 100]
recalls = []
qps_list = []

# The ground truth is the same for every setting: copy it to the host once
# rather than once per iteration.
gt_host = cp.asnumpy(gt_neighbors)

for n_probes in n_probes_list:
    search_params = ivf_flat.SearchParams(n_probes=n_probes)
    # perf_counter is the monotonic, high-resolution clock meant for timing.
    start = time.perf_counter()
    dist, neigh = ivf_flat.search(search_params, index_flat, queries, k=k, resources=resources)
    # Wait for the search to finish so the timing covers the work itself.
    resources.sync()
    t = time.perf_counter() - start
    r = recall_at_k(cp.asnumpy(neigh), gt_host, k)
    recalls.append(r)
    qps_list.append(len(queries) / t)

plt.figure(figsize=(10, 5))

# Left panel: how recall changes with the number of probed lists.
plt.subplot(1, 2, 1)
plt.plot(n_probes_list, recalls, marker='o')
plt.xlabel('n_probes')
plt.ylabel('Recall@10')

# Right panel: the recall/throughput trade-off across the same settings.
plt.subplot(1, 2, 2)
plt.plot(recalls, qps_list, marker='o')
plt.xlabel('Recall@10')
plt.ylabel('QPS')
plt.show()

## 10. Manejar Datasets a Gran Escala

Demostramos el mismo flujo con un subconjunto más grande: 100 000 vectores y 5 000 consultas.

In [None]:
# Scale up the demo: take the first 100,000 training vectors and the first
# 5,000 test queries from the open HDF5 file and move them to the GPU.
large_dataset = cp.asarray(f['train'][:100000])
large_queries = cp.asarray(f['test'][:5000])

# Rebuild IVF-Flat on the larger set, reusing the build and search
# parameters defined for the smaller experiment.
index_large = ivf_flat.build(
    build_params_flat, large_dataset, resources=resources
)
dist_large, neigh_large = ivf_flat.search(
    search_params_flat, index_large, large_queries, k=k, resources=resources
)
# Wait for the build and search to finish before reporting completion.
resources.sync()

print(f"Large dataset search completed: {large_dataset.shape[0]} vectors, {large_queries.shape[0]} queries")