
# Cargando datos


In [1]:
from google.colab import drive
import pickle
import numpy as np

# Mount Google Drive at /content/drive (google.colab only); the dataset path
# used by the following cells lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Location of the pickled dataset on the mounted Drive.
dataset_path = "/content/drive/MyDrive/dataset/the_last_one.pkl"

In [19]:
# Read the pickled dataset from Drive. The file is opened in binary *read*
# mode, so nothing is created or written here.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(dataset_path, 'rb') as file:
    # The loaded object is iterated below; each element is accessed with the
    # keys "image_name" and "embedding".
    dataset_nested = pickle.load(file)

In [20]:
# Keep only the records whose "embedding" entry is non-empty, then split them
# into two parallel sequences: the image names and the first embedding of each
# record. Position n in image_names corresponds to row n of feature_vectors.
records_with_embedding = [
    record for record in dataset_nested if len(record["embedding"]) != 0
]

image_names = [record["image_name"] for record in records_with_embedding]

# Stack the embeddings into one array and cast it to float32.
feature_vectors = np.array(
    [record["embedding"][0] for record in records_with_embedding]
).astype('float32')

In [5]:
# Use the first few dataset vectors as example queries.
# A basic slice is a view that shares memory with feature_vectors, so the
# in-place L2 normalization applied to feature_vectors further down would
# also rewrite the queries. Take an explicit copy to keep the two independent.
n_queries = 4
query_vectors = feature_vectors[:n_queries].copy()
query_names = image_names[:n_queries]
d = query_vectors.shape[1]  # vector dimensionality, used when building the index

In [6]:
!pip install faiss-gpu



# Creando índice

In [7]:
import faiss

In [8]:
# GPU resources object; it is passed to faiss.index_cpu_to_gpu below when the
# index is moved to the GPU.
res = faiss.StandardGpuResources()  # use a single GPU

In [9]:
# L2-normalize the dataset and query vectors so that the inner-product metric
# used by the index below behaves as cosine similarity.
# normalize_L2 is called for its effect on the array passed in (no return
# value is used), so both arrays are modified by these calls.
faiss.normalize_L2(feature_vectors)
faiss.normalize_L2(query_vectors)

In [11]:
## Using an IVF index
nlist = 100  # passed to IndexIVFFlat as its number of inverted lists
quantizer = faiss.IndexFlatIP(d)  # flat inner-product index handed to IndexIVFFlat as its quantizer
index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)

# Move the CPU-side IVF index to the GPU, using the resources in `res` (second argument: 0)
gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf)


In [12]:
if not gpu_index_ivf.is_trained:
  # An IVF index has to be trained on representative vectors before anything
  # can be added to it; this is a property of the IVF index, not of GPU use.
  gpu_index_ivf.train(feature_vectors)

# Vectors are added in the same order as image_names; the positions returned
# by search are used later to index image_names.
gpu_index_ivf.add(feature_vectors)          # add all dataset vectors to the index

# Realizando búsqueda

In [13]:
# Ask the index for the k best matches of every query vector.
# search returns two arrays: D takes the first, I the second. The rows of I
# hold the integer positions that are mapped back to image names further down.
k = 4
D, I = gpu_index_ivf.search(query_vectors, k)

# Show the shape of the result, then the raw index matrix on the next line.
print(I.shape, I, sep="\n")

(4, 4)
[[    0  7329 16872 36584]
 [    1 12010 17530 15878]
 [    2 19800  1144 17921]
 [    3 26632  3765 36271]]


# Devolviendo paths de imágenes

In [14]:
# Map the result indices back to image names.
# `paths` receives one list per query with the names of its neighbors,
# excluding the top-ranked hit.
paths = []
for query_idx, neighbors in enumerate(I):
    print(f"Query Image: {query_names[query_idx]}")
    current = []
    for rank, neighbor_idx in enumerate(neighbors):
        if neighbor_idx < 0:
            # FAISS fills the tail of a result row with -1 when it finds fewer
            # than k neighbors; image_names[-1] would silently return the last
            # image instead, so such slots are reported and skipped.
            print(f"  Rank {rank + 1}: <no neighbor found>")
            continue
        current.append(image_names[neighbor_idx])
        print(f"  Rank {rank + 1}: {image_names[neighbor_idx]} (Index: {neighbor_idx})")
    # The first hit is the query image itself, so it is not counted.
    paths.append(current[1:])
    print()

Query Image: 15970.jpg
  Rank 1: 15970.jpg (Index: 0)
  Rank 2: 20314.jpg (Index: 7329)
  Rank 3: 33060.jpg (Index: 16872)
  Rank 4: 20154.jpg (Index: 36584)

Query Image: 39386.jpg
  Rank 1: 39386.jpg (Index: 1)
  Rank 2: 24755.jpg (Index: 12010)
  Rank 3: 30412.jpg (Index: 17530)
  Rank 4: 58411.jpg (Index: 15878)

Query Image: 59263.jpg
  Rank 1: 59263.jpg (Index: 2)
  Rank 2: 49870.jpg (Index: 19800)
  Rank 3: 37488.jpg (Index: 1144)
  Rank 4: 49871.jpg (Index: 17921)

Query Image: 21379.jpg
  Rank 1: 21379.jpg (Index: 3)
  Rank 2: 21381.jpg (Index: 26632)
  Rank 3: 34202.jpg (Index: 3765)
  Rank 4: 14392.jpg (Index: 36271)



In [15]:
# Display the neighbor file names collected for each query.
paths

[['20314.jpg', '33060.jpg', '20154.jpg'],
 ['24755.jpg', '30412.jpg', '58411.jpg'],
 ['49870.jpg', '37488.jpg', '49871.jpg'],
 ['21381.jpg', '34202.jpg', '14392.jpg']]

# Guardando índice

In [18]:
# Destination files on Drive for the index and for the list of image names.
index_file_path = "/content/drive/MyDrive/dataset/index.faiss"
names_file_path = "/content/drive/MyDrive/dataset/image_names.pkl"

# Convert the GPU index to a CPU index and write that one to disk.
cpu_index = faiss.index_gpu_to_cpu(gpu_index_ivf)
faiss.write_index(cpu_index, index_file_path)

# Persist image_names alongside the index: search results are positions into
# this list, so it is needed to turn them back into file names.
with open(names_file_path, "wb") as file:
  pickle.dump(image_names, file)
