In [1]:
%%writefile ../src/embed_index.py
# comment to test overwriting

import numpy as np 
from typing import List, Tuple 
from sentence_transformers import SentenceTransformer 
from sklearn.metrics.pairwise import cosine_similarity

def load_embedding_model(name: str = "sentence-transformers/all-MiniLM-L6-v2") -> SentenceTransformer:
    """Construct the sentence-embedding model identified by ``name``.

    Args:
        name: Model identifier handed unchanged to ``SentenceTransformer``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        The newly constructed ``SentenceTransformer`` instance.
    """
    model = SentenceTransformer(name)
    return model
def build_embeddings(
    texts: List[str],
    model: SentenceTransformer,
    show_progress_bar: bool = True,
) -> np.ndarray:
    """Encode ``texts`` with ``model`` and return the result as a float32 array.

    Args:
        texts: Strings to embed.
        model: Embedding model; only its ``encode`` method is used.
        show_progress_bar: Forwarded to ``model.encode``. Defaults to ``True``,
            which is the value this function previously hard-coded. Pass
            ``False`` for short inputs (for example a single query) to keep
            the output free of progress bars.

    Returns:
        The output of ``model.encode`` converted with
        ``np.asarray(..., dtype="float32")``. Embeddings are requested with
        ``normalize_embeddings=True``.
    """
    vectors = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=show_progress_bar,
    )
    # Force a float32 ndarray regardless of what the model hands back.
    return np.asarray(vectors, dtype="float32")

class VectorIndex:
    """Simple brute-force index that ranks stored vectors by cosine similarity.

    The embeddings are kept in ``self.X`` as a 2-D float32 array with one
    stored vector per row; every search compares the query against all rows.
    """

    def __init__(self, embeddings: np.ndarray):
        """Store ``embeddings`` as a float32 matrix.

        Args:
            embeddings: Array-like convertible to a 2-D float32 array.

        Raises:
            ValueError: If the converted array is not 2-D.
        """
        self.X = np.asarray(embeddings, dtype="float32")
        if self.X.ndim != 2:
            raise ValueError("embeddings must be (n,d)")

    def search(self, q: np.ndarray, top_k: int = 8) -> Tuple[np.ndarray, np.ndarray]:
        """Return the rows of the index most similar to the query.

        Args:
            q: Query vector of shape ``(d,)`` or ``(1, d)``. If a 2-D array
                with several rows is given, only the first row is searched.
            top_k: Maximum number of hits to return; must be >= 0. Fewer hits
                are returned when the index holds fewer than ``top_k`` rows.

        Returns:
            ``(indices, scores)``: row indices into the stored embeddings,
            ordered by descending cosine similarity, and the matching scores.
            Both are empty when the index has no rows.

        Raises:
            ValueError: If ``top_k`` is negative, or if ``q`` is not a
                non-empty ``(d,)`` / ``(m, d)`` array whose width matches the
                stored embeddings.
        """
        # A negative top_k would make the slice below drop the *last* |top_k|
        # hits instead of limiting the result, so reject it explicitly.
        if top_k < 0:
            raise ValueError(f"top_k must be >= 0, got {top_k}")

        q = np.asarray(q, dtype="float32")
        if q.ndim == 1:
            q = q[None, :]  # (d,) -> (1, d)
        if q.ndim != 2 or q.shape[0] == 0:
            raise ValueError("q must be (d,) or (1,d)")
        if q.shape[1] != self.X.shape[1]:
            raise ValueError(
                f"q has dimension {q.shape[1]} but the index stores "
                f"vectors of dimension {self.X.shape[1]}"
            )

        # Nothing to rank: answer with empty results instead of handing an
        # empty matrix to cosine_similarity.
        if self.X.shape[0] == 0:
            return np.empty(0, dtype=np.intp), np.empty(0, dtype="float32")

        sims = cosine_similarity(q, self.X)[0]  # similarity of q[0] to every row
        # argsort is ascending, so sort the negated scores; a stable sort keeps
        # tied rows in index order, which makes the ranking deterministic.
        idx = np.argsort(-sims, kind="stable")[:top_k]
        return idx, sims[idx]

Overwriting ../src/embed_index.py


Testing the code

In [12]:
# 0) Make the module written by the %%writefile cell importable.
# %%writefile only saves ../src/embed_index.py to disk; it does not define
# anything in the kernel, so on a fresh kernel the names must be imported.
# The relative path assumes the same working directory as the %%writefile cell.
import sys

if "../src" not in sys.path:
    sys.path.insert(0, "../src")

from embed_index import VectorIndex, build_embeddings, load_embedding_model

# 1) Example texts
texts = [
    "El gato se sentó en la alfombra.",
    "Un felino descansando sobre una alfombra suave.",
    "Hoy lloverá en Monterrey según el pronóstico.",
    "Recetas fáciles de pollo a la parrilla."
]

# 2) Load model and build embeddings
m = load_embedding_model()
X = build_embeddings(texts, m)  # one row per text: (n, d)
print(X.shape)
print(X)

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.12it/s]

(4, 384)
[[ 0.0019667   0.04314606 -0.03825914 ...  0.09706439 -0.01647028
   0.00430816]
 [-0.00097454  0.04636284 -0.06252134 ...  0.08759326  0.02048792
  -0.01915114]
 [ 0.02842838  0.04052663 -0.00934935 ...  0.06045973  0.07326677
   0.03709169]
 [-0.05502563 -0.00187894 -0.02362817 ...  0.10947131  0.01036286
  -0.01954292]]





In [13]:
# 3) Wrap the corpus embeddings in a searchable index
index = VectorIndex(X)

# 4) Embed the query with the same model used for the corpus
query = ["Felino sobre una alfombra"]
q = build_embeddings(query, m)  # (1, d)

# 5) Retrieve the three closest texts and list them best-first
I, S = index.search(q, top_k=3)
print("Query:", query)
ranked_hits = zip(I, S)  # (corpus row, similarity score) pairs
for rank, (i, s) in enumerate(ranked_hits, start=1):
    print(f"{rank}. {texts[i]} (score={s: .3f})")

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 35.46it/s]

Query: ['Felino sobre una alfombra']
1. Un felino descansando sobre una alfombra suave. (score= 0.867)
2. El gato se sentó en la alfombra. (score= 0.615)
3. Recetas fáciles de pollo a la parrilla. (score= 0.439)



