In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
from collections import Counter
import math

# Toy corpus: ten short English sentences ranked by every retrieval method below.
documents = [
    "The cat sits on the mat.",
    "The dog plays in the yard.",
    "A fox jumps over the fence.",
    "The quick brown fox jumps over the lazy dog.",
    "A bird sings in the morning.",
    "The sun rises in the east and sets in the west.",
    "The river flows through the valley.",
    "A fisherman catches fish by the lake.",
    "The stars shine brightly in the night sky.",
    "A child laughs with joy."
]

# Search query shared by all methods; its three words appear verbatim in the
# two "fox" sentences of the corpus.
query = "fox jumps over"

BM25: lexical keyword ranking


In [3]:
def compute_idf(corpus):
    """Build a smoothed inverse-document-frequency table for a corpus.

    Documents are tokenised with ``str.split()`` only, so terms are
    case-sensitive and keep any punctuation attached to them.

    Args:
        corpus: Sized collection of document strings.

    Returns:
        dict mapping each term seen in the corpus to
        ``log((N - df + 0.5) / (df + 0.5) + 1)``, where ``N`` is the number
        of documents and ``df`` the number of documents containing the term.
    """
    total_docs = len(corpus)

    # set() de-duplicates per document, so a term repeated inside one
    # document still contributes only 1 to its document frequency.
    doc_freq = Counter(
        token for text in corpus for token in set(text.split())
    )

    def _idf(n_containing):
        return math.log((total_docs - n_containing + 0.5) / (n_containing + 0.5) + 1)

    return {token: _idf(count) for token, count in doc_freq.items()}

In [4]:
def bm25_score(query, doc, idf, avgdl, k1=1.5, b=0.75):
    """Score one document against a query with the BM25 formula.

    Both strings are tokenised with ``str.split()``; matching is exact
    (case- and punctuation-sensitive).

    Args:
        query: Query string; a term repeated in the query is counted each time.
        doc: Document string to score.
        idf: Mapping of term -> IDF weight; terms missing from it weigh 0.
        avgdl: Average document length (in tokens) used for normalisation.
        k1: Term-frequency saturation parameter.
        b: Strength of the document-length normalisation.

    Returns:
        The accumulated score; ``0`` when no query term occurs in ``doc``.
    """
    tokens = doc.split()
    term_counts = Counter(tokens)

    total = 0
    for term in query.split():
        freq = term_counts.get(term, 0)
        if freq == 0:
            # Term absent from this document: contributes nothing.
            continue
        # The division by avgdl is evaluated only for matching terms, so a
        # document with no matches never touches avgdl.
        numerator = freq * (k1 + 1)
        denominator = freq + k1 * (1 - b + b * len(tokens) / avgdl)
        total += idf.get(term, 0) * (numerator / denominator)
    return total

In [7]:
# Corpus-level statistics required by BM25: per-term IDF and mean token count.
idf = compute_idf(documents)
avgdl = sum(map(len, map(str.split, documents))) / len(documents)

# One score per document, in corpus order.
bm25_scores = [bm25_score(query, text, idf, avgdl) for text in documents]

# Highest scores first. Every document is ranked, so documents scoring 0 can
# still appear in the top 3 when fewer than three documents match the query.
bm25_top3 = np.flip(np.argsort(bm25_scores))[:3]
print("BM25 Top 3:", [documents[rank] for rank in bm25_top3])

BM25 Top 3: ['A fox jumps over the fence.', 'The quick brown fox jumps over the lazy dog.', 'A child laughs with joy.']


ColBERT (approximated here with single-vector sentence embeddings, not token-level late interaction)


In [8]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode the corpus and the query as NumPy arrays.
doc_embeddings = model.encode(documents, convert_to_numpy=True)
query_embedding = model.encode(query, convert_to_numpy=True)
# NOTE(review): labelled "ColBERT", but the code scores each document with a
# single dot product against one query embedding; no token-level late
# interaction is performed here.
# Raw dot product is used as the similarity; this equals cosine similarity only
# if the embeddings are unit-length -- presumably true for this model, confirm.
similarities = np.dot(doc_embeddings, query_embedding)
# argsort is ascending, so reverse it to put the most similar documents first.
colbert_top3 = np.argsort(similarities)[::-1][:3]
print("ColBERT Approx Top 3:", [documents[i] for i in colbert_top3])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ColBERT Approx Top 3: ['A fox jumps over the fence.', 'The quick brown fox jumps over the lazy dog.', 'The dog plays in the yard.']


FAISS-style vector search (re-implemented with NumPy)

In [9]:
# NOTE(review): these three statements repeat the ColBERT cell verbatim, so the
# model is loaded and the corpus/query re-encoded a second time. Kept so this
# section can run on its own; the results overwrite the same variable names.
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model.encode(documents, convert_to_numpy=True)
query_embedding = model.encode(query, convert_to_numpy=True)

Flat index: exhaustive L2 search over every document

In [10]:
def l2_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vec1: First vector (NumPy array).
        vec2: Second vector, broadcast-compatible with ``vec1``.

    Returns:
        ``np.linalg.norm`` of the element-wise difference.
    """
    difference = vec1 - vec2
    return np.linalg.norm(difference)

# Exhaustive ("flat") search: measure the query against every document vector.
distances = [l2_distance(doc_vec, query_embedding) for doc_vec in doc_embeddings]
# argsort is ascending, so the first three positions are the nearest documents.
faiss_flat_top3 = np.argsort(distances)[:3]
print("FAISS Flat Top 3:", [documents[idx] for idx in faiss_flat_top3])

FAISS Flat Top 3: ['A fox jumps over the fence.', 'The quick brown fox jumps over the lazy dog.', 'The dog plays in the yard.']


IVF index: assign documents to clusters, then search only the query's cluster

In [19]:
# Number of inverted lists (clusters) in the toy IVF index.
nlist = 4
# One empty bucket per cluster; each is later filled with
# (document index, embedding) pairs.
clusters = [[] for _ in range(nlist)]
# Pick nlist distinct documents to act as cluster centroids. A fixed seed keeps
# the centroids -- and therefore the cluster assignment and the IVF results --
# identical across Restart & Run All; an unseeded draw changes on every run.
centroids = np.random.default_rng(42).choice(len(doc_embeddings), size=nlist, replace=False)
centroid_vectors = doc_embeddings[centroids]

In [20]:
def assign_to_cluster(embedding):
    """Return the index of the centroid nearest to ``embedding``.

    Distances are measured with ``l2_distance`` against every row of the
    module-level ``centroid_vectors``; ties resolve to the lowest index.
    """
    centroid_distances = [
        l2_distance(embedding, centroid) for centroid in centroid_vectors
    ]
    return np.argmin(centroid_distances)

# Rebuild the buckets from scratch before filling them. Without this reset,
# re-running the cell appends every document to its bucket a second time, and
# the IVF search can then return the same document more than once.
clusters = [[] for _ in range(len(centroid_vectors))]
for i, embedding in enumerate(doc_embeddings):
    # Each document goes into the bucket of its nearest centroid, stored
    # together with its corpus index so results can be mapped back later.
    cluster_idx = assign_to_cluster(embedding)
    clusters[cluster_idx].append((i, embedding))

In [21]:
# Probe only the single cluster whose centroid is nearest to the query.
query_cluster = assign_to_cluster(query_embedding)
# Corpus indices of the documents stored in that cluster, in bucket order.
indices = [doc_id for doc_id, _ in clusters[query_cluster]]
# Distance from the query to each of those documents (same order as `indices`).
distances = [l2_distance(vec, query_embedding) for _, vec in clusters[query_cluster]]
# Map the nearest bucket positions back to corpus indices; fewer than three
# results are returned when the probed cluster holds fewer than three documents.
faiss_ivf_top3 = [indices[pos] for pos in np.argsort(distances)[:3]]
print("FAISS IVF Top 3:", [documents[doc_id] for doc_id in faiss_ivf_top3])

FAISS IVF Top 3: ['A fox jumps over the fence.', 'The quick brown fox jumps over the lazy dog.', 'The dog plays in the yard.']
