<a href="https://colab.research.google.com/github/NITHESH2303/Gen-AI-Intensive-Course/blob/main/Embeddings_%26_Vector_Stores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install beir
%pip install faiss-cpu

Collecting beir
  Downloading beir-2.2.0-py3-none-any.whl.metadata (28 kB)
Collecting sentence-transformers (from beir)
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting pytrec-eval-terrier (from beir)
  Downloading pytrec_eval_terrier-0.5.10-cp313-cp313-macosx_10_13_universal2.whl.metadata (1.1 kB)
Collecting datasets (from beir)
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets->beir)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting numpy>=1.17 (from datasets->beir)
  Downloading numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pyarrow>=21.0.0 (from datasets->beir)
  Downloading pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets->beir)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets->beir)
  Using cached pandas-2.3.3-cp313-cp313-macosx_11_0_arm6

In [55]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
import faiss
import vertexai
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import pytrec_eval

In [56]:
def embedText(texts, model, task, batch_size=5):
  if not texts:
    return np.array([])

  # Get the embedding dimension dynamically from the first embedding
  if isinstance(model, SentenceTransformer):
    dummy_embedding = model.encode([texts[0]])
    embedding_dim = dummy_embedding.shape[1]
  else:
    # Assume Vertex AI TextEmbeddingModel
    inputs = [TextEmbeddingInput(texts[0], task_type=task)]
    dummy_embedding = model.get_embeddings(inputs)
    embedding_dim = len(dummy_embedding[0].values)

  embed_mat = np.zeros((len(texts), embedding_dim))

  for batch_start in range(0, len(texts), batch_size):
    size = min(len(texts) - batch_start, batch_size)
    batch_texts = texts[batch_start:batch_start+size]

    # Check if the model is a SentenceTransformer or Vertex AI model
    if isinstance(model, SentenceTransformer):
      embeddings = model.encode(batch_texts)
    else:
      # Assume Vertex AI TextEmbeddingModel
      inputs = [TextEmbeddingInput(text, task_type=task) for text in batch_texts]
      embeddings = model.get_embeddings(inputs)
      embeddings = np.array([e.values for e in embeddings])

    for i in range(size):
      embed_mat[batch_start+i] = embeddings[i]
  return embed_mat

In [57]:
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip"
data_path = util.download_and_unzip(url, "datasets")
# Corpus of text chunks, text queries and “gold” set of query to relevant documents dict
corpus, queries, qrels = GenericDataLoader(data_folder="datasets/nfcorpus").load(split="test")

  0%|          | 0/3633 [00:00<?, ?it/s]

In [58]:
import google.colab.auth
google.colab.auth.authenticate_user()

In [59]:
vertexai.init(project="aerobic-amphora-479502-i7", location="asia-south1")
# model = TextEmbeddingModel.from_pretrained("text-embedding-005")
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_ids, docs = zip(*[(doc_id, doc['text']) for doc_id, doc in corpus.items()])
q_ids, questions = zip(*[(q_id, q) for q_id, q in queries.items()])

# Embed the documents and queries jointly using different models

In [60]:
doc_embeddings = embedText(docs[:100], model, "RETRIEVAL_DOCUMENT")
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

In [61]:
test_embed = embedText(["is Water Good for hydration?", "when the sun rises"], model, "RETRIEVAL_QUERY")
s,q = index.search(test_embed, 2)
print(f'Score: {s[0][0]:.2f}, Text: "{docs[q[0][0]]}"')

Score: 1.43, Text: "AIM OF THE STUDY: The Roselle (Hibiscus sabdariffa) was investigated for its uricosuric effect. MATERIALS AND METHODS: A human model with nine subjects with no history of renal stones (non-renal stone, NS) and nine with a history of renal stones (RS) was used in this study. A cup of tea made from 1.5 g of dry Roselle calyces was provided to subjects twice daily (morning and evening) for 15 days. A clotted blood and two consecutive 24-h urine samples were collected from each subject three times: (1) at baseline (control); (2) on days 14 and 15 during the tea drinking period; and (3) 15 days after the tea drinking was stopped (washout). Serum and 24-h urinary samples were analyzed for uric acid and other chemical compositions related to urinary stone risk factors. RESULTS: All analyzed serum parameters were within normal ranges and similar; between the two groups of subjects and among the three periods. Vis-à-vis the urinary parameters, most of the baseline values for

In [62]:
query_embeddings = embedText(questions, model, "RETRIEVAL_QUERY")
q_scores, q_doc_ids = index.search(query_embeddings, 10)

In [63]:
# Create a dict of query to document scores dict for pytrec evaluation
# Multiply scores by -1 for sorting as smaller distance is better score for pytrec eval

search_qrels = { q_ids[i] : { doc_ids[_id] : -1*s.item() for _id, s in zip(q_doc_ids[i], q_scores[i])} for i in range(len(q_ids))}
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg_cut.10','P_1','recall_10'})
eval_results = evaluator.evaluate(search_qrels)


In [64]:
df = pd.DataFrame.from_dict(eval_results, orient='index')
df.mean()

Unnamed: 0,0
P_1,0.037152
recall_10,0.008154
ndcg_cut_10,0.017675
