In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import torch
import faiss
import numpy as np
import logging
from transformers import (
    DPRQuestionEncoder, DPRContextEncoder,
    DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer
)

# Keep the notebook output readable: hide the "unused weights" warnings that
# transformers emits while loading checkpoints.
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Hub checkpoint ids for the two DPR towers; each encoder and its tokenizer
# are loaded from the same id.
QUESTION_CHECKPOINT = "facebook/dpr-question_encoder-single-nq-base"
CONTEXT_CHECKPOINT = "facebook/dpr-ctx_encoder-single-nq-base"

# Encoders: one embeds questions, the other embeds passages.
question_encoder = DPRQuestionEncoder.from_pretrained(QUESTION_CHECKPOINT)
context_encoder = DPRContextEncoder.from_pretrained(CONTEXT_CHECKPOINT)

# Matching tokenizers for each encoder.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(QUESTION_CHECKPOINT)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(CONTEXT_CHECKPOINT)

# Minimal DPR retrieval demo: embed one question and a handful of passages,
# index the passage embeddings with FAISS, and print the best-matching passage.
query = "What is Dense Passage Retrieval?"
passages = [
    "Dense Passage Retrieval (DPR) is a neural retrieval method.",
    "BM25 is a traditional keyword-based retrieval method."
]

# Inference only: disabling autograd avoids building a graph for every forward
# pass (and removes the need for .detach() before converting to NumPy).
with torch.no_grad():
    query_inputs = question_tokenizer(query, return_tensors="pt")
    query_output = question_encoder(**query_inputs).pooler_output

    # Encode all passages in a single padded batch instead of one forward pass
    # per passage; truncation guards against passages longer than the model limit.
    passage_inputs = context_tokenizer(
        passages, padding=True, truncation=True, return_tensors="pt"
    )
    passage_output = context_encoder(**passage_inputs).pooler_output

# FAISS expects C-contiguous float32 matrices of shape (n_vectors, dim).
query_embedding = np.ascontiguousarray(query_output.numpy(), dtype=np.float32)
passage_embeddings = np.ascontiguousarray(passage_output.numpy(), dtype=np.float32)

# DPR is trained to rank passages by the dot product between question and
# passage embeddings, so use an inner-product index rather than L2 distance.
index = faiss.IndexFlatIP(passage_embeddings.shape[1])
index.add(passage_embeddings)

# Search for the best-scoring passage.
# D holds inner-product scores (higher is better); I holds passage row indices.
D, I = index.search(query_embedding, k=1)
print(f"Most relevant passage: {passages[I[0][0]]}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Most relevant passage: Dense Passage Retrieval (DPR) is a neural retrieval method.


In [None]:
import faiss
import numpy as np

# Toy dataset: nb random d-dimensional float32 vectors, seeded for repeatability
d, nb = 128, 1000
np.random.seed(42)
data = np.random.random((nb, d)).astype(np.float32)

# IndexFlatL2 stores the raw vectors and compares against all of them,
# so the search below is exact (brute-force) rather than approximate.
index = faiss.IndexFlatL2(d)
index.add(data)

# A single random query vector, drawn from the same seeded stream as the data
query = np.random.random((1, d)).astype(np.float32)

# Look up the k closest vectors; results come back as (distances, row indices)
k = 5
distances, indices = index.search(query, k)

print("Nearest Neighbors:", indices)
print("Distances:", distances)

Nearest Neighbors: [[468 771  12 475 284]]
Distances: [[15.351301 16.348877 16.365719 16.400562 16.520393]]
