In [1]:
import os
import faiss
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('.'))))  # /src

from semantic_search.misc import LocalEmbeddingModel, FAISSDocumentStore

  from .autonotebook import tqdm as notebook_tqdm


### Create DB

In [4]:
# Re-execute semantic_search.misc in the running kernel so that edits made to the
# module are visible without restarting.
from importlib import reload
from semantic_search import misc
reload(misc)
# Import again after the reload: the names bound by the import cell at the top of
# the notebook still point at the objects created by the previous module execution.
from semantic_search.misc import LocalEmbeddingModel, FAISSDocumentStore

In [7]:
# Use the Apple-silicon MPS backend when PyTorch reports it as available and fall
# back to the CPU otherwise, so the cell does not fail on machines without MPS.
# NOTE(review): assumes LocalEmbeddingModel accepts 'cpu' as a device string — confirm.
EMBEDDING_DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'

# Directory with the converted CVPR 2024 text files. The default is the original
# author's local path; set the CVPR_TXT_DIR environment variable to point the
# notebook at another copy of the data instead of editing this cell.
CVPR_TXT_DIR = os.environ.get(
    'CVPR_TXT_DIR',
    '/Users/luis/Desktop/ETH/Courses/SS25-DSL/raw-data/challenge10_batch_1/CVPR_2024/Conversions/opencvf-data/txt',
)

embedding_model = LocalEmbeddingModel(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    device=EMBEDDING_DEVICE,
    batch_size=8
)
document_store = FAISSDocumentStore(embedding_model)

# Long-running step: the recorded run reported 73809 chunks to embed.
document_store.create_index(CVPR_TXT_DIR)

Processing 1142 documents...


100%|██████████| 1142/1142 [00:00<00:00, 3451.46it/s]


Generating embeddings for 73809 chunks...


Generating embeddings:  14%|█▎        | 1268/9227 [04:44<29:47,  4.45it/s]  


KeyboardInterrupt: 

### Earlier experiments (legacy cells)

In [None]:
# Build the embedding model with its default arguments and wrap it in a document store
embedding_model = LocalEmbeddingModel()
document_store = FAISSDocumentStore(embedding_model)

# Directory holding the sample documents
data_dir = "example_data"

# Reuse the index file when it is already on disk; otherwise build it from data_dir
index_file = f"{document_store.index_path}.faiss"
if os.path.exists(index_file):
    document_store.load_index()
else:
    document_store.create_index(data_dir)

# Example search function
def search_documents(query, top_k=5, store=None):
    """Run a search against a document store and print a summary of each hit.

    Args:
        query: Text to search for.
        top_k: Number of results requested from the store.
        store: Object exposing ``search(query, top_k=...)``. When omitted, the
            notebook-level ``document_store`` is used, so existing calls keep
            working unchanged.

    Returns:
        The results exactly as returned by ``store.search``.
    """
    if store is None:
        # Resolved at call time so the function follows whichever store the
        # notebook currently has bound to this name.
        store = document_store
    results = store.search(query, top_k=top_k)
    print(f"Search results for: '{query}'")
    for result in results:
        print(f"Rank {result['rank']} (Score: {result['score']:.4f})")
        print(f"Document: {result['document_name']}")
        print(f"Preview: {result['chunk_text']}")
        print("-" * 80)
    return results

# Try the helper on a sample query and keep the hits for later inspection
example_query = "visual odometry in robotics applications"
search_results = search_documents(example_query, top_k=3)


Loaded index with 41 vectors
Search results for: 'visual odometry in robotics applications'
Rank 1 (Score: 0.6195)
Document: Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt
Preview: omain (real data) while achieving state-of-the-art performance on the KITTI dataset.

## 1. Introduction

Visual odometry (VO) is a crucial aspect of robotics that enables machines to measure the ego-...
--------------------------------------------------------------------------------
Rank 2 (Score: 0.5684)
Document: Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt
Preview: This CVPR Workshop paper is the Open Access version, provided by the Computer Vision Foundation.

Except for this watermark, it is identical to the accepted version;

the final published version of th...
--------------------------------------------------------------------------------
Rank 3 (Score: 0.5650)
Document: Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.

In [None]:
# Query built by repeating the sample phrase 100000 times.
# NOTE(review): presumably a check of how search copes with very long input — confirm.
QUERY_REPEATS = 100000
oversized_query = "visual odometry in robotics applications" * QUERY_REPEATS
document_store.search(oversized_query, top_k=3)

[{'rank': 1,
  'score': np.float32(0.506165),
  'document_id': 0,
  'document_name': 'Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'document_path': 'example_data/Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'chunk_text': 'omain (real data) while achieving state-of-the-art performance on the KITTI dataset.\n\n## 1. Introduction\n\nVisual odometry (VO) is a crucial aspect of robotics that enables machines to measure the ego-...'},
 {'rank': 2,
  'score': np.float32(0.47257516),
  'document_id': 0,
  'document_name': 'Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'document_path': 'example_data/Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'chunk_text': 'ons on robotics , 33(5):1255-1262, 2017. 2, 6\n- [14] David Nist´ er, Oleg Naroditsky, and James Bergen. Visual\n\n- odometry. In Proceedings of the 2004 IEEE Computer Society Conference on Computer Visi...'

In [6]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model output whose element 0 holds the token embeddings.
        attention_mask: Mask tensor; it is given a trailing axis and expanded to the
            shape of the token embeddings before use.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum over
        dimension 1, with the divisor clamped to at least 1e-9.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    # Clamp so a row whose mask is all zeros does not divide by zero
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / mask_total


In [32]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Use MPS only when PyTorch reports it as available; otherwise run on the CPU so
# the cell also works on machines without an Apple-silicon GPU.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

# One very short text and one repeated 1000 times; with truncation and
# max_length=512 the recorded input_ids shape for this batch is [2, 512].
batch_texts = ["visual", "Hello my name is Albert"*1000]
tokenizer_out = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

with torch.no_grad():
    # Move tokenizer output to the same device as the model
    encoded_input = {k: v.to(model.device) for k, v in tokenizer_out.items()}
    model_output = model(**encoded_input)

# Perform pooling
batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalization is left disabled.
# NOTE(review): presumably so the raw norms can be inspected in the following cell — confirm.
# batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)


# Double-quoted f-strings: reusing the same quote character inside a replacement
# field is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"{tokenizer_out['input_ids'].shape=}")
print(f"{model_output.pooler_output.shape=}; {model_output.last_hidden_state.shape=}")
print(f"{batch_embeddings.shape=}")


tokenizer_out['input_ids'].shape=torch.Size([2, 512])
model_output.pooler_output.shape=torch.Size([2, 384]); model_output.last_hidden_state.shape=torch.Size([2, 512, 384])
batch_embeddings.shape=torch.Size([2, 384])


In [33]:
# L2 norm of each row: mean-pooled embeddings first, the model's pooler output second
mean_pooled_norms = batch_embeddings.norm(p=2, dim=1)
pooler_output_norms = model_output.pooler_output.norm(p=2, dim=1)
mean_pooled_norms, pooler_output_norms

(tensor([6.9925, 2.1388], device='mps:0'),
 tensor([1.2345, 1.3589], device='mps:0'))

In [27]:
# Element-wise gap between the model's pooler output and the mean-pooled embeddings
pooler_vectors = model_output.pooler_output
pooler_vectors - batch_embeddings

tensor([[-1.9884e-01,  4.7030e-01,  7.4832e-02,  3.7633e-02, -1.7654e-01,
          2.6234e-03, -4.2638e-01, -2.6524e-01, -2.6501e-01,  9.0265e-02,
         -3.4190e-01,  7.7723e-01,  1.7366e-01,  4.8052e-02,  2.9466e-01,
         -2.6326e-01, -1.1632e-01, -5.8603e-02,  2.2780e-01,  7.3592e-02,
          3.2930e-01,  1.0823e-01,  2.7045e-01,  1.8069e-01, -2.0760e-01,
         -4.9486e-01,  6.4207e-02, -4.7424e-01, -4.7247e-01,  9.7740e-01,
          2.4827e-01, -2.2980e-01, -4.7967e-01, -3.5014e-01,  5.8310e-01,
         -1.6662e-01, -1.4073e-01, -1.6845e-01,  3.4730e-01,  1.9024e-01,
          6.7369e-01,  1.2889e-01, -1.7597e-01,  4.0943e-01, -1.5266e-01,
          4.6617e-02, -2.3934e-01,  2.7047e-01,  1.4761e-01, -5.4200e-02,
          6.3219e-01,  5.4383e-01,  4.8124e-01,  1.5749e-01, -2.4815e-01,
         -3.2090e-01, -3.8481e-01,  3.4047e-01, -7.2322e-03, -1.9430e-01,
         -7.1731e-01, -2.0531e-01,  1.9530e-01, -2.4815e-01, -1.7804e-01,
         -3.7555e-02, -7.6236e-02,  1.

In [15]:
# Display the raw tokenizer output (input_ids, token_type_ids, attention_mask).
# NOTE(review): the recorded output has 7 tokens per row, so it appears to come
# from an earlier run with a shorter second text — re-run to refresh.
tokenizer_out

{'input_ids': tensor([[ 101, 5107,  102,    0,    0,    0,    0],
        [ 101, 7592, 2026, 2171, 2003, 4789,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Read the chunk store table from the current working directory
chunk_store_file = "chunk_store.parquet"
df = pd.read_parquet(chunk_store_file)

In [22]:
# Open the saved FAISS index file directly, without going through FAISSDocumentStore
faiss_index = faiss.read_index("faiss_document_index.faiss")

# Vector count and dimensionality as reported by the index object
num_vectors, dimension = faiss_index.ntotal, faiss_index.d

print(
    "FAISS Index Information:",
    f"Number of vectors: {num_vectors}",
    f"Vector dimension: {dimension}",
    f"Index type: {type(faiss_index).__name__}",
    sep="\n",
)

# Best-effort look at the chunk metadata stored next to the index: any failure
# while reading or selecting columns is reported instead of stopping the cell.
try:
    metadata_df = pd.read_parquet("chunk_store.parquet")
    print("\nMetadata from chunk store:")
    id_preview = metadata_df[["doc_id", "chunk_id"]].head()
    print(id_preview)
except Exception as e:
    print(f"Could not load metadata: {e!s}")

# Closing hint: this cell does not read the stored vectors themselves
print(
    "\nNote: Direct vector access requires custom implementation",
    "Consider using document_store.search() with specific queries to retrieve vectors",
    sep="\n",
)

FAISS Index Information:
Number of vectors: 41
Vector dimension: 384
Index type: IndexIDMap

Document Metadata:
Metadata not directly accessible from raw FAISS index

Metadata from chunk store:
   doc_id  chunk_id
0       0         0
1       0         1
2       0         2
3       0         3
4       0         4

Note: Direct vector access requires custom implementation
Consider using document_store.search() with specific queries to retrieve vectors
