In [1]:
!pip3 install sentence-transformers annoy


Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl (340 kB)
[K     |████████████████████████████████| 340 kB 12.5 MB/s eta 0:00:01
[?25hCollecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[K     |████████████████████████████████| 647 kB 10.8 MB/s eta 0:00:01
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.50.2-py3-none-any.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 17.5 MB/s eta 0:00:01
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 9.7 MB/s  eta 0:00:01
Collecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 13.8 MB/s eta 0:00:01
Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 13.8 M

In [12]:
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import numpy as np

# Step 1: Initialize Sentence-BERT model for embedding generation
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Example model, can be changed

# Sample documents and a refined query
documents = [
            "This is the first document about artificial intelligence.",
            "Machine learning is a subset of artificial intelligence.",
            "Deep learning models are powerful for many tasks.",
            "Natural language processing focuses on the interaction between computers and human languages.",
            "Reinforcement learning is a type of machine learning where agents learn to make decisions.",
            "this document is talking about how Mohammed is awsome, and cool.",
            "this document is mentioning that mohammed does want to get a sleep"
        ]

refined_query = str(input("please enter you query?"))

# Step 2: Create vectors for documents and the refined query
document_vectors = model.encode(documents)  # Get embeddings for documents

query_vector = model.encode([refined_query])[0]  # Get embedding for the query

# Step 3: Initialize Annoy index
vector_dimension = len(document_vectors[0])  # Dimension of the vector
annoy_index = AnnoyIndex(vector_dimension, 'angular')  # Use angular distance (cosine similarity)


# Step 4: Add document vectors to Annoy index
for i, vector in enumerate(document_vectors):
    annoy_index.add_item(i, vector)

# Build the Annoy index (this is an offline process)
annoy_index.build(10)  # 10 trees for speed/accuracy trade-off
print(annoy_index)
print(100*"-")

# Step 5: Query the Annoy index to find the most similar document
nearest_neighbors = annoy_index.get_nns_by_vector(query_vector, 2, include_distances=True)

# Step 6: Output the results
print(f"Query: {refined_query}")
print("\nMost similar documents:")
for idx, dist in zip(nearest_neighbors[0], nearest_neighbors[1]):
    print(f"Document: {documents[idx]} (Distance: {dist})")


<annoy.Annoy object at 0x13e44b410>
----------------------------------------------------------------------------------------------------
Query: who is Mohammed 

Most similar documents:
Document: this document is talking about how Mohammed is awsome, and cool. (Distance: 0.7285750508308411)
Document: this document is mentioning that mohammed does want to get a sleep (Distance: 0.8530937433242798)


In [11]:
import unittest
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex

class TestAnnoySearchSystem(unittest.TestCase):
    
    @classmethod
    def setUpClass(cls):
        # Set up the model and sample documents
        cls.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        cls.documents = [
            "This is the first document about artificial intelligence.",
            "Machine learning is a subset of artificial intelligence.",
            "Deep learning models are powerful for many tasks.",
            "Natural language processing focuses on the interaction between computers and human languages.",
            "Reinforcement learning is a type of machine learning where agents learn to make decisions.",
            "this document is talking about how Mohammed is awsome, and cool.",
            "this document is mentioning that mohammed does want to get a sleep"
        ]
        cls.refined_query = ""

        # Encode documents and the query
        cls.document_vectors = cls.model.encode(cls.documents)
        cls.query_vector = cls.model.encode([cls.refined_query])[0]

        # Initialize Annoy index
        cls.vector_dimension = len(cls.document_vectors[0])
        cls.annoy_index = AnnoyIndex(cls.vector_dimension, 'angular')

        # Add document vectors to Annoy index
        for i, vector in enumerate(cls.document_vectors):
            cls.annoy_index.add_item(i, vector)

        # Build the index
        cls.annoy_index.build(10)

    def test_document_embedding_generation(self):
        # Test that document vectors are generated properly
        self.assertEqual(len(self.document_vectors), len(self.documents))
        self.assertEqual(len(self.document_vectors[0]), self.vector_dimension)
    
    def test_query_embedding_generation(self):
        # Test that query vector is generated properly
        self.assertEqual(len(self.query_vector), self.vector_dimension)

    def test_annoy_index_creation(self):
        # Test that the Annoy index is built correctly by checking the number of items
        self.assertEqual(self.annoy_index.get_n_items(), len(self.documents))

    def test_similarity_search(self):
        # Perform a similarity search for the query and check if results make sense
        nearest_neighbors = self.annoy_index.get_nns_by_vector(self.query_vector, 3, include_distances=True)
        
        # Assert that we get exactly 3 nearest neighbors
        self.assertEqual(len(nearest_neighbors[0]), 3)

        # Check that the first result is the most similar document (should be related to reinforcement learning)
        expected_document = "Reinforcement learning is a type of machine learning where agents learn to make decisions."
        closest_doc_idx = nearest_neighbors[0][0]
        self.assertEqual(self.documents[closest_doc_idx], expected_document)

    def test_cosine_similarity_behavior(self):
        # Check that the cosine similarity is reasonable (distance should be small for similar documents)
        nearest_neighbors = self.annoy_index.get_nns_by_vector(self.query_vector, 3, include_distances=True)
        
        for dist in nearest_neighbors[1]:
            # Cosine distance should be between 0 (similar) and 2 (opposite direction)
            self.assertGreaterEqual(dist, 0)
            self.assertLessEqual(dist, 2)

if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)


.........F
FAIL: test_similarity_search (__main__.TestAnnoySearchSystem)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/var/folders/yp/tlgq8_fj0418jxt8_9b3rw_80000gn/T/ipykernel_33939/566361203.py", line 60, in test_similarity_search
    self.assertEqual(self.documents[closest_doc_idx], expected_document)
AssertionError: 'this document is talking about how Mohamm[19 chars]ool.' != 'Reinforcement learning is a type of machi[45 chars]ons.'
- this document is talking about how Mohammed is awsome, and cool.
+ Reinforcement learning is a type of machine learning where agents learn to make decisions.


----------------------------------------------------------------------
Ran 10 tests in 9.809s

FAILED (failures=1)


In [15]:
!pip3 install pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 11.0 MB/s eta 0:00:01
[?25hCollecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
[K     |████████████████████████████████| 509 kB 14.1 MB/s eta 0:00:01
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
[K     |████████████████████████████████| 347 kB 20.4 MB/s eta 0:00:01
Installing collected packages: tzdata, pytz, pandas
Successfully installed pandas-2.2.3 pytz-2025.2 tzdata-2025.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
