In [None]:
# First, install the necessary libraries
!pip install faiss-cpu chromadb numpy

In [None]:
import numpy as np
import time

# --- Setup: Create some data ---
# In a real application, these vectors would come from an embedding model.
# For this lab, we'll just use random data to focus on the databases.
num_vectors = 10000
num_queries = 100
vector_dim = 768  # A common dimension for embedding models
SEED = 42  # Fixed seed so every "Restart & Run All" produces the same vectors

# A dedicated, seeded Generator keeps the data reproducible without touching
# NumPy's global random state.
rng = np.random.default_rng(SEED)

print(f"Generating {num_vectors} random vectors of dimension {vector_dim}...")
# Vectors to be stored in the database. Sampling directly as float32 avoids
# allocating a float64 array first and then copying it into a float32 one.
db_vectors = rng.random((num_vectors, vector_dim), dtype=np.float32)
# A separate set of vectors used to query the database.
query_vectors = rng.random((num_queries, vector_dim), dtype=np.float32)
print("Data generation complete.")

In [None]:
# --- Part 1: In-Process with FAISS ---
print("\n--- Testing FAISS (In-Process) ---")
import faiss

# 1. Build the index
# IndexFlatL2 compares vectors by L2 (Euclidean) distance.
faiss_index = faiss.IndexFlatL2(vector_dim)
print(f"Is the FAISS index trained? {faiss_index.is_trained}")
faiss_index.add(db_vectors)
print(f"Number of vectors in FAISS index: {faiss_index.ntotal}")

# 2. Time the search
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-measurement.
start_time = time.perf_counter()
# k=5 means we want to find the 5 nearest neighbors for each query vector
distances, indices = faiss_index.search(query_vectors, k=5)
end_time = time.perf_counter()
print(f"FAISS search for {len(query_vectors)} vectors took: {end_time - start_time:.4f} seconds.")


In [None]:
# --- Part 2: Client-Server with ChromaDB ---
# NOTE(review): PersistentClient appears to run inside this Python process and
# only persists to disk; a real client-server setup would connect to a separate
# Chroma server (e.g. via chromadb.HttpClient). Confirm the section label.
print("\n--- Testing ChromaDB (Client-Server) ---")
import chromadb

# 1. Set up the client and collection.
# This creates a "persistent" client that saves data to disk in the Colab environment.
client = chromadb.PersistentClient(path="./chroma_db")
# Start from an empty collection on every run: drop any collection left over
# from a previous run, then create it again below.
try:
    client.delete_collection(name="shootout_collection")
    print("Deleted existing collection.")
except Exception:
    # Deleting a collection that does not exist raises; the exact exception
    # class seems to differ between ChromaDB releases (TODO: confirm for the
    # installed version and narrow this further). Unlike a bare `except:`,
    # this still lets KeyboardInterrupt / SystemExit propagate.
    print("No existing collection to delete.")

collection = client.get_or_create_collection(name="shootout_collection")


# 2. Add the data to the collection
# ChromaDB needs a string ID for every vector; the row number is used here.
# In a real app, these might be document IDs.
vector_ids = list(map(str, range(num_vectors)))

# Vectors are inserted in chunks of this many rows; tune it to the available
# memory and to ChromaDB's limits.
batch_size = 1000

# ceil(num_vectors / batch_size), written as a negated floor division
num_batches = -(-num_vectors // batch_size)

print(f"Adding {num_vectors} vectors to ChromaDB in {num_batches} batches (batch size: {batch_size})...")

# Walk the data in strides of batch_size. Slicing clamps at the end of the
# sequence, so the final (possibly shorter) chunk needs no special handling.
for batch_number, start_index in enumerate(range(0, num_vectors, batch_size), start=1):
    end_index = start_index + batch_size
    collection.add(
        ids=vector_ids[start_index:end_index],
        embeddings=db_vectors[start_index:end_index],
    )
    print(f"Batch {batch_number}/{num_batches} added.")


print(f"Number of vectors in ChromaDB collection: {collection.count()}")

# 3. Time the search
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-measurement.
start_time = time.perf_counter()
# ChromaDB's query format is slightly different: the query vectors and the
# number of neighbors per query are passed as keyword arguments.
results = collection.query(
    query_embeddings=query_vectors,
    n_results=5
)
end_time = time.perf_counter()
print(f"ChromaDB search for {len(query_vectors)} vectors took: {end_time - start_time:.4f} seconds.")

# Clean up ChromaDB from disk for a clean slate on next run
# !rm -rf ./chroma_db