## FAISS Vector Storage

In [2]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

print(f"FAISS version: {faiss.__version__}")

FAISS version: 1.13.1


In [10]:
# Sample documents
documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks.",
    "Data visualization helps communicate insights from complex datasets.",
    "Cloud computing provides on-demand access to computing resources.",
    "Cybersecurity protects systems and networks from digital attacks.",
    "Blockchain technology enables secure, decentralized transactions.",
    "Quantum computing uses quantum mechanics to solve complex problems."
]

In [4]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(documents)

## FAISS Index

In [11]:
dimension = embeddings.shape[1]
#Create FAISS index (IndexFlatL2 = exact search with L2 distance)
index = faiss.IndexFlatL2(dimension)

index.add(embeddings)
print(f"Total vectors in index: {index.ntotal}")

Total vectors in index: 10


## Chroma Vector Database

In [7]:
import chromadb
print(f"Chromadb version: {chromadb.__version__}")

Chromadb version: 1.3.7


In [13]:
#Create Chroma client
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = client.get_or_create_collection(
    name="my_documents",
    metadata={"description": "Sample document collection"}

)
print(collection.name)
print(f"Current count: {collection.count()} documents")

my_documents
Current count: 0 documents


### Add Documents to Chroma

In [15]:

documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks."
]

# metadata for each document
metadatas = [
    {"category": "programming", "topic": "python"},
    {"category": "AI", "topic": "machine learning"},
    {"category": "AI", "topic": "neural networks"},
    {"category": "AI", "topic": "NLP"},
    {"category": "AI", "topic": "deep learning"}
]

ids = [f"doc_{i}" for i in range(len(documents))]

# Add to collection (Chroma handles embedding automatically!)
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids = ids
)

print(f"Total documents: {collection.count()}")

C:\Users\owner\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz:   1%|          | 679k/79.3M [1:58:43<234:43:38, 97.6iB/s]


ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host in add.

### Query Chroma


In [None]:
results = collection.query(
    query_texts = ['What is artificial intelligence?'],
    n_results =3
)

print("Query: What is artificial intelligence?\n")
print("Top 3 results:\n")

for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
,
), 1):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   Document: {doc}")
    print(f"   Metadata: {metadata}")
    print()