# 🚀 ChromaDB Performance Benchmark

Comprehensive performance testing for ChromaDB across multiple dimensions and operations.

This notebook will test:
- Vector dimensions: 2 to 2048
- Vector counts: 1K to 100K (the largest counts are skipped for dimensions of 1024 and above)
- All CRUD operations
- Search performance and quality

In [None]:
# 📦 Install dependencies
!pip install -q chromadb numpy pandas matplotlib

In [None]:
# 📥 Import libraries
import chromadb
from chromadb.config import Settings
import numpy as np
import time
import json
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass, asdict
import pandas as pd
from datetime import datetime
import gc
import matplotlib.pyplot as plt

print("✅ All imports successful!")

In [None]:
# 📋 Benchmark Configuration
#
# Edit the values below to change what the benchmark sweeps over.

# Vector dimensions to test.
DIMENSIONS = [
    2, 8, 16, 32, 64, 128, 256,
    384, 512, 768, 1024, 1536, 2048,
]
# Vector counts to test.
VECTOR_COUNTS = [1_000, 5_000, 10_000, 25_000, 50_000, 100_000]
# Both values are handed to run_benchmark() by the run cell.
SEARCH_K = 10
NUM_SEARCHES = 100

# Echo the active configuration so it is recorded in the notebook output.
for line in (
    "📊 Test Configuration:",
    f"   Dimensions: {DIMENSIONS}",
    f"   Vector Counts: {VECTOR_COUNTS}",
    f"   Search K: {SEARCH_K}",
    f"   Number of Searches: {NUM_SEARCHES}",
):
    print(line)

In [None]:
# Paste the entire ChromaDB benchmark code here.
# Copy the class and functions from chromadb_benchmark.py; the run cell below
# calls ChromaDBBenchmark.run_benchmark(), so that method must be included.

@dataclass
class BenchmarkResult:
    """Timing and quality figures for one benchmarked operation.

    Each instance is one row of the results table: the notebook converts
    instances with ``asdict`` to build its DataFrame, JSON and CSV output.
    """
    dimension: int  # vector dimension this measurement was taken at
    num_vectors: int  # number of vectors this measurement was taken at
    operation: str  # operation label; the notebook filters on e.g. 'search_k10', 'batch_insert'
    total_time: float  # NOTE(review): presumably seconds, like avg_time — confirm
    avg_time: float  # the notebook multiplies this by 1000 and labels it "ms", so presumably seconds
    min_time: float  # NOTE(review): presumably same unit as avg_time — confirm
    max_time: float  # NOTE(review): presumably same unit as avg_time — confirm
    throughput: float  # labelled "ops/sec" in the notebook's summary and plots
    recall: float = 0.0  # printed as a percentage after *100, so presumably a 0-1 fraction
    search_quality: float = 0.0  # printed as a percentage after *100, so presumably a 0-1 fraction

class ChromaDBBenchmark:
    """Benchmark harness around a ChromaDB client.

    Only client construction and synthetic data generation are defined here;
    the remaining methods are expected to be pasted in from
    chromadb_benchmark.py (see the placeholder at the end of the class).
    """

    def __init__(self):
        """Initialize ChromaDB client"""
        # The legacy `chroma_db_impl="duckdb+parquet"` / `persist_directory=None`
        # settings are rejected by ChromaDB 0.4+ (which an unpinned
        # `pip install chromadb` fetches). Without persistence settings,
        # chromadb.Client() is not configured to persist to disk, which is
        # what the original configuration asked for.
        self.client = chromadb.Client(Settings(anonymized_telemetry=False))

    def generate_vectors(self, count: int, dim: int, seed: int = 42) -> Tuple[np.ndarray, List[str], List[Dict]]:
        """Generate normalized random vectors with metadata.

        Args:
            count: Number of vectors to generate.
            dim: Dimension of each vector.
            seed: Seed for NumPy's global random state; the same seed yields
                the same vectors and metadata scores.

        Returns:
            A ``(vectors, ids, metadatas)`` tuple: a ``(count, dim)`` float32
            array of unit-length rows, ids of the form ``"vec_<i>"``, and one
            metadata dict per vector with ``index``, ``category``, ``score``
            and ``group`` keys.
        """
        # Seeds NumPy's *global* generator (kept for reproducibility with
        # previously generated data sets).
        np.random.seed(seed)
        vectors = np.random.randn(count, dim).astype(np.float32)
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # The epsilon keeps an all-zero row from dividing by zero.
        vectors = vectors / (norms + 1e-8)
        ids = [f"vec_{i}" for i in range(count)]
        categories = ["tech", "science", "arts", "sports", "news"]
        # One vectorised draw consumes the random stream exactly like `count`
        # successive np.random.random() calls, so scores are unchanged while
        # avoiding a Python-level call per row.
        scores = np.random.random(count).tolist()
        metadatas = [
            {
                "index": i,
                "category": categories[i % len(categories)],
                "score": score,
                "group": i % 10,
            }
            for i, score in enumerate(scores)
        ]
        return vectors, ids, metadatas

    # Add all other methods from chromadb_benchmark.py here...
    # (For brevity, include the full class definition)

print("✅ Benchmark class defined!")

In [None]:
# 🚀 Run the benchmark!

print("🚀 Starting ChromaDB Comprehensive Performance Benchmark")
print("=" * 60)
print()

benchmark = ChromaDBBenchmark()
all_results = []

# Sweep every (dimension, vector count) pair, collecting the results that
# run_benchmark() returns for each pair into all_results.
for dim in DIMENSIONS:
    print(f"\n📊 Testing Dimension: {dim}")
    print("─" * 60)

    # Skip very large combinations: cap the vector count at high dimensions.
    if dim >= 1536:
        largest_allowed = 25000
    elif dim >= 1024:
        largest_allowed = 50000
    else:
        largest_allowed = None  # no cap below 1024 dimensions

    for num_vectors in VECTOR_COUNTS:
        if largest_allowed is not None and num_vectors > largest_allowed:
            continue

        print(f"\n  📦 Vector Count: {num_vectors}")

        results = benchmark.run_benchmark(dim, num_vectors, SEARCH_K, NUM_SEARCHES)
        all_results.extend(results)

print("\n✅ All benchmarks complete!")

In [None]:
# 📊 Display Summary

# One row per BenchmarkResult; `df` is reused by the cells that follow.
records = [asdict(entry) for entry in all_results]
df = pd.DataFrame(records)

print("\n🏆 FINAL PERFORMANCE SUMMARY")
print("=" * 80)
print()

# Print one section per operation, in order of first appearance.
for operation in df['operation'].unique():
    op_data = df[df['operation'] == operation]

    print(f"\n📊 {operation}")
    print("─" * 80)

    # avg_time is scaled by 1000 so the figures can be shown in milliseconds.
    latency = op_data['avg_time'].agg(['mean', 'min', 'max']) * 1000
    avg_ms = latency['mean']
    min_ms = latency['min']
    max_ms = latency['max']

    print(f"  Average: {avg_ms:.3f}ms | Min: {min_ms:.3f}ms | Max: {max_ms:.3f}ms")

    # Throughput is shown only when its mean is positive.
    has_throughput = op_data['throughput'].mean() > 0
    if has_throughput:
        print(f"  Throughput: {op_data['throughput'].mean():.0f} ops/sec (avg)")

    # Recall is reported for the exact-search operation only.
    if operation == 'exact_search':
        if op_data['recall'].mean() > 0:
            print(f"  Recall: {op_data['recall'].mean()*100:.2f}%")

    # Search quality is reported for every operation whose name contains 'search_k'.
    if 'search_k' in operation:
        if op_data['search_quality'].mean() > 0:
            print(f"  Search Quality: {op_data['search_quality'].mean()*100:.2f}%")

print("\n✅ Benchmark Complete!")

In [None]:
# 💾 Save Results

# Plain-dict form of every result; used for both the JSON and the CSV export
# so this cell does not depend on the summary cell having been run.
results_dict = [asdict(r) for r in all_results]

# Explicit encoding keeps the output independent of the platform default.
with open('chromadb_benchmark_results.json', 'w', encoding='utf-8') as f:
    json.dump(results_dict, f, indent=2)

pd.DataFrame(results_dict).to_csv('chromadb_benchmark_results.csv', index=False)

print("💾 Results saved:")
print("   - chromadb_benchmark_results.json")
print("   - chromadb_benchmark_results.csv")

# Download files
# google.colab exists only inside Colab; elsewhere the files simply stay on
# disk instead of the cell failing with ImportError after a successful save.
try:
    from google.colab import files
except ImportError:
    print("ℹ️ Not running in Google Colab; skipping browser download.")
else:
    files.download('chromadb_benchmark_results.json')
    files.download('chromadb_benchmark_results.csv')

In [None]:
# 📈 Visualizations

# Paths of the figures actually written, so the closing message never lists a
# file that was skipped.
saved_plots = []

# Search latency by dimension
# NOTE(review): 'search_k10' is hard-coded; presumably it matches the default
# SEARCH_K = 10 — confirm against run_benchmark's operation names.
search_data = df[df['operation'] == 'search_k10']
if len(search_data) > 0:
    plt.figure(figsize=(14, 6))

    # Left panel: one line per vector count, latency (ms) against dimension.
    plt.subplot(1, 2, 1)
    for count in search_data['num_vectors'].unique():
        # Sort by the x-axis column so the line is drawn left to right
        # regardless of the order results were collected in.
        data = search_data[search_data['num_vectors'] == count].sort_values('dimension')
        plt.plot(data['dimension'], data['avg_time'] * 1000, marker='o', label=f"{count} vectors")
    plt.xlabel('Dimension')
    plt.ylabel('Latency (ms)')
    plt.title('Search Latency vs Dimension')
    plt.legend()
    plt.grid(True)

    # Right panel: one line per selected dimension, latency (ms) against count.
    plt.subplot(1, 2, 2)
    for dim in [128, 384, 768, 1024]:
        data = search_data[search_data['dimension'] == dim].sort_values('num_vectors')
        if len(data) > 0:
            plt.plot(data['num_vectors'], data['avg_time'] * 1000, marker='o', label=f"{dim}D")
    plt.xlabel('Number of Vectors')
    plt.ylabel('Latency (ms)')
    plt.title('Search Latency vs Vector Count')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('chromadb_search_performance.png', dpi=150)
    saved_plots.append('chromadb_search_performance.png')
    plt.show()

# Throughput comparison
plt.figure(figsize=(12, 5))

# One bar chart per operation: mean throughput grouped by dimension.
ops = ['batch_insert', 'search_k10']
for i, op in enumerate(ops, 1):
    plt.subplot(1, 2, i)
    op_data = df[df['operation'] == op]
    if len(op_data) > 0:
        grouped = op_data.groupby('dimension')['throughput'].mean()
        plt.bar(range(len(grouped)), grouped.values)
        plt.xticks(range(len(grouped)), grouped.index, rotation=45)
        plt.xlabel('Dimension')
        plt.ylabel('Throughput (ops/sec)')
        plt.title(f'{op.replace("_", " ").title()} Throughput')
        plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('chromadb_throughput.png', dpi=150)
saved_plots.append('chromadb_throughput.png')
plt.show()

print("📊 Visualizations saved:")
for plot_path in saved_plots:
    print(f"   - {plot_path}")

In [None]:
# 📋 Summary Table

# Per-operation aggregates: mean/min/max of avg_time plus the mean of the
# throughput, recall and search_quality columns.
summary = df.groupby('operation').agg({
    'avg_time': ['mean', 'min', 'max'],
    'throughput': 'mean',
    'recall': 'mean',
    'search_quality': 'mean'
})

# Convert times to milliseconds
summary['avg_time'] = summary['avg_time'] * 1000

# Round only after the unit conversion: rounding the raw values to 4
# decimals first would flatten every latency below 0.05 ms to 0.0.
summary = summary.round(4)

print("\n📊 Operation Summary")
print(summary)

summary.to_csv('chromadb_summary.csv')
print("\n💾 Summary saved to: chromadb_summary.csv")