In [1]:
import chromadb

# Open the on-disk Chroma database and look up the collection to copy from.
DB_PATH = "./db"
SOURCE_COLLECTION_NAME = "neuclir-titles-ru-bge-m3"

chroma_client = chromadb.PersistentClient(path=DB_PATH)
collection = chroma_client.get_collection(name=SOURCE_COLLECTION_NAME)

In [7]:
# Destination collection: fetched if it already exists, otherwise created with
# the `hnsw:space` metadata key set to "cosine".
TARGET_COLLECTION_NAME = "neuclir-titles-all-bge-m3"
new_collection = chroma_client.get_or_create_collection(
    name=TARGET_COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)

In [8]:
def fetch_data(original_collection, batch_size, offset, include=("embeddings", "documents", "metadatas")):
    """Fetch one page of records from ``original_collection``.

    Args:
        original_collection: Collection to read from; must expose a
            chromadb-style ``get(limit=..., offset=..., include=...)``.
        batch_size: Maximum number of records to return in this page.
        offset: Number of records to skip before the page starts.
        include: Record fields to request in addition to the ids. Embeddings
            are requested explicitly because chromadb's ``get`` leaves them
            out unless asked; without them a later ``add`` would receive
            ``embeddings=None`` and re-embed the documents instead of copying
            the stored vectors. Add ``"uris"`` here if the source collection
            stores URIs that must be carried over.

    Returns:
        Whatever ``original_collection.get`` returns for that page (for
        chromadb, a dict with ``"ids"``, ``"embeddings"``, ``"documents"``,
        ``"uris"`` and ``"metadatas"`` entries).
    """
    # No `ids` filter is passed, so the page is taken over the whole collection.
    return original_collection.get(
        limit=batch_size,
        offset=offset,
        include=list(include),
    )

def add_data(new_collection, data):
    """Write one fetched batch into ``new_collection``.

    Args:
        new_collection: Destination collection; must expose a chromadb-style
            ``add`` accepting the keyword arguments listed below.
        data: Mapping holding the batch. Its ``"ids"``, ``"embeddings"``,
            ``"documents"``, ``"uris"`` and ``"metadatas"`` entries are
            forwarded unchanged; any other keys are ignored.

    Raises:
        KeyError: If ``data`` lacks one of the forwarded keys.
    """
    copied_fields = ("ids", "embeddings", "documents", "uris", "metadatas")
    payload = {field: data[field] for field in copied_fields}
    new_collection.add(**payload)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

# Retrieve data in batches
def _complete_one_batch(pending, pbar):
    """Wait for one pending write to finish, surface its error, and advance the bar."""
    finished = next(as_completed(pending))
    batch_len = pending.pop(finished)
    finished.result()  # Re-raises any exception raised inside the worker thread
    pbar.update(batch_len)


def duplicate_collection_in_batches(original_collection, new_collection, batch_size,
                                    max_workers=8, max_in_flight=None):
    """Copy every record of ``original_collection`` into ``new_collection``.

    Pages are read sequentially with ``fetch_data`` and written concurrently
    with ``add_data`` on a thread pool. The progress bar advances by the size
    of each batch as its write completes.

    Args:
        original_collection: Collection to read from.
        new_collection: Collection to write to.
        batch_size: Number of records per page; must be positive.
        max_workers: Size of the writer thread pool.
        max_in_flight: Maximum number of fetched batches waiting to be written
            at any time, which bounds how many batches are held in memory.
            Defaults to ``2 * max_workers``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        Exception: Any exception raised by a write is re-raised here.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if max_in_flight is None:
        max_in_flight = 2 * max_workers
    max_in_flight = max(1, max_in_flight)

    offset = 0
    total_items = original_collection.count()
    pending = {}  # future -> number of records in the batch it is writing
    with tqdm(total=total_items, desc="Duplicating Collection") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            while True:
                data = fetch_data(original_collection, batch_size, offset)
                batch_len = len(data["ids"])
                if batch_len == 0:
                    break

                # Apply back-pressure so fetched batches do not pile up in memory
                # faster than the writers can drain them.
                while len(pending) >= max_in_flight:
                    _complete_one_batch(pending, pbar)

                pending[executor.submit(add_data, new_collection, data)] = batch_len
                offset += batch_len

            while pending:
                _complete_one_batch(pending, pbar)

    # Reported only after every write has finished without raising.
    print("Duplication complete.")


# Run the copy, 1000 records per batch
duplicate_collection_in_batches(
    collection,
    new_collection,
    batch_size=1000,
)

Duplicating Collection:   0%|          | 0/964719 [00:00<?, ?it/s]

Duplication complete.


In [None]:
# Verify the copy. Batches are written by several threads, so the two collections
# need not return records in the same order; compare records by id, one page at a
# time, rather than comparing two full get() results positionally.
# NOTE(review): get() is called with its default `include`, so this checks ids,
# documents and metadata only; embeddings are not compared here.
VERIFY_BATCH_SIZE = 1000


def _records_by_id(result):
    """Index a get() result as {id: (document, metadata)}."""
    return {
        record_id: (document, metadata)
        for record_id, document, metadata in zip(
            result["ids"], result["documents"], result["metadatas"]
        )
    }


source_count = collection.count()
assert new_collection.count() == source_count, "record count mismatch"

for start in range(0, source_count, VERIFY_BATCH_SIZE):
    expected = collection.get(limit=VERIFY_BATCH_SIZE, offset=start)
    actual = new_collection.get(ids=expected["ids"])
    assert _records_by_id(actual) == _records_by_id(expected), (
        f"records differ in the batch starting at offset {start}"
    )