In [26]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.vectorstores import Qdrant as QdrantVectorStore
from langchain_huggingface import HuggingFaceEndpointEmbeddings
import os
from dotenv import load_dotenv
import chromadb
from chromadb import Client as ChromaClient
from chromadb.config import Settings

In [27]:
load_dotenv()

True

In [28]:

qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
if not qdrant_url or not qdrant_api_key:
    raise ValueError("set QDRANT_URL and QDRANT_API_KEY in .env file")


In [78]:
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
chroma_api_key = os.getenv("CHROMA_API_KEY")
chroma_tenant = os.getenv("CHROMA_tenant")
HUGGING_FACE_API_TOKEN = os.getenv("HUGGING_FACE_API_TOKEN")

In [30]:
import logging
logger = logging.getLogger("migration")


# Initialize Qdrant client for Qdrant Cloud
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
    prefer_grpc=True 
)
logger.info("Qdrant Cloud client initialized")

In [31]:
collection_name="vector_db"
vector_dimension = 384

In [32]:
    # Create collection if it doesn't exist
if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE)
        )
logger.info(f"Created collection: {collection_name}")

In [33]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

def text_split(extracted_Data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunk = text_splitter.split_documents(extracted_Data)
    return text_chunk

extracted = load_pdf(data='E:\AI\AgileForce\Vector_db\Vector-Database-Migration-Open-Source-Tool\data')
test_chunks = text_split(extracted)


  extracted = load_pdf(data='E:\AI\AgileForce\Vector_db\Vector-Database-Migration-Open-Source-Tool\data')


In [34]:
def download():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEndpointEmbeddings(
        model=model_name,
        huggingfacehub_api_token=HUGGING_FACE_API_TOKEN
    )

embeddings = download()


In [35]:
# Embed and upsert to qdrant
qdrant_store = QdrantVectorStore.from_documents(
    documents=test_chunks,
    embedding=embeddings,
    collection_name=collection_name,
    url=qdrant_url,
    api_key=qdrant_api_key,
)


In [82]:
from chromadb import PersistentClient

chroma = PersistentClient(path="./chroma_store")  
chroma_collection = chroma.get_or_create_collection(name="migrated_vectors")


In [89]:
chroma_client= chromadb.CloudClient(
  api_key='ck-83UY8Q48K22Ac8U8SCsMCYz27fPdxg4Er73BGkBm5E8T',
  tenant=chroma_tenant,
  database='migrated_vectors'
)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: GET https://api.trychroma.com:8000/api/v2/auth/identity "HTTP/1.1 200 OK"
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: GET https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a/databases/migrated_vectors "HTTP/1.1 200 OK"


In [90]:
collection = chroma_client.get_or_create_collection(
    name="migrated_vectors",  
)


INFO:httpx:HTTP Request: POST https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a/databases/migrated_vectors/collections "HTTP/1.1 200 OK"


In [39]:
def fetch_qdrant_vectors(client, collection_name, batch_size=100):
    all_vectors = []
    next_offset = None

    while True:
        points, next_offset = client.scroll(
            collection_name=collection_name,
            offset=next_offset,
            limit=batch_size,
            with_payload=True,
            with_vectors=True
        )
        if not points:
            break

        for pt in points:
            all_vectors.append({
                "id": str(pt.id),
                "vector": pt.vector,
                "metadata": pt.payload or {}
            })

        if next_offset is None:
            break
    return all_vectors


In [40]:
#Upload to Chroma Cloud with Metadata Truncation

def truncate_metadata(metadata, max_bytes=250):
    """Truncate metadata values to fit within Chroma's byte limits"""
    cleaned_metadata = {}
    
    for key, value in metadata.items():
        # Convert value to string
        value_str = str(value) if not isinstance(value, str) else value
        
        # Check byte size
        byte_size = len(value_str.encode('utf-8'))
        
        if byte_size > max_bytes:
            # Truncate to fit within limit
            truncated = value_str
            while len(truncated.encode('utf-8')) > max_bytes:
                truncated = truncated[:-10]  # Remove 10 chars at a time
            truncated += "..."  # Add ellipsis to indicate truncation
            cleaned_metadata[key] = truncated
        else:
            cleaned_metadata[key] = value_str
    
    return cleaned_metadata

def chunked_upload(collection, vectors, batch_size=100):
    """Upload vectors with metadata size validation"""
    total_uploaded = 0
    
    for i in range(0, len(vectors), batch_size):
        chunk = vectors[i:i + batch_size]
        ids = [v["id"] for v in chunk]
        embeddings = [v["vector"] for v in chunk]
        
        # Clean metadata to fit Chroma limits
        metadatas = [truncate_metadata(v["metadata"]) for v in chunk]

        try:
            collection.add(
                ids=ids,
                embeddings=embeddings,
                metadatas=metadatas
            )
            total_uploaded += len(chunk)
            print(f"Uploaded {total_uploaded} / {len(vectors)} vectors ✅")
        except Exception as e:
            print(f"Error uploading batch {i//batch_size + 1}: {e}")
            continue

    return total_uploaded



In [42]:
# Fetch vectors from Qdrant and upload to Chroma
print("Fetching vectors from Qdrant...")
vectors = fetch_qdrant_vectors(qdrant_client, collection_name)
print(f"Found {len(vectors)} vectors to migrate")

print("\nUploading to Chroma Cloud with metadata truncation...")
uploaded_count = chunked_upload(collection, vectors)

Fetching vectors from Qdrant...
Found 400 vectors to migrate

Uploading to Chroma Cloud with metadata truncation...
Uploaded 100 / 400 vectors ✅
Uploaded 200 / 400 vectors ✅
Uploaded 300 / 400 vectors ✅
Uploaded 400 / 400 vectors ✅


In [43]:
results = collection.query(
    query_embeddings=[vectors[0]["vector"]],
    n_results=3
)
print(results)


{'ids': [['ff9e2df7-b9c2-42fc-a3ba-bb39d969dbae', '00864c19-15bd-42a9-8672-8cb8d786dd3f', 'b795957a-da2e-4fa9-9193-36903ef385d7']], 'distances': [[0.0, 0.0, 0.0]], 'embeddings': None, 'metadatas': [[{'page_content': 'The EUROCALL Review, Volume 25, No. 2, September 2017 \n \n 19 \nnamely a research question, description of participants, data collection tools and \nanalysis. This is followe d by the presentation of the results of the study. The article \nclose...', 'metadata': "{'moddate': '2018-03-12T10:24:10-04:00', 'creator': 'Microsoft® Word 2016', 'total_pages': 11, 'creationdate': '2018-03-05T09:43:57+01:00', 'producer': 'Microsoft® Word 2016', 'source': 'E:\\\\AI\\\\AgileForce\\\\Vector_db\\\\Vector-Database-Migration..."}, {'page_content': 'The EUROCALL Review, Volume 25, No. 2, September 2017 \n \n 19 \nnamely a research question, description of participants, data collection tools and \nanalysis. This is followe d by the presentation of the results of the study. The article \nc

In [None]:
# Check metadata sizes to identify the problem

def check_metadata_sizes(vectors):
    """Check metadata sizes and identify problematic entries"""
    problematic_entries = []
    
    for i, vector in enumerate(vectors[:10]):  # Check first 10 for analysis
        metadata = vector.get("metadata", {})
        
        for key, value in metadata.items():
            # Convert to string to measure byte size
            value_str = str(value) if not isinstance(value, str) else value
            byte_size = len(value_str.encode('utf-8'))
            
            print(f"Vector {i}, Key: '{key}', Size: {byte_size} bytes")
            
            if byte_size > 256:
                problematic_entries.append({
                    'vector_index': i,
                    'key': key,
                    'size': byte_size,
                    'value_preview': value_str[:100] + "..." if len(value_str) > 100 else value_str
                })
    
    return problematic_entries

# Check the metadata sizes
problematic = check_metadata_sizes(vectors)
print(f"\nFound {len(problematic)} problematic metadata entries:")
for entry in problematic:
    print(f"Vector {entry['vector_index']}: '{entry['key']}' = {entry['size']} bytes")
    print(f"Preview: {entry['value_preview']}\n")

Vector 0, Key: 'page_content', Size: 431 bytes
Vector 0, Key: 'metadata', Size: 335 bytes
Vector 1, Key: 'page_content', Size: 472 bytes
Vector 1, Key: 'metadata', Size: 335 bytes
Vector 2, Key: 'creator', Size: 21 bytes
Vector 2, Key: 'moddate', Size: 25 bytes
Vector 2, Key: 'total_pages', Size: 4 bytes
Vector 2, Key: 'creationdate', Size: 25 bytes
Vector 2, Key: 'producer', Size: 21 bytes
Vector 2, Key: 'source', Size: 57 bytes
Vector 2, Key: 'page_label', Size: 1 bytes
Vector 2, Key: 'page', Size: 3 bytes
Vector 2, Key: 'author', Size: 7 bytes
Vector 2, Key: 'text', Size: 420 bytes
Vector 3, Key: 'page_content', Size: 497 bytes
Vector 3, Key: 'metadata', Size: 335 bytes
Vector 4, Key: 'page_content', Size: 477 bytes
Vector 4, Key: 'metadata', Size: 335 bytes
Vector 5, Key: 'page_content', Size: 420 bytes
Vector 5, Key: 'metadata', Size: 335 bytes
Vector 6, Key: 'page_content', Size: 498 bytes
Vector 6, Key: 'metadata', Size: 335 bytes
Vector 7, Key: 'creator', Size: 21 bytes
Vector 

In [45]:

collection_info = collection.get()
print(f" Migration Successful!")
print(f"Total vectors in Chroma: {len(collection_info['ids'])}")

# Show a sample of the truncated metadata
if collection_info['metadatas']:
    sample_metadata = collection_info['metadatas'][0]
    print(f"\nSample metadata (truncated to fit Chroma limits):")
    for key, value in sample_metadata.items():
        byte_size = len(str(value).encode('utf-8'))
        print(f"  {key}: {byte_size} bytes - {str(value)[:50]}{'...' if len(str(value)) > 50 else ''}")
        
print(f" Successfully migrated {len(collection_info['ids'])} vectors from Qdrant to Chroma!")

 Migration Successful!
Total vectors in Chroma: 100

Sample metadata (truncated to fit Chroma limits):
  source: 57 bytes - E:\AGILEFORCE\Vector Database Migration\data\resea...
  author: 7 bytes - agimeno
  creator: 21 bytes - Microsoft® Word 2016
  producer: 21 bytes - Microsoft® Word 2016
  creationdate: 25 bytes - 2018-03-05T09:43:57+01:00
  page: 3 bytes - 6.0
  moddate: 25 bytes - 2018-03-12T10:24:10-04:00
  text: 253 bytes - computers for informal English learning (i.e. lear...
  page_label: 1 bytes - 7
  total_pages: 4 bytes - 11.0
 Successfully migrated 100 vectors from Qdrant to Chroma!


Chroma to Qdrant

In [56]:

if not qdrant_url or not qdrant_api_key:
    raise ValueError("Set QDRANT_URL and QDRANT_API_KEY in .env file")
if not chroma_api_key or not chroma_tenant:
    raise ValueError("Set CHROMA_API_KEY, CHROMA_TENANT, and CHROMA_DATABASE in .env file")

In [57]:
# Cell 3: Setup logging and initialize clients
logger = logging.getLogger("migration")
logging.basicConfig(level=logging.INFO)

In [58]:
# Initialize Qdrant client
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
    prefer_grpc=True
)
logger.info("Qdrant Cloud client initialized")

INFO:httpx:HTTP Request: GET https://a39cabe9-e04e-41bc-bfc8-c35e32b99b11.eu-west-2-0.aws.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"
INFO:migration:Qdrant Cloud client initialized


In [68]:
# Cell 4: Setup collections
chroma_collection_name = "migrated_vectors"
qdrant_collection_name = "chroma_to_qdrant"
vector_dimension = 384

In [69]:
# Create Qdrant collection if it doesn't exist
if not qdrant_client.collection_exists(qdrant_collection_name):
    qdrant_client.create_collection(
        collection_name=qdrant_collection_name,
        vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE)
    )
logger.info(f"Created Qdrant collection: {qdrant_collection_name}")

INFO:migration:Created Qdrant collection: chroma_to_qdrant


In [70]:
# Get Chroma collection
chroma_collection = chroma_client.get_or_create_collection(name=chroma_collection_name)
logger.info(f"Accessed Chroma collection: {chroma_collection_name}")


INFO:httpx:HTTP Request: POST https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a/databases/migrated_vectors/collections "HTTP/1.1 200 OK"
INFO:migration:Accessed Chroma collection: migrated_vectors


In [None]:
# Fetch vectors from Chroma
def fetch_chroma_vectors(collection, batch_size=100):
    """Fetch all vectors from Chroma collection"""
    all_vectors = []
    offset = 0

    while True:
        results = collection.get(
            include=["embeddings", "metadatas"],
            limit=batch_size,
            offset=offset
        )
        
        if not results['ids']:
            break

        for i in range(len(results['ids'])):
            all_vectors.append({
                "id": results['ids'][i],
                "vector": results['embeddings'][i],
                "metadata": results['metadatas'][i] or {}
            })

        offset += batch_size
        logger.info(f"Fetched {len(all_vectors)} vectors so far...")

    return all_vectors

In [None]:
# Metadata truncation function
def truncate_metadata(metadata, max_bytes=250):
    """Truncate metadata values to fit within Qdrant's byte limits"""
    cleaned_metadata = {}
    
    for key, value in metadata.items():
        # Convert value to string
        value_str = str(value) if not isinstance(value, str) else value
        
        # Check byte size
        byte_size = len(value_str.encode('utf-8'))
        
        if byte_size > max_bytes:
            # Truncate to fit within limit
            truncated = value_str
            while len(truncated.encode('utf-8')) > max_bytes:
                truncated = truncated[:-10]  # Remove 10 chars at a time
            truncated += "..."  # Add ellipsis to indicate truncation
            cleaned_metadata[key] = truncated
        else:
            cleaned_metadata[key] = value_str
    
    return cleaned_metadata

In [None]:
# Upload vectors to Qdrant
from qdrant_client.http.models import PointStruct

def chunked_upload_to_qdrant(client, collection_name, vectors, batch_size=100):
    """Upload vectors to Qdrant with metadata size validation"""
    total_uploaded = 0
    
    for i in range(0, len(vectors), batch_size):
        chunk = vectors[i:i + batch_size]
        points = []
        
        for v in chunk:
            cleaned_metadata = truncate_metadata(v["metadata"])
            points.append(PointStruct(
                id=v["id"],
                vector=v["vector"],
                payload=cleaned_metadata
            ))

        try:
            client.upsert(
                collection_name=collection_name,
                points=points
            )
            total_uploaded += len(chunk)
            logger.info(f"Uploaded {total_uploaded} / {len(vectors)} vectors to Qdrant ")
        except Exception as e:
            logger.error(f"Error uploading batch {i//batch_size + 1}: {e}")
            continue

    return total_uploaded

In [None]:
#Execute migration
logger.info("Fetching vectors from Chroma...")
vectors = fetch_chroma_vectors(chroma_collection)
logger.info(f"Found {len(vectors)} vectors to migrate")

logger.info("\nUploading to Qdrant with metadata truncation...")
uploaded_count = chunked_upload_to_qdrant(qdrant_client, qdrant_collection_name, vectors)

INFO:migration:Fetching vectors from Chroma...
INFO:httpx:HTTP Request: POST https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a/databases/migrated_vectors/collections/7375b3a7-a31a-4db7-957e-a96aafc84ce6/get "HTTP/1.1 200 OK"
INFO:migration:Fetched 100 vectors so far...
INFO:httpx:HTTP Request: POST https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a/databases/migrated_vectors/collections/7375b3a7-a31a-4db7-957e-a96aafc84ce6/get "HTTP/1.1 200 OK"
INFO:migration:Fetched 200 vectors so far...
INFO:httpx:HTTP Request: POST https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a/databases/migrated_vectors/collections/7375b3a7-a31a-4db7-957e-a96aafc84ce6/get "HTTP/1.1 200 OK"
INFO:migration:Fetched 300 vectors so far...
INFO:httpx:HTTP Request: POST https://api.trychroma.com:8000/api/v2/tenants/39241944-9721-46c5-8616-df7e3bc0b07a/databases/migrated_vectors/collections/7375b3a7-a31a-4db7-957e-a96aafc84

In [76]:

collection_info = qdrant_client.get_collection(qdrant_collection_name)
print(f"Migration Successful!")
print(f"Total vectors in Qdrant: {collection_info.vectors_count}")

Migration Successful!
Total vectors in Qdrant: None


In [77]:
if vectors:
    sample_vector = vectors[0]
    print(f"\nSample metadata (truncated to fit Qdrant limits):")
    cleaned_metadata = truncate_metadata(sample_vector['metadata'])
    for key, value in cleaned_metadata.items():
        byte_size = len(str(value).encode('utf-8'))
        print(f"  {key}: {byte_size} bytes - {str(value)[:50]}{'...' if len(str(value)) > 50 else ''}")
        
print(f"Successfully migrated {uploaded_count} vectors from Chroma to Qdrant!")


Sample metadata (truncated to fit Qdrant limits):
  author: 7 bytes - agimeno
  creationdate: 25 bytes - 2018-03-05T09:43:57+01:00
  creator: 21 bytes - Microsoft® Word 2016
  total_pages: 4 bytes - 11.0
  page_label: 1 bytes - 7
  producer: 21 bytes - Microsoft® Word 2016
  moddate: 25 bytes - 2018-03-12T10:24:10-04:00
  page: 3 bytes - 6.0
  source: 57 bytes - E:\AGILEFORCE\Vector Database Migration\data\resea...
  text: 246 bytes - computers for informal English learning (i.e. lear...
Successfully migrated 400 vectors from Chroma to Qdrant!
