In [2]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.vectorstores import Qdrant as QdrantVectorStore
from langchain_huggingface import HuggingFaceEndpointEmbeddings
import os
from dotenv import load_dotenv
import chromadb
from chromadb import Client as ChromaClient
from chromadb.config import Settings

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the variables defined in the local .env file into the process environment so the
# os.getenv() lookups in the following cells can find them. The function must be called;
# a bare `load_dotenv` expression only references it and loads nothing.
load_dotenv()

In [10]:

# Qdrant connection settings are read from the environment; both values are mandatory.
qdrant_url, qdrant_api_key = (
    os.getenv(name) for name in ("QDRANT_URL", "QDRANT_API_KEY")
)
if not (qdrant_url and qdrant_api_key):
    raise ValueError("set QDRANT_URL and QDRANT_API_KEY in .env file")


In [11]:
# Credentials and endpoints used by the rest of the notebook, all read from the environment.
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
chroma_api_key = os.getenv("CHROMA_API_KEY")
# Environment variable names are case-sensitive on POSIX systems. Prefer the upper-case
# name (the one the validation error message in this notebook refers to) and fall back to
# the mixed-case spelling so existing .env files keep working.
chroma_tenant = os.getenv("CHROMA_TENANT") or os.getenv("CHROMA_tenant")
HUGGING_FACE_API_TOKEN = os.getenv("HUGGING_FACE_API_TOKEN")

In [12]:
import logging

# Configure root logging before the first logger.info() call. Without a configured handler
# Python only surfaces WARNING and above, so the INFO progress messages emitted by this
# notebook would be silently dropped.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("migration")


# Initialize Qdrant client for Qdrant Cloud, using the URL and API key read from the
# environment; gRPC is requested as the preferred transport.
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
    prefer_grpc=True,
)
logger.info("Qdrant Cloud client initialized")

In [13]:
# Name of the Qdrant collection that is created, filled from the PDF chunks and later read back.
collection_name="vector_db"
# Vector size passed to VectorParams when the collection is created.
# NOTE(review): presumably matches the output size of the embedding model used below — confirm.
vector_dimension = 384

In [14]:
    # Create collection if it doesn't exist
if not qdrant_client.collection_exists(collection_name):
    # Cosine distance with `vector_dimension`-sized vectors.
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE)
    )
    # Only report creation when the collection was actually created.
    logger.info(f"Created collection: {collection_name}")
else:
    logger.info(f"Collection already exists: {collection_name}")

In [18]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdf(data):
    """Load the PDF files of a directory.

    Args:
        data: Directory path handed to `DirectoryLoader`.

    Returns:
        Whatever `DirectoryLoader.load()` returns for files matching the glob
        ``*.pdf``, each file being read with `PyPDFLoader`.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

def text_split(extracted_Data):
    """Split loaded documents into chunks.

    Args:
        extracted_Data: Documents to split (passed straight to `split_documents`).

    Returns:
        The result of `RecursiveCharacterTextSplitter.split_documents`, using a
        chunk size of 500 and a chunk overlap of 50.
    """
    splitter_options = {"chunk_size": 500, "chunk_overlap": 50}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(extracted_Data)

# Raw string literal: the Windows path contains backslash sequences such as `\A`, `\V`
# and `\d`, which Python reports as invalid escape sequences in a normal string literal.
# The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; the directory must exist locally.
extracted = load_pdf(data=r'E:\AI\AgileForce\Vector_db\Vector-Database-Migration-Open-Source-Tool\data')
# Chunks produced here are what gets embedded and written to Qdrant further down.
test_chunks = text_split(extracted)


  extracted = load_pdf(data='E:\AI\AgileForce\Vector_db\Vector-Database-Migration-Open-Source-Tool\data')


FileNotFoundError: Directory not found: 'E:\AI\AgileForce\Vector_db\Vector-Database-Migration-Open-Source-Tool\data'

In [16]:
def download():
    """Construct the embeddings object used to embed the PDF chunks.

    Returns:
        A `HuggingFaceEndpointEmbeddings` instance for the model
        ``sentence-transformers/all-MiniLM-L6-v2``, authenticated with the
        module-level `HUGGING_FACE_API_TOKEN`.
    """
    return HuggingFaceEndpointEmbeddings(
        huggingfacehub_api_token=HUGGING_FACE_API_TOKEN,
        model="sentence-transformers/all-MiniLM-L6-v2",
    )

# Build the embeddings object once; it is passed to the Qdrant vector store when the chunks are indexed.
embeddings = download()


In [17]:
# Embed the PDF chunks and upsert them into the Qdrant collection named by `collection_name`.
# Requires `test_chunks` from the PDF-loading cell: if that cell has not run successfully,
# this cell fails with a NameError.
qdrant_store = QdrantVectorStore.from_documents(
    documents=test_chunks,
    embedding=embeddings,
    collection_name=collection_name,
    url=qdrant_url,
    api_key=qdrant_api_key,
)


NameError: name 'test_chunks' is not defined

In [None]:
from chromadb import PersistentClient

# Chroma client backed by the local path ./chroma_store, plus a "migrated_vectors" collection on it.
# NOTE(review): the upload cells below use `collection` (from the cloud client), and
# `chroma_collection` is reassigned from the cloud client later in the notebook.
chroma = PersistentClient(path="./chroma_store")  # local store directory
chroma_collection = chroma.get_or_create_collection(name="migrated_vectors")


In [None]:
# Connect to Chroma Cloud with credentials taken from the environment (CHROMA_API_KEY and
# the tenant read earlier in the notebook). Secrets must never be embedded in the notebook
# source: any key that was ever committed here should be treated as leaked and rotated.
if not chroma_api_key:
    raise ValueError("Set CHROMA_API_KEY in .env file")

chroma_client= chromadb.CloudClient(
  api_key=chroma_api_key,
  tenant=chroma_tenant,
  database='migrated_vectors'
)

In [None]:
# Open (or create) the target collection on the Chroma Cloud client. The upload, query and
# verification cells below all operate on `collection`.
collection = chroma_client.get_or_create_collection(
    name="migrated_vectors",  # collection name; the cloud database above uses the same string
)


In [None]:
def fetch_qdrant_vectors(client, collection_name, batch_size=100):
    """Collect every point of a Qdrant collection by paging through `client.scroll`.

    Args:
        client: Object exposing a Qdrant-style ``scroll`` method that returns a
            ``(points, next_offset)`` pair.
        collection_name: Name of the collection to read.
        batch_size: Page size passed to ``scroll`` as ``limit``.

    Returns:
        A list of dicts with the keys ``"id"`` (the point id converted to ``str``),
        ``"vector"`` and ``"metadata"`` (the point payload, or ``{}`` when the
        payload is empty or ``None``).
    """
    records = []
    cursor = None  # None asks scroll() to start from the beginning

    while True:
        page, cursor = client.scroll(
            collection_name=collection_name,
            offset=cursor,
            limit=batch_size,
            with_payload=True,
            with_vectors=True,
        )
        # An empty page ends the walk, whatever offset came back with it.
        if not page:
            return records

        records.extend(
            {
                "id": str(point.id),
                "vector": point.vector,
                "metadata": point.payload or {},
            }
            for point in page
        )

        # No further offset means this was the last page.
        if cursor is None:
            return records


In [None]:
#Upload to Chroma Cloud with Metadata Truncation

def truncate_metadata(metadata, max_bytes=250):
    """Return a copy of `metadata` with every value stringified and capped at `max_bytes`.

    Args:
        metadata: Mapping of metadata keys to values. ``None`` or an empty
            mapping yields ``{}``.
        max_bytes: Maximum UTF-8 encoded size of each returned value, including
            the trailing ``"..."`` marker appended to shortened values.

    Returns:
        A new dict with the same keys. Non-string values are converted with
        ``str()``. Values whose UTF-8 encoding is longer than `max_bytes` are cut
        on a character boundary and suffixed with ``"..."`` so the final value
        never exceeds `max_bytes` bytes.
    """
    marker = "..."
    cleaned_metadata = {}

    for key, value in (metadata or {}).items():
        value_str = value if isinstance(value, str) else str(value)
        encoded = value_str.encode('utf-8')

        if len(encoded) > max_bytes:
            # Reserve room for the marker inside the limit; drop the marker entirely
            # when the limit is too small to hold it plus at least one byte of content.
            suffix = marker if max_bytes > len(marker) else ""
            budget = max(max_bytes - len(suffix), 0)
            # Slicing bytes may split a multi-byte character; errors='ignore'
            # discards the incomplete trailing bytes.
            value_str = encoded[:budget].decode('utf-8', errors='ignore') + suffix

        cleaned_metadata[key] = value_str

    return cleaned_metadata

def chunked_upload(collection, vectors, batch_size=100):
    """Add `vectors` to a Chroma collection in batches of `batch_size`.

    Each record's metadata is passed through `truncate_metadata` before the
    batch is sent with `collection.add`. A batch that raises is reported with
    `print` and skipped; the remaining batches are still attempted.

    Args:
        collection: Object exposing an ``add(ids=, embeddings=, metadatas=)`` method.
        vectors: List of dicts with the keys ``"id"``, ``"vector"`` and ``"metadata"``.
        batch_size: Number of records sent per ``add`` call.

    Returns:
        The number of records in the batches that were added without an exception.
    """
    uploaded = 0
    total = len(vectors)

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        batch = vectors[start:start + batch_size]
        batch_ids = [record["id"] for record in batch]
        batch_vectors = [record["vector"] for record in batch]
        # Clean metadata to fit Chroma limits
        batch_metadata = [truncate_metadata(record["metadata"]) for record in batch]

        try:
            collection.add(
                ids=batch_ids,
                embeddings=batch_vectors,
                metadatas=batch_metadata,
            )
            uploaded += len(batch)
            print(f"Uploaded {uploaded} / {total} vectors ✅")
        except Exception as e:
            # Best effort: report the failed batch and move on to the next one.
            print(f"Error uploading batch {batch_no}: {e}")
            continue

    return uploaded



In [None]:
# Fetch vectors from Qdrant and upload to Chroma
# Step 1: read every point (id, vector, payload) of `collection_name` into memory.
print("Fetching vectors from Qdrant...")
vectors = fetch_qdrant_vectors(qdrant_client, collection_name)
print(f"Found {len(vectors)} vectors to migrate")

# Step 2: push them to the Chroma Cloud `collection`; `uploaded_count` is the number of
# records in batches that were added without an exception.
print("\nUploading to Chroma Cloud with metadata truncation...")
uploaded_count = chunked_upload(collection, vectors)

In [None]:
# Sanity check: query Chroma with the first fetched vector and print the 3 results requested.
# NOTE(review): `vectors[0]` raises IndexError when nothing was fetched from Qdrant.
results = collection.query(
    query_embeddings=[vectors[0]["vector"]],
    n_results=3
)
print(results)


In [None]:
# Check metadata sizes to identify the problem

def check_metadata_sizes(vectors):
    """Report the UTF-8 size of each metadata value for the first 10 records.

    Every inspected key is printed with its byte size. Values larger than
    256 bytes are additionally collected and returned.

    Args:
        vectors: List of dicts; each may carry a ``"metadata"`` dict.

    Returns:
        A list of dicts with the keys ``'vector_index'``, ``'key'``, ``'size'``
        and ``'value_preview'`` (the first 100 characters followed by ``"..."``
        when the value is longer than 100 characters, otherwise the whole value).
    """
    flagged = []

    # Only the first 10 records are sampled for the analysis.
    for position, record in enumerate(vectors[:10]):
        for field, raw in record.get("metadata", {}).items():
            # Sizes are measured on the string form of the value.
            text = raw if isinstance(raw, str) else str(raw)
            n_bytes = len(text.encode('utf-8'))

            print(f"Vector {position}, Key: '{field}', Size: {n_bytes} bytes")

            if n_bytes <= 256:
                continue

            preview = text if len(text) <= 100 else text[:100] + "..."
            flagged.append({
                'vector_index': position,
                'key': field,
                'size': n_bytes,
                'value_preview': preview,
            })

    return flagged

# Check the metadata sizes
# Runs the size check on the fetched vectors and prints one summary line plus a value
# preview for every entry the check flagged.
problematic = check_metadata_sizes(vectors)
print(f"\nFound {len(problematic)} problematic metadata entries:")
for entry in problematic:
    print(f"Vector {entry['vector_index']}: '{entry['key']}' = {entry['size']} bytes")
    print(f"Preview: {entry['value_preview']}\n")

In [None]:

# Read the collection back from Chroma with collection.get() and report how many ids it returned.
# NOTE(review): the success messages below are printed unconditionally — the count is not
# compared against the number of vectors fetched from Qdrant or against `uploaded_count`.
collection_info = collection.get()
print(f" Migration Successful!")
print(f"Total vectors in Chroma: {len(collection_info['ids'])}")

# Show a sample of the truncated metadata
if collection_info['metadatas']:
    # First returned record only; each value is shown with its UTF-8 size and a 50-character preview.
    sample_metadata = collection_info['metadatas'][0]
    print(f"\nSample metadata (truncated to fit Chroma limits):")
    for key, value in sample_metadata.items():
        byte_size = len(str(value).encode('utf-8'))
        print(f"  {key}: {byte_size} bytes - {str(value)[:50]}{'...' if len(str(value)) > 50 else ''}")
        
print(f" Successfully migrated {len(collection_info['ids'])} vectors from Qdrant to Chroma!")

Chroma to Qdrant

In [None]:

# Fail fast if a credential needed by either side of the migration is missing.
if not qdrant_url or not qdrant_api_key:
    raise ValueError("Set QDRANT_URL and QDRANT_API_KEY in .env file")
if not chroma_api_key or not chroma_tenant:
    # Name only the settings that are actually checked here; the Chroma database name
    # is a literal in this notebook and is not read from the environment.
    raise ValueError("Set CHROMA_API_KEY and CHROMA_TENANT in .env file")

In [None]:
# Cell 3: Setup logging and initialize clients
# Fetch the "migration" logger and configure root logging at INFO level so the progress
# messages of this section are shown. `logging` is imported in an earlier cell.
logger = logging.getLogger("migration")
logging.basicConfig(level=logging.INFO)

In [None]:
# Initialize Qdrant client
# Same arguments as the Qdrant client created earlier in the notebook: URL and API key from
# the environment, gRPC requested as the preferred transport.
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
    prefer_grpc=True
)
logger.info("Qdrant Cloud client initialized")

In [None]:
# Cell 4: Setup collections
# Source: the Chroma collection that the first half of the notebook wrote to.
chroma_collection_name = "migrated_vectors"
# Target: a separate Qdrant collection, distinct from the "vector_db" collection used earlier.
qdrant_collection_name = "chroma_to_qdrant"
# Vector size passed to VectorParams when the target collection is created.
vector_dimension = 384

In [None]:
# Create Qdrant collection if it doesn't exist
if not qdrant_client.collection_exists(qdrant_collection_name):
    # Cosine distance with `vector_dimension`-sized vectors.
    qdrant_client.create_collection(
        collection_name=qdrant_collection_name,
        vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE)
    )
    # Only report creation when the collection was actually created.
    logger.info(f"Created Qdrant collection: {qdrant_collection_name}")
else:
    logger.info(f"Qdrant collection already exists: {qdrant_collection_name}")

In [None]:
# Get Chroma collection
# Opened through the Chroma Cloud client; this rebinds `chroma_collection`, which an earlier
# cell assigned from the local persistent client.
# NOTE(review): get_or_create_collection is used, so a missing source collection would
# presumably be created empty rather than reported — confirm this is intended.
chroma_collection = chroma_client.get_or_create_collection(name=chroma_collection_name)
logger.info(f"Accessed Chroma collection: {chroma_collection_name}")


In [None]:
# Fetch vectors from Chroma
def fetch_chroma_vectors(collection, batch_size=100):
    """Read all records of a Chroma collection page by page.

    Pages are requested with ``collection.get(limit=batch_size, offset=...)``,
    including embeddings and metadatas, until a page comes back with no ids.
    Progress is logged after every page.

    Args:
        collection: Object exposing a Chroma-style ``get`` method.
        batch_size: Page size, also used as the offset increment.

    Returns:
        A list of dicts with the keys ``"id"``, ``"vector"`` and ``"metadata"``
        (``{}`` when the stored metadata is empty or ``None``).
    """
    gathered = []
    position = 0

    while True:
        page = collection.get(
            include=["embeddings", "metadatas"],
            limit=batch_size,
            offset=position,
        )

        # A page without ids marks the end of the collection.
        if not page['ids']:
            return gathered

        gathered.extend(
            {
                "id": page['ids'][row],
                "vector": page['embeddings'][row],
                "metadata": page['metadatas'][row] or {},
            }
            for row in range(len(page['ids']))
        )

        position += batch_size
        logger.info(f"Fetched {len(gathered)} vectors so far...")

In [None]:
# Metadata truncation function
def truncate_metadata(metadata, max_bytes=250):
    """Return a copy of `metadata` with every value stringified and capped at `max_bytes`.

    Args:
        metadata: Mapping of metadata keys to values. ``None`` or an empty
            mapping yields ``{}``.
        max_bytes: Maximum UTF-8 encoded size of each returned value, including
            the trailing ``"..."`` marker appended to shortened values.

    Returns:
        A new dict with the same keys. Non-string values are converted with
        ``str()``. Values whose UTF-8 encoding is longer than `max_bytes` are cut
        on a character boundary and suffixed with ``"..."`` so the final value
        never exceeds `max_bytes` bytes.
    """
    marker = "..."
    cleaned_metadata = {}

    for key, value in (metadata or {}).items():
        value_str = value if isinstance(value, str) else str(value)
        encoded = value_str.encode('utf-8')

        if len(encoded) > max_bytes:
            # Reserve room for the marker inside the limit; drop the marker entirely
            # when the limit is too small to hold it plus at least one byte of content.
            suffix = marker if max_bytes > len(marker) else ""
            budget = max(max_bytes - len(suffix), 0)
            # Slicing bytes may split a multi-byte character; errors='ignore'
            # discards the incomplete trailing bytes.
            value_str = encoded[:budget].decode('utf-8', errors='ignore') + suffix

        cleaned_metadata[key] = value_str

    return cleaned_metadata

In [None]:
# Upload vectors to Qdrant
from qdrant_client.http.models import PointStruct

def chunked_upload_to_qdrant(client, collection_name, vectors, batch_size=100):
    """Upsert `vectors` into a Qdrant collection in batches of `batch_size`.

    Each record becomes a `PointStruct` whose payload is the record's metadata
    passed through `truncate_metadata`. A batch whose upsert raises is logged
    as an error and skipped; the remaining batches are still attempted.

    Args:
        client: Object exposing an ``upsert(collection_name=, points=)`` method.
        collection_name: Target collection name.
        vectors: List of dicts with the keys ``"id"``, ``"vector"`` and ``"metadata"``.
        batch_size: Number of points sent per ``upsert`` call.

    Returns:
        The number of records in the batches that were upserted without an exception.
    """
    uploaded = 0
    total = len(vectors)

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        batch = vectors[start:start + batch_size]

        points = []
        for record in batch:
            payload = truncate_metadata(record["metadata"])
            points.append(
                PointStruct(id=record["id"], vector=record["vector"], payload=payload)
            )

        try:
            client.upsert(collection_name=collection_name, points=points)
            uploaded += len(batch)
            logger.info(f"Uploaded {uploaded} / {total} vectors to Qdrant ")
        except Exception as e:
            # Best effort: log the failed batch and move on to the next one.
            logger.error(f"Error uploading batch {batch_no}: {e}")
            continue

    return uploaded

In [None]:
#Execute migration
# Step 1: read every record (id, embedding, metadata) of the Chroma collection into memory.
# This rebinds `vectors`, which earlier held the points fetched from Qdrant.
logger.info("Fetching vectors from Chroma...")
vectors = fetch_chroma_vectors(chroma_collection)
logger.info(f"Found {len(vectors)} vectors to migrate")

# Step 2: upsert them into the target Qdrant collection; `uploaded_count` is the number of
# records in batches that were upserted without an exception.
logger.info("\nUploading to Qdrant with metadata truncation...")
uploaded_count = chunked_upload_to_qdrant(qdrant_client, qdrant_collection_name, vectors)

In [None]:

# Collection metadata is still fetched so `collection_info` stays available for inspection.
collection_info = qdrant_client.get_collection(qdrant_collection_name)
# Report an exact point count obtained from count(exact=True) instead of
# `collection_info.vectors_count`: that field is a deprecated, approximate figure that
# may be None or missing in current qdrant-client releases.
qdrant_point_total = qdrant_client.count(
    collection_name=qdrant_collection_name,
    exact=True,
).count
print("Migration Successful!")
print(f"Total vectors in Qdrant: {qdrant_point_total}")

In [None]:
# Show what the payload of the first migrated record looks like after truncation.
# The truncation is recomputed locally from the fetched Chroma metadata; nothing is read
# back from Qdrant here.
if vectors:
    sample_vector = vectors[0]
    print(f"\nSample metadata (truncated to fit Qdrant limits):")
    cleaned_metadata = truncate_metadata(sample_vector['metadata'])
    for key, value in cleaned_metadata.items():
        # UTF-8 size of the value plus a preview of at most 50 characters.
        byte_size = len(str(value).encode('utf-8'))
        print(f"  {key}: {byte_size} bytes - {str(value)[:50]}{'...' if len(str(value)) > 50 else ''}")
        
# `uploaded_count` comes from the upload cell above.
print(f"Successfully migrated {uploaded_count} vectors from Chroma to Qdrant!")