In [1]:
# Import the Pinecone library
from pinecone import Pinecone
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec 
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Access environment variables as if they came from the actual environment
PINECONE_API = os.getenv('PINECONE')
# Read for completeness; not referenced by the cells visible in this notebook.
HUGGING_FACE_API_TOKEN = os.getenv('HUGGING_FACE_API_TOKEN')

# Fail fast when the Pinecone key is missing, in the same way the Qdrant
# credentials are checked further down, instead of passing None to the client.
if not PINECONE_API:
    raise ValueError("set PINECONE in .env file")

In [3]:
# Initialize a Pinecone client with your API key.
# (A previous `import pinecone as pc` was removed here: the alias was rebound to
# this client on the very next statement, so the module was never usable as `pc`.)
pc = Pinecone(api_key=PINECONE_API)

# Describe the source index; as the cell's last expression the description is
# rendered as the cell output.
pc.describe_index(name="db")

{
    "name": "db",
    "metric": "cosine",
    "host": "db-vv4b40r.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [4]:
# Handle to the existing source index.
index = pc.Index("db")

# Iterate over what index.list(...) yields and flatten every page of IDs into
# one list.
# NOTE(review): `limit` looks like a per-page size rather than a total cap;
# confirm against the Pinecone SDK docs.
all_ids = []
for page in index.list(
    limit=100,
    namespace=None,  # adjust namespace as needed
):
    all_ids += page
print(f"Found {len(all_ids)} vector IDs.")
def fetch_vectors(index, ids, batch_size=100, namespace=None):
    """Fetch vectors by ID from an index in fixed-size batches.

    Args:
        index: Object exposing ``fetch(ids=...)`` whose response has a
            ``vectors`` mapping of ID -> object with ``values`` and
            ``metadata`` attributes.
        ids: Sequence of vector IDs to fetch (must support ``len`` and slicing).
        batch_size: Maximum number of IDs sent per ``fetch`` call. Must be > 0.
        namespace: Optional namespace forwarded to ``index.fetch``. When it is
            ``None`` (the default) the argument is not passed at all, so the
            call is identical to the original two-argument behaviour.

    Returns:
        A list of dicts with the keys ``"id"``, ``"vector"`` and ``"metadata"``;
        missing/empty metadata is normalised to ``{}``.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # A negative step would make range() empty and silently return nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Only forward the namespace when the caller asked for one.
    fetch_kwargs = {} if namespace is None else {"namespace": namespace}

    all_vectors = []
    for start in range(0, len(ids), batch_size):
        batch_ids = ids[start:start + batch_size]
        response = index.fetch(ids=batch_ids, **fetch_kwargs)

        all_vectors.extend(
            {
                "id": vid,
                "vector": vector.values,
                "metadata": vector.metadata or {},
            }
            for vid, vector in response.vectors.items()
        )
    return all_vectors

# Fetch values and metadata for every ID collected above; the result is a list
# of dicts with the keys "id", "vector" and "metadata".
vectors = fetch_vectors(index, all_ids)


Found 100 vector IDs.


In [5]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
import os
from dotenv import load_dotenv
from tqdm import tqdm
import logging

# Configure root logging at INFO level and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Refresh the environment from .env, then read the Qdrant Cloud settings.
load_dotenv()
qdrant_url, qdrant_api_key = (
    os.getenv(setting) for setting in ("QDRANT_URL", "QDRANT_API_KEY")
)

# Both settings are required; stop here if either one is missing or empty.
if not (qdrant_url and qdrant_api_key):
    raise ValueError("set QDRANT_URL and QDRANT_API_KEY in .env file")


In [6]:
# Build the Qdrant Cloud client from the URL and API key read from the
# environment, with the gRPC transport preference switched on.
qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key, prefer_grpc=True)
logger.info("Qdrant Cloud client initialized")

INFO:__main__:Qdrant Cloud client initialized


In [7]:
def migrate_to_qdrant(vectors, collection_name="vector_db", vector_dimension=384, batch_size=100):
    """
    Migrate embeddings to Qdrant Cloud and run a small sanity query.

    Relies on the module-level ``qdrant_client`` and ``logger``.

    Args:
        vectors: List of dicts with 'id', 'vector', and 'metadata'.
        collection_name: Qdrant collection name. The collection is created
            with cosine distance if it doesn't exist yet.
        vector_dimension: Expected vector dimension (default: 384). Vectors of
            any other length are skipped and reported in a warning.
        batch_size: Number of points to upsert per batch. Must be > 0.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if no vector has the
            expected dimension.
    """
    # A negative step would make the upsert loop a silent no-op.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Prepare points first, so bad input is rejected before anything is
    # created on the Qdrant side.
    # NOTE(review): Qdrant point IDs must be unsigned integers or UUIDs, so
    # str(v["id"]) only works when the source IDs are UUID strings; confirm
    # before reusing this with other ID schemes.
    points = []
    skipped_ids = []
    for v in vectors:
        if len(v["vector"]) != vector_dimension:
            skipped_ids.append(v["id"])
            continue
        points.append(
            PointStruct(
                id=str(v["id"]),
                vector=v["vector"],
                payload=v["metadata"],
            )
        )

    # Make dropped vectors visible instead of silently losing them.
    if skipped_ids:
        logger.warning(
            f"Skipped {len(skipped_ids)} vector(s) whose dimension is not "
            f"{vector_dimension}; first IDs: {skipped_ids[:5]}"
        )
    if not points:
        raise ValueError("No valid vectors to migrate")

    # Create collection if it doesn't exist
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE)
        )
        logger.info(f"Created collection: {collection_name}")

    # Upsert points in batches with progress bar
    for i in tqdm(range(0, len(points), batch_size), desc="Migrating vectors"):
        qdrant_client.upsert(collection_name=collection_name, points=points[i:i + batch_size])

    # Report what this call upserted separately from the collection total,
    # which also includes any points that were already in the collection.
    logger.info(f"Migrated {len(points)} vectors to Qdrant")
    count = qdrant_client.get_collection(collection_name).points_count
    logger.info(f"Collection {collection_name} now reports {count} points")

    # Sample query to verify: search with the first migrated vector
    # (`points` is guaranteed non-empty at this point).
    query_vector = points[0].vector
    results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_vector,
        limit=3
    ).points
    logger.info("Sample query results:")
    for r in results:
        logger.info(f"ID: {r.id}, Score: {r.score}, Metadata: {r.payload}")

In [9]:
# Migrate the fetched vectors into the "vector_db" collection, expecting
# 384-dimensional embeddings.
migrate_to_qdrant(vectors=vectors, collection_name="vector_db", vector_dimension=384)

Migrating vectors: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
INFO:__main__:Migrated 100 vectors to Qdrant
INFO:__main__:Sample query results:
INFO:__main__:ID: 0f09e208-4b1d-4a60-96d4-ae7513bcc121, Score: 1.0000001192092896, Metadata: {'creator': 'Microsoft® Word 2016', 'moddate': '2018-03-12T10:24:10-04:00', 'total_pages': 11.0, 'creationdate': '2018-03-05T09:43:57+01:00', 'producer': 'Microsoft® Word 2016', 'source': 'E:\\AGILEFORCE\\Vector Database Migration\\data\\research.pdf', 'author': 'agimeno', 'page': 3.0, 'page_label': '4', 'text': 'was chosen intentionally since it uses a set of prepared in advance guiding questions \nand prompts and interviewees are encouraged to elaborate on the problems raised \nduring it (Dörnyei, 2007). As Dörnyei (2007) explains, in this type of the interview “the \ninterviewer provides guidelines and direction (hence the ‘ -structured’ part in the name), \nbut is also keen to follow up interesting developments and to let the interviewee \nelabora

In [10]:
from qdrant_client import QdrantClient
import json

# --- Settings for reading the collection back out of Qdrant ---
# NOTE(review): QDRANT_HOST, QDRANT_PORT and `i` are not referenced by the
# cells visible here; kept in case a later cell relies on them.
QDRANT_HOST = qdrant_url
QDRANT_PORT = 6333
COLLECTION_NAME = "vector_db"
BATCH_SIZE = 100

# Accumulates one dict per point read from the collection.
qdrant_vectors = []
client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key, prefer_grpc=True)

scroll_offset = None
i = 1

# Page through the collection: each scroll call returns a batch of points plus
# the offset to pass to the next call.
while True:
    points, next_page_offset = client.scroll(
        collection_name=COLLECTION_NAME,
        limit=BATCH_SIZE,
        offset=scroll_offset,
        with_vectors=True,
        with_payload=True,
    )

    # An empty batch means there is nothing left to read.
    if not points:
        break

    qdrant_vectors += [
        dict(id=str(point.id), values=point.vector, metadata=point.payload or {})
        for point in points
    ]

    # Carry the returned offset forward; None marks the end of the collection.
    scroll_offset = next_page_offset
    if scroll_offset is None:
        break

qdrant_vectors

INFO:httpx:HTTP Request: GET https://a39cabe9-e04e-41bc-bfc8-c35e32b99b11.eu-west-2-0.aws.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"


[{'id': '00e7a53d-203c-4431-83da-f0e9eff495fa',
  'values': [-0.03925754874944687,
   0.008914606645703316,
   0.07649470865726471,
   -0.041093189269304276,
   0.0037982575595378876,
   -0.02512751892209053,
   0.08557821065187454,
   0.008515381254255772,
   0.001276019960641861,
   0.028080977499485016,
   0.07577931135892868,
   0.04818679019808769,
   0.002929170150309801,
   0.016215255483984947,
   0.09005613625049591,
   -0.036772552877664566,
   -0.03175448626279831,
   -0.04654426872730255,
   -0.017089277505874634,
   0.010164828039705753,
   -0.017039984464645386,
   0.02772396057844162,
   0.045789916068315506,
   -0.003539042314514518,
   0.056978702545166016,
   -0.030672401189804077,
   -0.01612294279038906,
   -0.06585817039012909,
   0.07240559160709381,
   0.02503957413136959,
   -0.005910573527216911,
   0.13024060428142548,
   0.0573824904859066,
   0.06281797587871552,
   -0.011233333498239517,
   -0.03089291974902153,
   0.012108626775443554,
   -0.01819775067269

In [11]:

# Pinecone client built from the API key loaded earlier.
pc = Pinecone(api_key=PINECONE_API)

# Ensure an index named "qdrant" exists: 384 dimensions, cosine metric,
# serverless on AWS us-east-1. Nothing is created when the name is already
# listed.
index_name = "qdrant"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )



In [12]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API)
INDEX_NAME = "qdrant"

# Derive the index dimension from the vectors read back from Qdrant. Fail with
# a clear message rather than an IndexError when nothing was read.
if not qdrant_vectors:
    raise ValueError("qdrant_vectors is empty; cannot derive the index dimension")
DIM = len(qdrant_vectors[0]["values"])

# Create index if it doesn't exist
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIM,
        metric="cosine",  # or 'dotproduct', etc.
        spec=ServerlessSpec(
            cloud="aws",         # or 'gcp'
            region="us-east-1"   # match your Pinecone project region
        )
    )
else:
    # The index may already exist with a hard-coded dimension; stop early if
    # it does not match the data.
    existing_dim = pc.describe_index(name=INDEX_NAME).dimension
    if existing_dim != DIM:
        raise ValueError(
            f"Index {INDEX_NAME} has dimension {existing_dim}, "
            f"but the vectors have dimension {DIM}"
        )

# NOTE: this rebinds `index`, which earlier referred to the source index "db".
index = pc.Index(INDEX_NAME)