In [2]:
# --- Sample SAMPLE_SIZE owners from SOURCE_COLLECTION ("owner_agent_listings") and copy them to TARGET_COLLECTION ---

import os
import random
from typing import List, Union

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, Record
from qdrant_client.http import exceptions as qexc

# ---------- Config ----------
# Connection settings are read from the environment so that credentials never
# live in source control.
# NOTE(review): an API key was previously hard-coded on the QDRANT_KEY line;
# treat that key as compromised and rotate it.
QDRANT_URL  = os.environ.get(
    "QDRANT_URL",
    "https://3cf2848d-0574-468d-a996-0efabdea92b9.us-west-1-0.aws.cloud.qdrant.io",
)
QDRANT_KEY  = os.environ.get("QDRANT_API_KEY")
if not QDRANT_KEY:
    raise RuntimeError(
        "QDRANT_API_KEY is not set; export it before running this cell."
    )

SOURCE_COLLECTION = "owner_agent_listings"
TARGET_COLLECTION = "sampled_owner_agent_listings12-50"
SAMPLE_SIZE       = 50    # number of points copied into the target collection
ID_SCROLL_BATCH   = 1000  # page size used while scrolling source IDs
RANDOM_SEED       = 12    # fixed seed so the same sample is drawn on every run
RECREATE_TARGET   = True  # set False if you want to append without dropping

# ---------- Connect ----------
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_KEY)

def scroll_all_ids(collection: str, batch: int = 1000) -> List[Union[int, str]]:
    """Return the ID of every point in ``collection``.

    The collection is paged through with ``client.scroll``; payloads and
    vectors are not requested, so only IDs travel over the wire.

    Args:
        collection: Name of the collection to scan.
        batch: Maximum number of points requested per scroll page.

    Returns:
        All point IDs in the order the scroll API yields them, each kept in
        the type the client returned it in.
    """
    ids: List[Union[int, str]] = []
    next_page = None
    while True:
        recs, next_page = client.scroll(
            collection_name=collection,
            with_payload=False,
            with_vectors=False,
            limit=batch,
            offset=next_page,
        )
        # Keep the native ID type. Qdrant point IDs are unsigned integers or
        # UUID strings; stringifying an integer ID (42 -> "42") produces a
        # value that is neither, so later retrieve/upsert calls would be
        # rejected. UUID IDs already arrive as strings and are unaffected.
        ids.extend(r.id for r in recs)
        # A ``None`` offset means the last page has been consumed.
        if next_page is None:
            break
    return ids

def retrieve_with_vectors(collection: str, ids: List[str]) -> List[Record]:
    """Fetch the points identified by ``ids`` from ``collection``.

    Both the payload and the stored vector(s) are requested for every point,
    and the result of ``client.retrieve`` is returned unchanged.
    """
    fetch_options = {"with_payload": True, "with_vectors": True}
    return client.retrieve(collection_name=collection, ids=ids, **fetch_options)

def recreate_collection_like(target: str, vector_size: int, distance=Distance.COSINE):
    """Make sure ``target`` exists, optionally dropping it first.

    When the module-level ``RECREATE_TARGET`` flag is true the collection is
    deleted before it is (re)created. Otherwise an existing collection is
    left as-is and reused; its vector configuration is not checked against
    ``vector_size`` / ``distance``.

    Args:
        target: Name of the collection to create.
        vector_size: Dimensionality of the single unnamed vector.
        distance: Distance metric for the new collection.

    Raises:
        qexc.UnexpectedResponse: For any server error other than HTTP 404
            ("collection not found") while deleting or looking up ``target``.
    """
    # Drop if exists (optional)
    if RECREATE_TARGET:
        try:
            client.delete_collection(collection_name=target)
        except qexc.UnexpectedResponse as err:
            # Only "not found" is safe to ignore. Swallowing auth or server
            # errors here would leave stale data in place while the caller
            # believes the collection was recreated.
            if err.status_code != 404:
                raise

    # Create only when the collection is missing
    try:
        client.get_collection(collection_name=target)
    except qexc.UnexpectedResponse as err:
        if err.status_code != 404:
            raise
    else:
        # Already present (append mode): nothing to create.
        return

    client.create_collection(
        collection_name=target,
        vectors_config=VectorParams(size=vector_size, distance=distance),
    )

# ---------- 1) Get ALL owner IDs (cheap) ----------
# Only IDs are scrolled here; payloads and vectors are fetched later for the
# sampled subset.
owner_ids = scroll_all_ids(SOURCE_COLLECTION, batch=ID_SCROLL_BATCH)
total = len(owner_ids)
if not owner_ids:
    raise RuntimeError("Source collection has no points.")

print(f"Found {total} owner IDs in '{SOURCE_COLLECTION}'.")

# ---------- 2) Sample 50 deterministically ----------
# A dedicated, seeded generator shuffles the list in place; the sample is the
# head of the shuffled list. Slicing clamps to the list length, so a
# collection smaller than SAMPLE_SIZE is simply taken whole.
random.Random(RANDOM_SEED).shuffle(owner_ids)
sample_ids = owner_ids[:SAMPLE_SIZE]
print(f"Sampling {len(sample_ids)} owners.")

# ---------- 3) Retrieve sampled records with payload + vectors ----------
records = retrieve_with_vectors(SOURCE_COLLECTION, sample_ids)
if len(records) == 0:
    raise RuntimeError("Failed to retrieve sampled records with vectors.")

# Infer vector size from first record
first_vec = records[0].vector
if first_vec is None:
    raise RuntimeError("Sampled record has no vector; cannot infer vector size.")

# Named vectors arrive as a dict. The dict case must be tested explicitly:
# len() of a dict succeeds and returns the number of vector names, so relying
# on a TypeError fallback would silently yield a wrong dimension.
if isinstance(first_vec, dict):
    if not first_vec:
        raise RuntimeError("Sampled record has an empty named-vector mapping.")
    # Pick the first named vector, as the sizes are taken from one entry only.
    first_key = next(iter(first_vec))
    vector_size = len(first_vec[first_key])
    # NOTE(review): the target collection is created with a single unnamed
    # vector, so a named-vector source still needs a matching vectors_config
    # before the upsert can succeed - confirm if named vectors are in use.
else:
    vector_size = len(first_vec)  # single-vector collection

print(f"Inferred vector size: {vector_size}")

# ---------- 4) Create target collection (fresh) ----------
recreate_collection_like(TARGET_COLLECTION, vector_size=vector_size, distance=Distance.COSINE)

# ---------- 5) Upsert sampled points ----------
# Each record is copied as-is. The point ID is forwarded in its native type:
# Qdrant IDs are unsigned integers or UUID strings, and str() on an integer ID
# would produce a string that is neither. A missing payload becomes {}.
points = [
    PointStruct(id=r.id, vector=r.vector, payload=r.payload or {})
    for r in records
]

# wait=True blocks until the server has applied the write, so the count check
# that follows sees the new points.
client.upsert(collection_name=TARGET_COLLECTION, points=points, wait=True)
print(f"✅ Upserted {len(points)} points into '{TARGET_COLLECTION}'.")

# ---------- 6) Verify counts ----------
# Best-effort sanity check: a failure here must not abort the cell (the copy
# has already completed), but it is reported instead of being hidden.
try:
    src_count = client.count(collection_name=SOURCE_COLLECTION, exact=True).count
    tgt_count = client.count(collection_name=TARGET_COLLECTION, exact=True).count
except Exception as exc:
    print(f"Could not verify counts: {exc!r}")
else:
    print(f"Source count: {src_count} | Target count: {tgt_count}")


Found 1012 owner IDs in 'owner_agent_listings'.
Sampling 50 owners.
Inferred vector size: 1536
✅ Upserted 50 points into 'sampled_owner_agent_listings12-50'.
Source count: 1012 | Target count: 50
