# Build Qdrant index for action cards

## Environment setup

In [1]:
!pip -q install -U qdrant-client sentence-transformers orjson


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/377.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.2/377.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Load action cards

In [17]:
from pathlib import Path
import orjson

CARDS_PATH = Path("/content/drive/MyDrive/VibeQ-EIE/llmdata/RAG/action_cards_27.jsonl")  # update if yours is in Drive

cards = []
with CARDS_PATH.open("rb") as f:
    for line in f:
        if not line.strip():
            continue
        cards.append(orjson.loads(line))

len(cards), list(cards[0].keys())


(27,
 ['doc_type',
  'id',
  'title',
  'summary',
  'action_type',
  'time_minutes',
  'effort',
  'setting',
  'privacy',
  'risk_level',
  'emotion_targets',
  'when_to_use',
  'when_to_avoid',
  'steps',
  'micro_script',
  'embedding_version',
  'embedding_language',
  'embedding_text'])

## Build embedding text

In [18]:
def make_embedding_text(card: dict) -> str:
    # keep it deterministic and “semantic”
    title = card.get("title", "")
    when = " | ".join(card.get("when_to_use", []) or [])
    avoid = " | ".join(card.get("when_to_avoid", []) or [])
    steps = " | ".join(card.get("steps", []) or [])
    setting = " | ".join(card.get("setting", []) or [])
    action_type = " | ".join(card.get("action_type", []) or [])
    time_m = card.get("time_minutes", [])
    time_str = f"{time_m}" if time_m is not None else ""
    targets = card.get("emotion_targets", {}) or {}
    targets_str = ", ".join([f"{k}:{round(float(v),2)}" for k,v in targets.items()])

    return (
        f"TITLE: {title}\n"
        f"ACTION_TYPE: {action_type}\n"
        f"SETTING: {setting}\n"
        f"TIME_MINUTES: {time_str}\n"
        f"WHEN_TO_USE: {when}\n"
        f"WHEN_TO_AVOID: {avoid}\n"
        f"EMOTION_TARGETS: {targets_str}\n"
        f"STEPS: {steps}\n"
    ).strip()

for c in cards:
    c["embedding_text"] = make_embedding_text(c)


## Load embedding model

In [19]:
from sentence_transformers import SentenceTransformer

EMB_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedder = SentenceTransformer(EMB_MODEL)

# quick check
vec = embedder.encode(["hello world"], normalize_embeddings=True)
vec.shape


(1, 384)

## Initialize Qdrant collection

In [20]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

QDRANT_PATH = "/content/drive/MyDrive/VibeQ-EIE/DB/qdrant_db"   # persists during runtime; copy to Drive if you want
COLLECTION = "action_cards"

client = QdrantClient(path=QDRANT_PATH)

# recreate collection
client.recreate_collection(
    collection_name=COLLECTION,
    vectors_config=VectorParams(
        size=embedder.get_sentence_embedding_dimension(),
        distance=Distance.COSINE,
    )
)


  client.recreate_collection(


True

## Encode and upsert vectors

In [21]:
from qdrant_client.http.models import PointStruct
import numpy as np

texts = [c["embedding_text"] for c in cards]
vectors = embedder.encode(texts, normalize_embeddings=True, batch_size=64, show_progress_bar=True)

points = []
for i, c in enumerate(cards):
    payload = dict(c)
    payload["_string_id"] = c.get("id", f"card_{i}")  # keep original string id
    points.append(PointStruct(
        id=i,  # ✅ int IDs are valid in local Qdrant
        vector=vectors[i].tolist() if isinstance(vectors[i], np.ndarray) else list(vectors[i]),
        payload=payload
    ))

client.upsert(collection_name=COLLECTION, points=points)
print("Count:", client.count(collection_name=COLLECTION, exact=True).count)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Count: 27


## Test semantic query

In [22]:
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

query = "I feel overwhelmed and panicky, my heart is racing and I can't focus."
qvec = embedder.encode([query], normalize_embeddings=True)[0].tolist()

hits = client.query_points(
    collection_name=COLLECTION,
    query=qvec,
    limit=5,
).points

[(h.id, round(h.score, 3), h.payload.get("title")) for h in hits]




[(0, 0.327, 'Box Breathing (4×4)'),
 (2, 0.314, 'Worry Postponement (10-minute window)'),
 (1, 0.271, '5-4-3-2-1 Grounding'),
 (23, 0.264, 'Curiosity Prompt (What’s True Right Now?)'),
 (6, 0.255, 'Micro-Activation (2-minute start)')]