In [None]:
# Install Dependencies

!pip install -q sentence-transformers pymongo python-dotenv tqdm

In [None]:
# Import Dependencies

from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
from dotenv import load_dotenv
from tqdm import tqdm
import os

In [None]:
# Read the MongoDB connection settings from the environment.

# Populate os.environ from a .env file (if any) before reading the settings.
load_dotenv()

MONGO_URI, DB_NAME, COLLECTION_NAME = (
    os.getenv(env_key)
    for env_key in ("MONGO_URI", "DATABASE_NAME", "COLLECTION_NAME")
)

# Fail fast: every later cell needs all three settings to be non-empty.
if not (MONGO_URI and DB_NAME and COLLECTION_NAME):
    raise ValueError("Missing environment variables. Check your .env file")

In [None]:
# Connect to MongoDB

client = MongoClient(MONGO_URI)

# MongoClient connects lazily, so constructing it does not prove the server is
# reachable. A cheap "ping" forces a round trip and raises here (rather than in
# a later cell) if the URI, credentials, or network are wrong.
client.admin.command("ping")

db = client[DB_NAME]
collection = db[COLLECTION_NAME]

print("Connected to MongoDB!")

In [None]:
# Load the embedding model

# Name of the sentence-transformers checkpoint used for all embeddings below.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Embedding model loaded successfully")

In [None]:
# Sample corpus: three short sentences that serve as the documents to embed.

dataset = [
    "Artificial intelligence is transforming the healthcare industry",
    "Machine learning models are improving risk assessment",
    "Adaptive learning systems personalize education",
]

# Report how many documents are available for embedding.
print(f"Loaded {len(dataset)} documents")

In [None]:
# Embed the dataset and store it in the vector DB

# Encode all documents in a single batched call instead of invoking the model
# once per text. Embeddings are normalized at encode time. The guard keeps the
# cell a no-op for an empty dataset.
embeddings = model.encode(dataset, normalize_embeddings=True) if dataset else []

for text, vector in tqdm(
    zip(dataset, embeddings),
    total=len(dataset),  # zip() has no length, so tell tqdm explicitly
    desc="Uploading to MongoDB",
):
    record = {
        "text": text,
        "embedding": vector.tolist(),  # plain list of floats for storage
    }

    # Upsert keyed on the text: re-running this cell updates the existing
    # document for a given text instead of inserting a duplicate.
    collection.update_one({"text": text}, {"$set": record}, upsert=True)

print("All documents embedded and uploaded successfully!")

In [None]:
# Embed the query.
# Normalize it the same way the stored document embeddings were normalized so
# query and document vectors are comparable whatever similarity function the
# index is configured with.
query_text = "Artificial intelligence"
query_embedding = model.encode(query_text, normalize_embeddings=True).tolist()

# Run vector search.
# Note: the similarity function is not a field of the $vectorSearch stage; it
# is part of the vector index definition.
pipeline = [
    {
        "$vectorSearch": {
            "index": "vector_index",  # use your actual index name if different
            "path": "embedding",  # document field holding the stored vector
            "queryVector": query_embedding,
            "numCandidates": 100,  # candidates considered before ranking
            "limit": 3,  # number of documents returned
        }
    },
    {
        # Return only the text and the similarity score of each match.
        "$project": {
            "_id": 0,
            "text": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }
]

results = list(collection.aggregate(pipeline))

# Print the top matches
MAX_PREVIEW_CHARS = 150

print("🔍 Top Retrieved Results:")
if not results:
    print("No results returned. Check that the vector index exists and has finished building.")
for result in results:
    text = result["text"]
    # Only add an ellipsis when the text was actually cut off.
    preview = text if len(text) <= MAX_PREVIEW_CHARS else text[:MAX_PREVIEW_CHARS] + "..."
    print(f"Score: {result['score']:.4f} | Text: {preview}")