# AGN Health Q&A Embedder - Google Colab

Notebook สำหรับสร้าง vector embeddings และ MongoDB Atlas Vector Search index

## ขั้นตอนการใช้งาน:
1. รัน Cell ติดตั้ง dependencies
2. ตั้งค่า environment variables
3. รัน embedder
4. สร้าง vector search index

---

## 1. ติดตั้ง Dependencies

In [None]:
# Install the Python packages used by this notebook (-q keeps pip output quiet)
!pip install sentence-transformers pymongo torch transformers python-dotenv -q

print("✅ Dependencies installed successfully!")

## 2. กำหนดค่า Configuration

In [None]:
# MongoDB Configuration
# The connection string embeds a username and password, so it must never be
# stored in the notebook itself. It is read from the MONGODB_URL environment
# variable; when that is not set, the user is asked for it through a hidden
# prompt so the value does not end up in the cell output either.
# NOTE: any credential that was ever saved inside a notebook should be rotated.
import os
from getpass import getpass

MONGODB_URL = os.environ.get("MONGODB_URL", "").strip()
if not MONGODB_URL:
    MONGODB_URL = getpass("MongoDB connection string (mongodb+srv://...): ").strip()
if not MONGODB_URL:
    # Fail here with a clear message instead of later inside MongoClient.
    raise ValueError(
        "MONGODB_URL is required: set the MONGODB_URL environment variable "
        "or enter the connection string at the prompt."
    )
MONGODB_DATABASE = "agn"
MONGODB_COLLECTION = "qa"

# Embedding Configuration
EMBEDDING_MODEL = "BAAI/bge-m3"
EMBEDDING_DIMENSION = 1024

# Vector Index Configuration
VECTOR_INDEX_NAME = "vector_index"

# The connection string is deliberately not echoed below.
print("✅ Configuration set successfully!")
print(f"🤖 Embedding Model: {EMBEDDING_MODEL}")
print(f"📏 Embedding Dimension: {EMBEDDING_DIMENSION}")
print(f"🗄️  Database: {MONGODB_DATABASE}.{MONGODB_COLLECTION}")

## 3. ตรวจสอบข้อมูลใน MongoDB

In [None]:
from pymongo import MongoClient

# Connect to MongoDB; the client is closed in `finally` so a failing query
# does not leave the connection open.
client = MongoClient(MONGODB_URL)
try:
    db = client[MONGODB_DATABASE]
    collection = db[MONGODB_COLLECTION]

    # Count all documents, and those with / without a stored embedding
    total_docs = collection.count_documents({})
    docs_with_embeddings = collection.count_documents({"contentVector": {"$exists": True}})
    docs_without_embeddings = collection.count_documents({"contentVector": {"$exists": False}})

    print(f"📊 Total documents: {total_docs}")
    print(f"✅ Documents with embeddings: {docs_with_embeddings}")
    print(f"⏳ Documents without embeddings: {docs_without_embeddings}")

    if total_docs == 0:
        print("\n⚠️  Warning: No documents found! Please run scraper first.")
    else:
        print(f"\n✅ Ready to generate embeddings for {docs_without_embeddings} documents!")
finally:
    client.close()

## 4. Embedder Code

In [None]:
import logging
from typing import List, Dict
import torch
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
from pymongo.operations import UpdateOne
import numpy as np

# Root logging setup: INFO and above, each record prefixed with a timestamp
# and its level name. The module-level logger is shared by the embedder code.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


class QAEmbedder:
    """Generate embeddings for Q&A documents and store them in MongoDB.

    Each document's ``topic`` and ``question`` fields are combined into one
    text, encoded with the configured sentence-transformers model, and the
    resulting vector is written to the document's ``contentVector`` field.
    The vector search index itself is not created by this class.
    """

    def __init__(self):
        """Open the MongoDB connection and load the embedding model.

        Raises:
            Exception: whatever the MongoDB or model setup raised. If the
                model setup fails, the already-opened MongoDB client is
                closed before the exception propagates.
        """
        self.mongo_client = None
        self.db = None
        self.collection = None
        self.embedding_model = None
        self._setup_mongodb()
        try:
            self._setup_embedding_model()
        except Exception:
            # The caller never receives the instance when __init__ raises,
            # so it could not call close() itself.
            self.close()
            raise

    def _setup_mongodb(self):
        """Connect to MongoDB and bind the configured database and collection.

        Raises:
            Exception: if the client cannot be created or the server does not
                answer the ping; the client is closed before re-raising.
        """
        try:
            self.mongo_client = MongoClient(MONGODB_URL)
            # MongoClient connects lazily, so force one round trip; otherwise
            # a bad URL or network problem would only surface on first query.
            self.mongo_client.admin.command("ping")
            self.db = self.mongo_client[MONGODB_DATABASE]
            self.collection = self.db[MONGODB_COLLECTION]
            logger.info("MongoDB connection established successfully")
        except Exception as e:
            logger.error(f"Failed to connect to MongoDB: {e}")
            self.close()
            raise

    def _setup_embedding_model(self):
        """Load the embedding model and verify its output dimension.

        Raises:
            Exception: if the model cannot be loaded or the probe encode fails.
            ValueError: if the model's embedding size differs from
                ``EMBEDDING_DIMENSION``.
        """
        try:
            logger.info(f"Loading embedding model: {EMBEDDING_MODEL}")
            self.embedding_model = SentenceTransformer(EMBEDDING_MODEL)

            # Encode a probe string to learn the real embedding size.
            test_embedding = self.embedding_model.encode("test", convert_to_numpy=True)
            actual_dim = len(test_embedding)
        except Exception as e:
            logger.error(f"Failed to load embedding model: {e}")
            raise

        # Stored vectors must match the configured dimension; fail before
        # writing anything rather than after the whole collection is embedded.
        if actual_dim != EMBEDDING_DIMENSION:
            message = (
                f"Embedding dimension mismatch: model {EMBEDDING_MODEL} produces "
                f"{actual_dim} dimensions but EMBEDDING_DIMENSION is {EMBEDDING_DIMENSION}"
            )
            logger.error(message)
            raise ValueError(message)

        logger.info(f"Embedding model loaded successfully with dimension: {actual_dim}")

    @staticmethod
    def _clean_text(value) -> str:
        """Return ``value`` as a stripped string; ``None`` becomes ``""``."""
        if value is None:
            return ""
        return str(value).strip()

    def create_combined_text(self, document: Dict) -> str:
        """Combine topic and question into a single text for embedding.

        Args:
            document: mapping that may contain ``topic`` and ``question``.
                Missing keys and ``None`` values are treated as empty.

        Returns:
            The labelled, non-empty parts joined by a newline, or ``""`` when
            both fields are empty.
        """
        topic = self._clean_text(document.get('topic'))
        question = self._clean_text(document.get('question'))

        parts = []
        if topic:
            parts.append(f"หัวข้อ: {topic}")
        if question:
            parts.append(f"คำถาม: {question}")

        return "\n".join(parts)

    def generate_embedding(self, text: str) -> List[float]:
        """Generate a normalized embedding for a single text.

        Args:
            text: the text to encode.

        Returns:
            The embedding as a list of floats. For empty text, or when
            encoding fails (the error is logged), a zero vector of length
            ``EMBEDDING_DIMENSION`` is returned instead.
        """
        if not text:
            return [0.0] * EMBEDDING_DIMENSION

        try:
            embedding = self.embedding_model.encode(
                text,
                convert_to_numpy=True,
                normalize_embeddings=True
            )
            return embedding.tolist()
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return [0.0] * EMBEDDING_DIMENSION

    def embed_documents(self, batch_size: int = 32):
        """Generate embeddings for every document that lacks ``contentVector``.

        Documents whose combined text is empty are skipped and stay without
        an embedding. Progress and final counts are logged.

        Args:
            batch_size: number of documents encoded and written per batch.

        Raises:
            Exception: any error raised while counting or iterating documents
                is logged and re-raised. Failures inside a single batch are
                logged by ``_process_batch`` and do not stop the run.
        """
        try:
            # Count total documents
            total_docs = self.collection.count_documents({})
            logger.info(f"Found {total_docs} documents to process")

            if total_docs == 0:
                logger.warning("No documents found in collection. Run scraper first.")
                return

            # Count documents without embeddings
            missing_filter = {"contentVector": {"$exists": False}}
            docs_without_embeddings = self.collection.count_documents(missing_filter)
            logger.info(f"Documents without embeddings: {docs_without_embeddings}")

            if docs_without_embeddings == 0:
                logger.info("All documents already have embeddings!")
                return

            processed = 0
            skipped = 0
            updated = 0

            batch_texts = []
            batch_ids = []

            # Fetch only the fields this method reads; _id is always included.
            cursor = self.collection.find(
                missing_filter,
                {"topic": 1, "question": 1, "thread_id": 1},
            )

            for doc in cursor:
                combined_text = self.create_combined_text(doc)

                if not combined_text:
                    # thread_id may be absent; fall back to _id for the log.
                    doc_label = doc.get('thread_id', doc['_id'])
                    logger.warning(f"Document {doc_label}: Empty text, skipping")
                    skipped += 1
                    continue

                batch_texts.append(combined_text)
                batch_ids.append(doc['_id'])

                # Flush the batch once it reaches batch_size
                if len(batch_texts) >= batch_size:
                    updated += self._process_batch(batch_ids, batch_texts)
                    processed += len(batch_texts)
                    logger.info(f"Progress: {processed}/{docs_without_embeddings} documents processed")

                    batch_texts = []
                    batch_ids = []

            # Flush the final, partially filled batch
            if batch_texts:
                updated += self._process_batch(batch_ids, batch_texts)
                processed += len(batch_texts)

            logger.info(f"Embedding completed! Processed: {processed}, Updated: {updated}, Skipped: {skipped}")

        except Exception as e:
            logger.error(f"Error during embedding process: {e}")
            raise

    def _process_batch(self, doc_ids: List, texts: List[str]) -> int:
        """Encode one batch of texts and write the vectors to MongoDB.

        Args:
            doc_ids: ``_id`` values, aligned position-by-position with ``texts``.
            texts: the combined texts to encode.

        Returns:
            The number of documents modified by the bulk write, or 0 when the
            batch failed (the error is logged and not re-raised).
        """
        try:
            # Encode the whole batch in a single model call
            embeddings = self.embedding_model.encode(
                texts,
                convert_to_numpy=True,
                normalize_embeddings=True,
                batch_size=len(texts)
            )

            # One update per document, sent together as a bulk write
            operations = [
                UpdateOne(
                    {"_id": doc_id},
                    {"$set": {"contentVector": embedding.tolist()}}
                )
                for doc_id, embedding in zip(doc_ids, embeddings)
            ]

            result = self.collection.bulk_write(operations)
            return result.modified_count

        except Exception as e:
            logger.error(f"Error processing batch: {e}")
            return 0

    def verify_embeddings(self):
        """Check how many documents have a ``contentVector``.

        Logs the coverage and the dimension of one sample vector, warning
        when that dimension differs from ``EMBEDDING_DIMENSION``.

        Returns:
            True only when every document in the collection has an embedding
            (documents skipped for empty text therefore make this False);
            False otherwise, including when verification itself fails.
        """
        try:
            total_docs = self.collection.count_documents({})
            docs_with_embeddings = self.collection.count_documents({
                "contentVector": {"$exists": True}
            })

            logger.info(f"Verification: {docs_with_embeddings}/{total_docs} documents have embeddings")

            if docs_with_embeddings > 0:
                # Check a sample document
                sample = self.collection.find_one({"contentVector": {"$exists": True}})
                if sample:
                    vector_length = len(sample['contentVector'])
                    logger.info(f"Sample embedding dimension: {vector_length}")
                    if vector_length != EMBEDDING_DIMENSION:
                        logger.warning(
                            f"Sample embedding dimension {vector_length} does not match "
                            f"EMBEDDING_DIMENSION {EMBEDDING_DIMENSION}"
                        )

            return docs_with_embeddings == total_docs

        except Exception as e:
            logger.error(f"Error during verification: {e}")
            return False

    def close(self):
        """Close the MongoDB client if one is open; safe to call repeatedly."""
        if self.mongo_client is not None:
            self.mongo_client.close()
            self.mongo_client = None
            logger.info("MongoDB connection closed")


print("✅ Embedder class loaded successfully!")

## 5. รัน Embedder

⚠️ **หมายเหตุ**: การสร้าง embeddings อาจใช้เวลา 10-30 นาที ขึ้นอยู่กับจำนวนข้อมูล

In [None]:
# Build the embedder, embed every pending document, then report coverage.
# `embedder` starts as None so the cleanup step can tell whether construction
# got far enough to need closing.
embedder = None
try:
    print("🚀 Starting embedder...")
    print("📥 Loading model and processing documents...\n")

    embedder = QAEmbedder()

    # Embedding pass
    print("\n🤖 Generating embeddings...")
    embedder.embed_documents(batch_size=32)

    # Coverage check
    print("\n🔍 Verifying embeddings...")
    success = embedder.verify_embeddings()

    print(
        "\n✅ All documents have embeddings!"
        if success
        else "\n⚠️  Some documents are missing embeddings"
    )

except Exception as exc:
    print(f"❌ Error: {exc}")
finally:
    if embedder is not None:
        embedder.close()
        print("🔒 Resources cleaned up")

## 6. สร้าง Vector Search Index

⚠️ **สำคัญ**: Vector Search Index ต้องสร้างใน MongoDB Atlas UI เนื่องจาก API มีข้อจำกัด

### ขั้นตอนการสร้าง Index ใน MongoDB Atlas:

1. ไปที่ [MongoDB Atlas Console](https://cloud.mongodb.com/)
2. เลือก Cluster ของคุณ
3. ไปที่ **Search** tab
4. คลิก **Create Search Index**
5. เลือก **JSON Editor**
6. วาง configuration ด้านล่าง:

```json
{
  "mappings": {
    "dynamic": true,
    "fields": {
      "contentVector": {
        "type": "knnVector",
        "dimensions": 1024,
        "similarity": "cosine"
      }
    }
  }
}
```

7. ตั้งชื่อ index: `vector_index`
8. เลือก Database: `agn`
9. เลือก Collection: `qa`
10. คลิก **Create Search Index**
11. รอ 5-10 นาทีให้ index build เสร็จ

In [None]:
# Show the configuration needed to create the Vector Search Index.
# The JSON is generated from the notebook's constants so it can never drift
# from EMBEDDING_DIMENSION (it used to be a hard-coded copy).
import json

index_definition = {
    "mappings": {
        "dynamic": True,
        "fields": {
            "contentVector": {
                "type": "knnVector",
                "dimensions": EMBEDDING_DIMENSION,
                "similarity": "cosine",
            }
        },
    }
}

print("📋 Vector Search Index Configuration:")
print("="*60)
print(f"Index Name: {VECTOR_INDEX_NAME}")
print(f"Database: {MONGODB_DATABASE}")
print(f"Collection: {MONGODB_COLLECTION}")
print("Field: contentVector")
print("Type: knnVector")
print(f"Dimensions: {EMBEDDING_DIMENSION}")
print("Similarity: cosine")
print("="*60)
print("\nJSON Configuration:")
print(json.dumps(index_definition, indent=2))
print("\n⚠️  Please create this index manually in MongoDB Atlas UI")
print("📖 See instructions in the cell above")

## 7. ตรวจสอบผลลัพธ์สุดท้าย

In [None]:
from pymongo import MongoClient

# The client is closed in `finally` so a failing query does not leave the
# connection open.
client = MongoClient(MONGODB_URL)
try:
    db = client[MONGODB_DATABASE]
    collection = db[MONGODB_COLLECTION]

    # Count all documents and those that have a stored embedding
    total_docs = collection.count_documents({})
    docs_with_embeddings = collection.count_documents({"contentVector": {"$exists": True}})

    print("📊 Final Statistics:")
    print("="*60)
    print(f"Total documents: {total_docs}")
    print(f"Documents with embeddings: {docs_with_embeddings}")
    # Guard the division: an empty collection reports 0% coverage
    print(f"Coverage: {(docs_with_embeddings/total_docs*100):.2f}%" if total_docs > 0 else "Coverage: 0%")
    print("="*60)

    # Show one sample document together with its embedding
    if docs_with_embeddings > 0:
        print("\n📄 Sample document with embedding:")
        sample = collection.find_one({"contentVector": {"$exists": True}})
        if sample:
            print(f"  Thread ID: {sample.get('thread_id')}")
            print(f"  Topic: {sample.get('topic')[:50]}..." if sample.get('topic') else "  Topic: N/A")
            print(f"  Question: {sample.get('question')[:50]}..." if sample.get('question') else "  Question: N/A")
            print(f"  Embedding dimension: {len(sample['contentVector'])}")
            print(f"  First 5 values: {sample['contentVector'][:5]}")
finally:
    client.close()

if docs_with_embeddings == total_docs and total_docs > 0:
    print("\n✅ All documents have embeddings! Ready for API usage.")
    print("\n📝 Next steps:")
    print("1. Create Vector Search Index in MongoDB Atlas (see section 6)")
    print("2. Run the FastAPI application (app.py) on your local machine or server")
    print("3. Test the chat endpoint at http://localhost:8001/chat")
else:
    print("\n⚠️  Not all documents have embeddings. Please check for errors above.")

## 📝 หมายเหตุ

### เมื่อเสร็จแล้ว:
1. ✅ ทุก document ที่มี `topic` หรือ `question` จะมี `contentVector` field (document ที่ข้อความว่างจะถูกข้ามและไม่มี embedding)
2. ✅ Embeddings เป็น array ขนาด 1024 dimensions
3. ✅ พร้อมสำหรับ Vector Search

### ขั้นตอนต่อไป:
1. สร้าง Vector Search Index ใน MongoDB Atlas UI (ตาม section 6)
2. รัน FastAPI application (`app.py`) บนเครื่องของคุณ
3. ทดสอบ API endpoint

### Tips:
- ถ้าหน่วยความจำไม่พอ ลด `batch_size` เป็น 16 หรือ 8
- Embeddings จะไม่ซ้ำ (มีการตรวจสอบก่อนสร้าง)
- สามารถรันใหม่ได้ (จะ skip documents ที่มี embeddings แล้ว)
- ตรวจสอบ logs เพื่อดูความคืบหน้า

### การแก้ปัญหา:
- **Out of Memory**: ลด batch_size ลงเหลือ 16 หรือ 8
- **Model download slow**: ใช้ Colab Pro หรือรอให้โหลดเสร็จ
- **MongoDB connection timeout**: ตรวจสอบ URL และ network connection