# AGN Health Q&A Scraper - Google Colab

Notebook สำหรับ scrape ข้อมูล Q&A จาก AGN Health Forums และบันทึกลง MongoDB Atlas

## ขั้นตอนการใช้งาน:
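1. รัน Cell ติดตั้ง dependencies
2. กำหนดค่า configuration (การเชื่อมต่อ MongoDB และช่วง thread ที่ต้องการ scrape)
3. รัน Cell โหลด scraper class แล้วรัน scraper
4. ตรวจสอบผลลัพธ์ใน MongoDB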

---

## 1. ติดตั้ง Dependencies

In [None]:
# Install Chrome and ChromeDriver
!apt-get update
!apt-get install -y chromium-chromedriver
# Copy the driver binary into /usr/bin.
# NOTE(review): presumably so Selenium can find it on PATH; the source path
# depends on the runtime image -- confirm it exists on the current Colab image.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

# Install Python packages
!pip install selenium beautifulsoup4 pymongo webdriver-manager python-dotenv -q

## 2. กำหนดค่า Configuration

In [None]:
import os

# MongoDB Configuration
# The connection string embeds the database user's password, so it must never
# be stored in the notebook itself. Provide it through the MONGODB_URL
# environment variable before running this cell, for example:
#     import os, getpass
#     os.environ["MONGODB_URL"] = getpass.getpass("MongoDB connection string: ")
# NOTE: a credential that was previously committed in this cell should be
# treated as compromised and rotated in MongoDB Atlas.
MONGODB_URL = os.environ.get("MONGODB_URL", "").strip()
MONGODB_DATABASE = "agn"
MONGODB_COLLECTION = "qa"

# Scraper Configuration
SCRAPER_START_ID = 1      # first thread id to scrape (inclusive)
SCRAPER_END_ID = 2675     # last thread id to scrape (inclusive)
SCRAPER_MIN_DELAY = 2     # lower bound of the random pause between threads, in seconds
SCRAPER_MAX_DELAY = 5     # upper bound of the random pause between threads, in seconds

BASE_URL = "https://www.agnoshealth.com/forums"

if MONGODB_URL:
    print("✅ Configuration set successfully!")
else:
    # Fail loudly here rather than with an obscure driver error later on.
    print("⚠️  MONGODB_URL is not set - set the environment variable and re-run this cell")
print(f"📊 Will scrape threads {SCRAPER_START_ID} to {SCRAPER_END_ID}")
print(f"🗄️  Database: {MONGODB_DATABASE}.{MONGODB_COLLECTION}")

## 3. Scraper Code

In [None]:
import logging
import time
import random
from typing import Optional, Dict
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

# Configure logging
# basicConfig() does nothing when the root logger already has handlers, which
# is commonly the case in hosted notebook runtimes; the scraper's INFO progress
# messages would then never be shown. force=True removes any existing root
# handlers first so this configuration always takes effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)


class AGNHealthScraper:
    """Scraper for AGN Health Q&A forums.

    Creating an instance opens a MongoDB connection (using the module-level
    ``MONGODB_URL``, ``MONGODB_DATABASE`` and ``MONGODB_COLLECTION`` settings)
    and starts a headless Chrome WebDriver. Release both with :meth:`close`,
    or use the instance as a context manager::

        with AGNHealthScraper() as scraper:
            scraper.scrape_all(1, 100)
    """

    # Seconds to wait for the page's <main> element before giving up.
    PAGE_LOAD_TIMEOUT = 10
    # Fixed pause after <main> appears, before the page source is read
    # (presumably to let client-side rendering finish).
    RENDER_SETTLE_DELAY = 1

    # CSS selectors tried in order; the first one that matches wins.
    _DATE_SELECTORS = (
        'time span.text-sm.text-gray-500',
        'time',
    )
    _TOPIC_SELECTORS = (
        'article p.font-bold',
        'article div.flex-col p',
    )
    _QUESTION_SELECTORS = (
        'span.font-bold.text-lg',
        'div.rounded-2xl.border.border-blue-100 span',
        'section.space-y-4 span.font-bold',
    )
    # (selector, minimum text length exclusive). The last, broadest selector
    # only keeps paragraphs longer than 20 characters.
    _ANSWER_SELECTORS = (
        ('section.space-y-4 ul li p', 0),
        ('section.space-y-4 p.mt-4', 0),
        ('section ul li p, section p', 20),
    )

    def __init__(self):
        """Initialize the scraper with MongoDB connection and Selenium driver.

        Raises:
            Exception: whatever the MongoDB or Selenium setup raised. Any
                resource that was already opened is closed before re-raising.
        """
        self.mongo_client = None
        self.db = None
        self.collection = None
        self.driver = None
        try:
            self._setup_mongodb()
            self._setup_selenium()
        except Exception:
            # Do not leak the Mongo client when the browser fails to start.
            self.close()
            raise

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close all resources; exceptions from the block are not suppressed."""
        self.close()

    def _setup_mongodb(self):
        """Set up MongoDB connection and ensure indexes."""
        try:
            self.mongo_client = MongoClient(MONGODB_URL)
            self.db = self.mongo_client[MONGODB_DATABASE]
            self.collection = self.db[MONGODB_COLLECTION]

            # Create unique index on thread_id to prevent duplicates
            self.collection.create_index("thread_id", unique=True)
            logger.info("MongoDB connection established successfully")
        except Exception as e:
            logger.error(f"Failed to connect to MongoDB: {e}")
            raise

    def _setup_selenium(self):
        """Set up Selenium WebDriver with Chrome."""
        try:
            chrome_options = Options()
            for argument in (
                '--headless',
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--window-size=1920,1080',
                '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            ):
                chrome_options.add_argument(argument)

            self.driver = webdriver.Chrome(options=chrome_options)
            logger.info("Selenium WebDriver initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Selenium: {e}")
            raise

    def scrape_thread(self, thread_id: int) -> Optional[Dict]:
        """Scrape a single Q&A thread.

        Args:
            thread_id: numeric id appended to ``BASE_URL`` to form the page URL.

        Returns:
            A dict with keys ``thread_id``, ``date``, ``topic``, ``question``
            and ``answer``, or ``None`` when the page timed out, raised an
            error, or contained neither a question nor a topic.
        """
        url = f"{BASE_URL}/{thread_id}"

        try:
            logger.info(f"Scraping thread {thread_id}...")
            self.driver.get(url)

            # Wait for page to load
            wait = WebDriverWait(self.driver, self.PAGE_LOAD_TIMEOUT)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
            time.sleep(self.RENDER_SETTLE_DELAY)

            # Get page source and parse with BeautifulSoup
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # Extract data
            data = {
                'thread_id': thread_id,
                'date': self._extract_date(soup),
                'topic': self._extract_topic(soup),
                'question': self._extract_question(soup),
                'answer': self._extract_answer(soup)
            }

            # Validate that we have at least question or topic
            if not data['question'] and not data['topic']:
                logger.warning(f"Thread {thread_id}: No valid content found")
                return None

            logger.info(f"Thread {thread_id}: Successfully scraped")
            return data

        except TimeoutException:
            logger.warning(f"Thread {thread_id}: Timeout - page may not exist")
            return None
        except Exception as e:
            logger.error(f"Thread {thread_id}: Error during scraping - {e}")
            return None

    def _first_text(self, soup: "BeautifulSoup", selectors, label: str) -> str:
        """Return the stripped text of the first selector that matches.

        Args:
            soup: parsed page.
            selectors: CSS selectors, tried in order.
            label: field name used in the debug message on failure.

        Returns:
            The matched element's text, or ``""`` when nothing matches or the
            lookup raises (extraction is best-effort).
        """
        try:
            for selector in selectors:
                elem = soup.select_one(selector)
                if elem:
                    return elem.get_text(strip=True)
        except Exception as e:
            logger.debug(f"{label} extraction error: {e}")
        return ""

    def _extract_date(self, soup: "BeautifulSoup") -> str:
        """Extract date from the page."""
        return self._first_text(soup, self._DATE_SELECTORS, "Date")

    def _extract_topic(self, soup: "BeautifulSoup") -> str:
        """Extract topic from the page."""
        return self._first_text(soup, self._TOPIC_SELECTORS, "Topic")

    def _extract_question(self, soup: "BeautifulSoup") -> str:
        """Extract question from the page."""
        return self._first_text(soup, self._QUESTION_SELECTORS, "Question")

    def _extract_answer(self, soup: "BeautifulSoup") -> str:
        """Extract answer from the page.

        Tries each entry of ``_ANSWER_SELECTORS`` in order and uses the first
        one that yields any text; the matching paragraphs are joined with a
        blank line between them.

        Returns:
            The joined answer text, or ``""`` when nothing matches or the
            lookup raises.
        """
        try:
            for selector, min_length in self._ANSWER_SELECTORS:
                texts = (elem.get_text(strip=True) for elem in soup.select(selector))
                answer_parts = [text for text in texts if len(text) > min_length]
                if answer_parts:
                    return "\n\n".join(answer_parts)
        except Exception as e:
            logger.debug(f"Answer extraction error: {e}")
        return ""

    def _is_saved(self, thread_id: int) -> bool:
        """Return True when a document for ``thread_id`` is already stored."""
        return self.collection.find_one({'thread_id': thread_id}, {'_id': 1}) is not None

    def save_to_mongodb(self, data: Dict) -> bool:
        """Save scraped data to MongoDB.

        Args:
            data: document to insert; must contain ``thread_id``.

        Returns:
            ``True`` when the document was inserted, ``False`` when it already
            existed (unique ``thread_id`` index) or the insert failed.
        """
        try:
            self.collection.insert_one(data)
            logger.info(f"Thread {data['thread_id']}: Saved to MongoDB")
            return True
        except DuplicateKeyError:
            logger.info(f"Thread {data['thread_id']}: Already exists, skipping")
            return False
        except Exception as e:
            logger.error(f"Thread {data['thread_id']}: Failed to save - {e}")
            return False

    def scrape_all(self, start_id: int, end_id: int):
        """Scrape all threads in the specified range.

        Threads that are already stored are skipped without fetching the page
        or sleeping, so an interrupted run can be restarted cheaply.

        Args:
            start_id: first thread id (inclusive).
            end_id: last thread id (inclusive).

        Returns:
            A dict with the ``success``, ``skipped`` and ``errors`` counters.
        """
        logger.info(f"Starting scraper for threads {start_id} to {end_id}")

        success_count = 0
        skip_count = 0
        error_count = 0

        for thread_id in range(start_id, end_id + 1):
            try:
                if self._is_saved(thread_id):
                    logger.info(f"Thread {thread_id}: Already exists, skipping")
                    skip_count += 1
                else:
                    # Scrape the thread
                    data = self.scrape_thread(thread_id)

                    if data:
                        if self.save_to_mongodb(data):
                            success_count += 1
                        else:
                            skip_count += 1
                    else:
                        error_count += 1

                    # Random pause after each page request
                    delay = random.uniform(SCRAPER_MIN_DELAY, SCRAPER_MAX_DELAY)
                    time.sleep(delay)

                # Log progress every 50 threads
                if thread_id % 50 == 0:
                    logger.info(f"Progress: {thread_id}/{end_id} - Success: {success_count}, Skipped: {skip_count}, Errors: {error_count}")

            except KeyboardInterrupt:
                logger.info("Scraping interrupted by user")
                break
            except Exception as e:
                logger.error(f"Unexpected error processing thread {thread_id}: {e}")
                error_count += 1
                continue

        logger.info(f"Scraping completed! Success: {success_count}, Skipped: {skip_count}, Errors: {error_count}")
        return {'success': success_count, 'skipped': skip_count, 'errors': error_count}

    def close(self):
        """Clean up resources.

        Safe to call more than once. A failure while closing the browser is
        logged and does not prevent the MongoDB connection from being closed.
        """
        # Detach first so a second call (or a failure below) cannot reuse them.
        driver, self.driver = self.driver, None
        mongo_client, self.mongo_client = self.mongo_client, None

        if driver is not None:
            try:
                driver.quit()
                logger.info("Selenium driver closed")
            except Exception as e:
                logger.warning(f"Failed to close Selenium driver: {e}")
        if mongo_client is not None:
            try:
                mongo_client.close()
                logger.info("MongoDB connection closed")
            except Exception as e:
                logger.warning(f"Failed to close MongoDB connection: {e}")


# Confirms that the cell defining the scraper class ran to completion.
print("✅ Scraper class loaded successfully!")

## 4. รัน Scraper

⚠️ **คำเตือน**: การ scrape 2,675 threads จะใช้เวลา 2-4 ชั่วโมง

**ตัวเลือก:**
- ถ้าต้องการทดสอบ ให้แก้ `end_id` เป็นจำนวนน้อยๆ เช่น 100
- ถ้าต้องการ scrape ทั้งหมด ใช้ `SCRAPER_END_ID`

In [None]:
# Change these values to test on only a subset of threads
start_id = SCRAPER_START_ID  # starts at 1
end_id = 2764  # NOTE(review): differs from SCRAPER_END_ID (2675) set in the configuration cell -- confirm which is intended

# Stays None until the constructor succeeds, so the finally block below only
# calls close() on a scraper that actually exists.
scraper = None
try:
    print("🚀 Starting scraper...")
    print(f"📊 Scraping threads {start_id} to {end_id}")
    print("⏱️  This may take a while...\n")
    
    scraper = AGNHealthScraper()
    scraper.scrape_all(start_id, end_id)
    
    print("\n✅ Scraping completed!")
    
except Exception as e:
    # Report the failure in the cell output instead of a raw traceback.
    print(f"❌ Error: {e}")
finally:
    # Always release the browser and the database connection.
    if scraper:
        scraper.close()
        print("🔒 Resources cleaned up")

## 5. ตรวจสอบผลลัพธ์

In [None]:
# Check how many documents have been saved and print one as a sanity check.
from pymongo import MongoClient

client = MongoClient(MONGODB_URL)
try:
    db = client[MONGODB_DATABASE]
    collection = db[MONGODB_COLLECTION]

    total_docs = collection.count_documents({})
    print(f"📊 Total documents in database: {total_docs}")

    # Show a sample document
    if total_docs > 0:
        print("\n📄 Sample document:")
        sample = collection.find_one()
        if sample:
            print(f"  Thread ID: {sample.get('thread_id')}")
            print(f"  Date: {sample.get('date')}")
            print(f"  Topic: {sample.get('topic')[:50]}..." if sample.get('topic') else "  Topic: N/A")
            print(f"  Question: {sample.get('question')[:50]}..." if sample.get('question') else "  Question: N/A")
            # `or ''` also covers a stored null, which .get()'s default would not.
            print(f"  Answer length: {len(sample.get('answer') or '')} characters")
finally:
    # Close the connection even when one of the queries above fails.
    client.close()
print("\n✅ Verification completed!")

## 📝 หมายเหตุ

### เมื่อเสร็จแล้ว:
1. ข้อมูลจะถูกบันทึกใน MongoDB Atlas
2. ข้อมูลจะไม่ซ้ำกัน (มี unique index บน thread_id)
3. สามารถหยุดและเริ่มใหม่ได้ (จะ skip threads ที่มีอยู่แล้ว)

### ขั้นตอนต่อไป:
- รัน `colab_embedder.ipynb` เพื่อสร้าง embeddings

### Tips:
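- ถ้า Colab timeout ให้รันใหม่ได้เลย ข้อมูลที่บันทึกแล้วจะไม่ถูกบันทึกซ้ำ และสามารถตั้ง `start_id` เป็น thread ถัดจากที่ทำเสร็จล่าสุดเพื่อไม่ต้องไล่ตั้งแต่ต้น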
- ถ้าต้องการ scrape เร็วขึ้น ลด delay ใน configuration
- ตรวจสอบ logs เพื่อดูความคืบหน้า