In [1]:
# from magic_html import GeneralExtractor

# # Initialize the extractor
# extractor = GeneralExtractor()

# html = """
# <!doctype html>
# <html>
# <head>
#     <title>Example Domain</title>

#     <meta charset="utf-8" />
#     <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
#     <meta name="viewport" content="width=device-width, initial-scale=1" />  
# </head>

# <body>
# <div>
#     <h1>Example Domain</h1>
#     <p>This domain is for use in illustrative examples in documents. You may use this
#     domain in literature without prior coordination or asking for permission.</p>
#     <p><a href="https://www.iana.org/domains/example">More information...</a></p>
# </div>
# </body>
# </html>
# """


# # Extract content from an article page
# data = extractor.extract(html, base_url="http://example.com/")


# print(data)

{'xp_num': 'others', 'drop_list': False, 'html': '<div><body id="readabilityplusBody">\n<div>\n    <h1>Example Domain</h1>\n    <p>This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.</p>\n    </div>\n</body>\n</div>', 'title': 'Example Domain', 'base_url': 'http://example.com/'}


In [3]:
# import requests
# from magic_html import GeneralExtractor

# # 1. Define the URL
# url = "https://www.nomadicmatt.com/travel-guides/singapore-travel-tips/"

# # 2. Fetch the raw HTML
# response = requests.get(url)
# response.raise_for_status()        # ensure we got a 200 OK
# html = response.text

# # 3. Initialize magic-html extractor
# extractor = GeneralExtractor()

# # 4. Perform extraction
# data = extractor.extract(html, base_url=url)

# print(data)


{'xp_num': '1', 'drop_list': False, 'html': '<body><div class="entry-content-wrap">\n\t\t\n\n<div class="entry-content single-content">\n\t<figure class="wp-block-image"><img fetchpriority="high" decoding="async" src="https://media.nomadicmatt.com/singaporeguide.jpg" alt="The skyline of urban Singapore, featuring skyscrapers all lit up at night"></figure><p>Singapore is one of my favorite cities in the world. It’s a foodie’s dream, bursting with tasty hawker stalls, delicious Indian food, and fresh seafood. There are hiking trails where you can stretch your legs and beaches for chilling out and soaking up the sun.</p><p>Home to around 5.7 million people, Singapore is a cosmopolitan city-state that gained independence from the British in 1965. It is now one of the world’s leading economic centers in shipping and banking.</p><p>Because of its status as a global economic hub, Singapore is expensive by Southeast Asian standards, with everything costing almost double what it does elsewhere 

In [None]:
# from bs4 import BeautifulSoup

# html_snippet = data.get("html", "")
# soup = BeautifulSoup(html_snippet, "html.parser")
# text = soup.get_text(separator="\n")

## V1

In [146]:
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from bs4 import BeautifulSoup, NavigableString
from datetime import datetime
import hashlib
import json
import re
import logging
from typing import List, Dict, Any, Optional


class TextExtractor:
    """Handles text extraction from HTML elements."""
    @staticmethod
    def normalize_text(text: str) -> str:
        """Normalize text by lowercasing and removing extra whitespace."""
        return re.sub(r"\s+", " ", text.strip().lower())

    @staticmethod
    def get_page_summary(soup: BeautifulSoup) -> str:
        """Extract page summary from meta tags or first paragraph."""
        meta_selectors = [
            ("description", {"name": "description"}),
            ("og:description", {"property": "og:description"}),
            ("twitter:description", {"name": "twitter:description"}),
        ]

        for name, attrs in meta_selectors:
            tag = soup.find("meta", attrs)
            if tag and tag.get("content"):
                return tag["content"].strip()

        # Fallback to first substantial paragraph
        for p in soup.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 50:
                return text
        return ""

    @staticmethod
    def get_surrounding_text(img_tag, max_chars: int = 1000) -> str:
        """Extract captions and nearby text for context."""
        contexts = []

        # Look for figure/picture with figcaption
        figure = img_tag.find_parent(["figure", "picture"])
        if figure:
            figcaption = figure.find("figcaption")
            if figcaption:
                contexts.append(figcaption.get_text(strip=True))

        # Look for caption-like siblings
        for sibling in img_tag.find_next_siblings(["p", "div", "span"], limit=3):
            text = sibling.get_text(strip=True)
            if text and any(
                keyword in text.lower()
                for keyword in ["caption", "image", "photo", "picture", "source"]
            ):
                contexts.append(text)
                break

        # Get text from previous and next elements
        def get_nearby_text(start_element, direction):
            collected = []
            current = start_element
            while current and len(" ".join(collected)) < max_chars // 2:
                if direction == "prev":
                    current = current.find_previous_sibling()
                else:
                    current = current.find_next_sibling()

                if current is None:
                    parent = start_element.parent
                    if parent and parent.name not in ["body", "html"]:
                        start_element = parent
                        current = start_element
                        continue
                    break

                if isinstance(current, NavigableString):
                    text = current.strip()
                else:
                    text = (
                        current.get_text(strip=True)
                        if current.name in ["p", "div", "h1", "h2", "h3", "h4", "h5", "h6"] else ""
                    )

                if text and len(text) > 10:
                    collected.append(text)
                    break
            return collected

        contexts = (get_nearby_text(img_tag, "prev") + contexts + get_nearby_text(img_tag, "next"))

        joined = re.sub(r"\s+", " ", " ".join(contexts))
        return joined[:max_chars] + ("…" if len(joined) > max_chars else "")


class ImageValidator:
    """Validates image URLs and filters out unwanted images."""
    # Precompiled regex patterns for ad domains
    AD_DOMAIN_PATTERNS = [
        re.compile(r"\.(doubleclick\.net|googlesyndication\.com|adservice\.google\.com|adnetwork\.com|adnxs\.com|yieldmanager\.com|pubmatic\.com|rubiconproject\.com|applovin\.com|taboola\.com|outbrain\.com|smartadserver\.com|zedo\.com|pulse3d\.com|casalemedia\.com|lijit\.com|analytics\.google\.com|connect\.facebook\.net|ads\.pinterest\.com|analytics\.twitter\.com|bat\.bing\.com|cdn\.adsafeprotected\.com|scorecardresearch\.com|quantserve\.com|moatads\.com)$", re.IGNORECASE)
    ]
    GOOD_PATH_PATTERNS = [
        re.compile(r"\b(image|img|photo|picture|media|upload|content|wp-content)\b", re.IGNORECASE)
    ]    
    BAD_PATH_PATTERNS = [
        re.compile(r"\b(placeholder|spinner|tracking|pixel|blank|spacer|clear\.gif|transparent\.png|loading|1x1\.|\.svg$|data:image/svg)\b", re.IGNORECASE)
    ]    
    GOOD_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".gif"}

    @staticmethod
    def is_valid_image_url(url: str) -> bool:
        if not url:
            return False

        # Parse the URL to extract domain and path
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        path = parsed.path.lower()

        # Check if the domain matches any known ad/tracking domain
        for pattern in ImageValidator.AD_DOMAIN_PATTERNS:
            if pattern.search(domain):
                return False

        # Extract the file path without query params
        base_path = path.split("?", 1)[0]

        # Check for valid image extensions
        if any(base_path.endswith(ext) for ext in ImageValidator.GOOD_EXTENSIONS):
            return True

        # Fallback: check for image-related keywords in the path
        for pattern in ImageValidator.GOOD_PATH_PATTERNS:
            if pattern.search(base_path):
                # Exclude if it matches any bad pattern (e.g., tracking pixel)
                for bad_pattern in ImageValidator.BAD_PATH_PATTERNS:
                    if bad_pattern.search(base_path):
                        return False
                return True

        return False


class ImageMetadataExtractor:
    """Main class for extracting image metadata from web pages with duplicate filtering."""

    def __init__(self):
        self.text_extractor = TextExtractor()
        self.image_validator = ImageValidator()

        # Track canonical URLs we've already returned (across any pages processed
        # by this instance). This prevents duplicates in the final result.
        self._seen_urls: set[str] = set()

        # Setup logging
        logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
        self.logger = logging.getLogger(__name__)

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------

    @staticmethod
    def _canonicalise_url(url: str) -> str:
        parts = urlsplit(url)
        # 1. Remove /width/height/ patterns
        path = re.sub(r"/\d{2,4}/\d{2,4}/", "/", parts.path)
        # 2. Remove "-widthxheight" in filename
        path = re.sub(r"-\d{2,4}x\d{2,4}(?=\.\w+$)", "", path)
        # 3. Remove common size-related query params
        query = re.sub(r"(\?|&)(w|width|h|height|size)=\d+", "", parts.query, flags=re.I)
        # 4. Canonicalize
        return urlunsplit((
            parts.scheme.lower(),
            parts.netloc.lower(),
            path.rstrip('/'),
            "", ""  # Ignore fragments and query
        ))


    def _fetch_page_content(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch and parse HTML content from URL."""
        try:
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            self.logger.error("Failed to fetch %s: %s", url, e)
            return None

    def _extract_image_data(self, img_tag, page_url: str, page_title: str, page_summary: str) -> Dict[str, Any]:
        """Extract metadata from a single image tag with preference for high-res images from <picture> or srcset."""
        
        def parse_srcset(srcset: str) -> List[str]:
            """Parse srcset and return list of (url, descriptor) tuples."""
            entries = [entry.strip() for entry in srcset.split(',')]
            candidates = []
            for entry in entries:
                parts = entry.strip().split()
                if parts:
                    url = parts[0]
                    descriptor = parts[1] if len(parts) > 1 else "1x"
                    candidates.append((url, descriptor))
            return candidates

        def pick_best_image(candidates: List[tuple]) -> Optional[str]:
            """Select the image with the highest resolution."""
            if not candidates:
                return None
            # Prefer descriptors with 'w' (width) over 'x' (density) for precision
            def resolution_score(descriptor):
                match = re.match(r"(\d+)(w|x)", descriptor)
                if match:
                    value, unit = match.groups()
                    return int(value)
                return 1
            sorted_candidates = sorted(candidates, key=lambda c: resolution_score(c[1]), reverse=True)
            return sorted_candidates[0][0] if sorted_candidates else None

        # Check for <picture> > <source srcset>
        picture_tag = img_tag.find_parent("picture")
        image_url = None

        if picture_tag:
            sources = picture_tag.find_all("source")
            for source in sources:
                srcset = source.get("srcset")
                if srcset:
                    candidates = parse_srcset(srcset)
                    best = pick_best_image(candidates)
                    if best:
                        image_url = urljoin(page_url, best)
                        break  # Prefer first <source> with usable srcset

        # If not from <picture>, check srcset on <img>
        if not image_url:
            srcset = img_tag.get("srcset")
            if srcset:
                candidates = parse_srcset(srcset)
                best = pick_best_image(candidates)
                if best:
                    image_url = urljoin(page_url, best)

        # Fall back to src
        if not image_url:
            src = img_tag.get("src")
            if not src:
                return {}
            image_url = urljoin(page_url, src)

        # Validate image URL
        if not self.image_validator.is_valid_image_url(image_url):
            return {}

        return {
            "image_url": image_url,
            "page_url": page_url,
            "page_title": page_title,
            "alt_text": img_tag.get("alt", "").strip(),
            "title_attribute": img_tag.get("title", "").strip(),
            "raw_caption": self.text_extractor.get_surrounding_text(img_tag),
            "page_summary": page_summary,
            "content_context": None,
            "extracted_at": datetime.utcnow().isoformat() + "Z",
        }


    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def parse_resolution_from_url(self, url):
        m = re.search(r"/(\d{2,4})/(\d{2,4})/", url)
        if m:
            return int(m.group(1)), int(m.group(2))
        m = re.search(r"-([1-9]\d{2,4})x([1-9]\d{2,4})(?=\.\w+$)", url)
        if m:
            return int(m.group(1)), int(m.group(2))
        return 0, 0


    def extract_from_url(self, url: str) -> List[Dict[str, Any]]:
        """Extract image metadata from a single URL, deduplicating and keeping largest variant per canonical image."""
        soup = self._fetch_page_content(url)
        if not soup:
            return []

        page_title = soup.title.get_text(strip=True) if soup.title else ""
        page_summary = self.text_extractor.get_page_summary(soup)

        # Group images by canonical URL
        grouped: Dict[str, List[Dict[str, Any]]] = {}
        for img_tag in soup.find_all("img"):
            data = self._extract_image_data(img_tag, url, page_title, page_summary)
            if not data or "image_url" not in data:
                continue

            key = self._canonicalise_url(data["image_url"])
            w, h = self.parse_resolution_from_url(data["image_url"])
            data["_w"] = w
            data["_h"] = h
            grouped.setdefault(key, []).append(data)

        results: List[Dict[str, Any]] = []
        for group in grouped.values():
            # Sort by area, keep the largest
            group_sorted = sorted(group, key=lambda d: (d["_w"] * d["_h"]), reverse=True)
            best = group_sorted[0]
            best.pop("_w", None)
            best.pop("_h", None)
            results.append(best)

        return results

    def extract_from_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Extract image metadata from multiple URLs, deduplicating and keeping largest variant per canonical image."""
        grouped: Dict[str, List[Dict[str, Any]]] = {}

        for url in urls:
            soup = self._fetch_page_content(url)
            if not soup:
                continue
            page_title = soup.title.get_text(strip=True) if soup.title else ""
            page_summary = self.text_extractor.get_page_summary(soup)
            for img_tag in soup.find_all("img"):
                data = self._extract_image_data(img_tag, url, page_title, page_summary)
                if not data or "image_url" not in data:
                    continue

                key = self._canonicalise_url(data["image_url"])
                w, h = self.parse_resolution_from_url(data["image_url"])
                data["_w"] = w
                data["_h"] = h
                grouped.setdefault(key, []).append(data)

        results: List[Dict[str, Any]] = []
        for group in grouped.values():
            group_sorted = sorted(group, key=lambda d: (d["_w"] * d["_h"]), reverse=True)
            best = group_sorted[0]
            best.pop("_w", None)
            best.pop("_h", None)
            results.append(best)

        return results

### V1 html

In [147]:
# Example usage
if __name__ == "__main__":
    import webbrowser
    import tempfile
    import os
    
    # Create extractor instance
    extractor = ImageMetadataExtractor()
    link = "https://www.timeout.com/singapore/attractions/the-best-singapore-attractions"
    
    # Extract from single URL
    results = extractor.extract_from_url(link)
    
    print(f"\nFound {len(results)}")
    
    import html
    import webbrowser
    import tempfile
    import os

    # ---- ❶ Normalise the list of URLs ------------------------------------------
    urls = []
    for item in results:
        if isinstance(item, str):
            urls.append(item)
        elif isinstance(item, dict):
            # try the most common keys; fall back to the first value if needed
            for k in ('src', 'url', 'image_url'):
                if k in item:
                    urls.append(item[k])
                    break
            else:
                urls.append(next(iter(item.values())))  # last-ditch
    # -----------------------------------------------------------------------------


    # ---- ❷ Build the HTML -------------------------------------------------------
    html_parts = [
        '<!DOCTYPE html>',
        '<html lang="en"><head><meta charset="UTF-8">',
        '<title>Extracted Images</title>',
        '<style>',
        '  body{margin:0;font-family:system-ui,sans-serif;}',
        '  .grid{display:flex;flex-wrap:wrap;gap:12px;padding:16px;}',
        '  .grid img{max-width:320px;height:auto;border-radius:8px;',
        '             box-shadow:0 2px 6px #0003;}',
        '</style></head><body>',
        '<h1 style="text-align:center;margin:1rem 0;">Extracted images</h1>',
        '<section class="grid">'
    ]

    for u in urls:
        html_parts.append(f'  <img src="{html.escape(u)}" alt="">')

    html_parts += ['</section>', '</body></html>']
    html_text = '\n'.join(html_parts)
    # -----------------------------------------------------------------------------


    # ---- ❸ Write to a temporary file and open it --------------------------------
    with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html') as f:
        f.write(html_text)
        file_path = f.name

    # macOS / Windows / Linux – should “just work”
    webbrowser.open('file://' + os.path.abspath(file_path))
    print(f"Opened {file_path}")



Found 28
Opened C:\Users\leeee\AppData\Local\Temp\tmp0yzyw4ka.html


  "extracted_at": datetime.utcnow().isoformat() + "Z",


## V2

In [172]:
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from bs4 import BeautifulSoup, NavigableString
from datetime import datetime
import hashlib
import json
import re
import logging
from typing import List, Dict, Any, Optional
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from webdriver_manager.chrome import ChromeDriverManager
import time


class TextExtractor:
    """Handles text extraction from HTML elements."""
    @staticmethod
    def normalize_text(text: str) -> str:
        """Normalize text by lowercasing and removing extra whitespace."""
        return re.sub(r"\s+", " ", text.strip().lower())

    @staticmethod
    def get_page_summary(soup: BeautifulSoup) -> str:
        """Extract page summary from meta tags or first paragraph."""
        meta_selectors = [
            ("description", {"name": "description"}),
            ("og:description", {"property": "og:description"}),
            ("twitter:description", {"name": "twitter:description"}),
        ]

        for name, attrs in meta_selectors:
            tag = soup.find("meta", attrs)
            if tag and tag.get("content"):
                return tag["content"].strip()

        # Fallback to first substantial paragraph
        for p in soup.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 50:
                return text
        return ""

    @staticmethod
    def get_surrounding_text(img_tag, max_chars: int = 1000) -> str:
        """Extract captions and nearby text for context."""
        contexts = []

        # Look for figure/picture with figcaption
        figure = img_tag.find_parent(["figure", "picture"])
        if figure:
            figcaption = figure.find("figcaption")
            if figcaption:
                contexts.append(figcaption.get_text(strip=True))

        # Look for caption-like siblings
        for sibling in img_tag.find_next_siblings(["p", "div", "span"], limit=3):
            text = sibling.get_text(strip=True)
            if text and any(
                keyword in text.lower()
                for keyword in ["caption", "image", "photo", "picture", "source"]
            ):
                contexts.append(text)
                break

        # Get text from previous and next elements
        def get_nearby_text(start_element, direction):
            collected = []
            current = start_element
            while current and len(" ".join(collected)) < max_chars // 2:
                if direction == "prev":
                    current = current.find_previous_sibling()
                else:
                    current = current.find_next_sibling()

                if current is None:
                    parent = start_element.parent
                    if parent and parent.name not in ["body", "html"]:
                        start_element = parent
                        current = start_element
                        continue
                    break

                if isinstance(current, NavigableString):
                    text = current.strip()
                else:
                    text = (
                        current.get_text(strip=True)
                        if current.name in ["p", "div", "h1", "h2", "h3", "h4", "h5", "h6"] else ""
                    )

                if text and len(text) > 10:
                    collected.append(text)
                    break
            return collected

        contexts = (get_nearby_text(img_tag, "prev")
                    + contexts
                    + get_nearby_text(img_tag, "next"))

        joined = re.sub(r"\s+", " ", " ".join(contexts))
        return joined[:max_chars] + ("…" if len(joined) > max_chars else "")


class ImageValidator:
    """Validates image URLs and filters out unwanted images."""
    AD_DOMAIN_PATTERNS = [
        re.compile(r"\.(doubleclick\.net|googlesyndication\.com|adservice\.google\.com|"
                   r"adnetwork\.com|adnxs\.com|yieldmanager\.com|pubmatic\.com|rubiconproject\.com|"
                   r"applovin\.com|taboola\.com|outbrain\.com|smartadserver\.com|zedo\.com|"
                   r"pulse3d\.com|casalemedia\.com|lijit\.com|analytics\.google\.com|"
                   r"connect\.facebook\.net|ads\.pinterest\.com|analytics\.twitter\.com|"
                   r"bat\.bing\.com|cdn\.adsafeprotected\.com|scorecardresearch\.com|"
                   r"quantserve\.com|moatads\.com)$", re.IGNORECASE)
    ]
    GOOD_PATH_PATTERNS = [
        re.compile(r"\b(image|img|photo|picture|media|upload|content|wp-content)\b",
                   re.IGNORECASE)
    ]
    BAD_PATH_PATTERNS = [
        re.compile(r"\b(placeholder|spinner|tracking|pixel|blank|spacer|clear\.gif|"
                   r"transparent\.png|loading|1x1\.|\.svg$|data:image/svg)\b",
                   re.IGNORECASE)
    ]
    GOOD_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".gif"}

    @staticmethod
    def is_valid_image_url(url: str) -> bool:
        if not url:
            return False

        parsed = urlsplit(url)
        domain = parsed.netloc.lower()
        path = parsed.path.lower()

        for pattern in ImageValidator.AD_DOMAIN_PATTERNS:
            if pattern.search(domain):
                return False

        base_path = path.split("?", 1)[0]
        if any(base_path.endswith(ext) for ext in ImageValidator.GOOD_EXTENSIONS):
            return True

        for pattern in ImageValidator.GOOD_PATH_PATTERNS:
            if pattern.search(base_path):
                for bad_pattern in ImageValidator.BAD_PATH_PATTERNS:
                    if bad_pattern.search(base_path):
                        return False
                return True

        return False


class ImageMetadataExtractor:
    """Main class for extracting image metadata from web pages with duplicate filtering."""
    def __init__(self):
        self.text_extractor = TextExtractor()
        self.image_validator = ImageValidator()
        self._seen_urls: set[str] = set()  # dedupe across calls
        logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _canonicalise_url(url: str) -> str:
        parts = urlsplit(url)
        path = re.sub(r"/\d{2,4}/\d{2,4}/", "/", parts.path)
        path = re.sub(r"-\d{2,4}x\d{2,4}(?=\.\w+$)", "", path)
        query = re.sub(r"(\?|&)(w|width|h|height|size)=\d+", "", parts.query, flags=re.I)
        return urlunsplit((
            parts.scheme.lower(),
            parts.netloc.lower(),
            path.rstrip('/'),
            "", ""
        ))

    def _click_load_more(self, driver, timeout=3, max_clicks=5):
        """
        Repeatedly finds & clicks any “load more” or “show more” buttons
        until none are left or max_clicks is reached.
        """
        wait = WebDriverWait(driver, timeout)
        xpaths = [
            "//button[contains(translate(., 'LOADMORE','loadmore'), 'load more')]",
            "//button[contains(translate(., 'SHOWMORE','showmore'), 'show more')]",
            "//a[contains(translate(., 'LOADMORE','loadmore'), 'load more')]",
            "//a[contains(translate(., 'SHOWMORE','showmore'), 'show more')]",
        ]
        clicks = 0

        while clicks < max_clicks:
            found = False
            for xp in xpaths:
                try:
                    btn = wait.until(EC.element_to_be_clickable((By.XPATH, xp)))
                    driver.execute_script("arguments[0].scrollIntoView(true);", btn)
                    btn.click()
                    clicks += 1
                    found = True
                    time.sleep(1)  # let content load
                    self.logger.info("[INFO] Selenium clicked a ‘load more’ button (%s/%s)", clicks, max_clicks)
                    break
                except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                    continue
            if not found:
                break

    def _fetch_page_content(self, url: str) -> Optional[BeautifulSoup]:
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless=new")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument(
                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
            )

            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            driver.get(url)

            # 1) click any “load more” / “show more” buttons
            try:
                self._click_load_more(driver)
            except Exception as e:
                self.logger.debug(f"No load-more buttons or error: {e}")

            # 2) infinite scroll to trigger lazy loading
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            html = driver.page_source
            driver.quit()
            return BeautifulSoup(html, "html.parser")

        except Exception as e:
            self.logger.error("Failed to fetch %s: %s", url, e)
            return None

    def _extract_image_data(self, img_tag, page_url: str, page_title: str, page_summary: str) -> Dict[str, Any]:
        """Extract metadata from a single image tag with preference for high-res images."""
        def parse_srcset(srcset: str) -> List[tuple]:
            entries = [entry.strip() for entry in srcset.split(',')]
            candidates = []
            for entry in entries:
                parts = entry.split()
                url = parts[0]
                descriptor = parts[1] if len(parts) > 1 else "1x"
                candidates.append((url, descriptor))
            return candidates

        def pick_best_image(candidates: List[tuple]) -> Optional[str]:
            if not candidates:
                return None
            def score(d):
                m = re.match(r"(\d+)(w|x)", d)
                return int(m.group(1)) if m else 1
            best = max(candidates, key=lambda c: score(c[1]))
            return best[0]

        picture = img_tag.find_parent("picture")
        image_url = None

        if picture:
            for src in picture.find_all("source"):
                ss = src.get("srcset")
                if ss:
                    best = pick_best_image(parse_srcset(ss))
                    if best:
                        image_url = urljoin(page_url, best)
                        break

        if not image_url and img_tag.get("srcset"):
            best = pick_best_image(parse_srcset(img_tag["srcset"]))
            if best:
                image_url = urljoin(page_url, best)

        if not image_url:
            src = img_tag.get("src")
            if not src:
                return {}
            image_url = urljoin(page_url, src)

        if not self.image_validator.is_valid_image_url(image_url):
            return {}

        return {
            "image_url": image_url,
            "page_url": page_url,
            "page_title": page_title,
            "alt_text": img_tag.get("alt", "").strip(),
            "title_attribute": img_tag.get("title", "").strip(),
            "raw_caption": self.text_extractor.get_surrounding_text(img_tag),
            "page_summary": page_summary,
            "content_context": None,
            "extracted_at": datetime.utcnow().isoformat() + "Z",
        }

    def parse_resolution_from_url(self, url):
        m = re.search(r"/(\d{2,4})/(\d{2,4})/", url)
        if m:
            return int(m.group(1)), int(m.group(2))
        m = re.search(r"-([1-9]\d{2,4})x([1-9]\d{2,4})(?=\.\w+$)", url)
        if m:
            return int(m.group(1)), int(m.group(2))
        return 0, 0

    def extract_from_url(self, url: str) -> List[Dict[str, Any]]:
        soup = self._fetch_page_content(url)
        if not soup:
            return []

        page_title = soup.title.get_text(strip=True) if soup.title else ""
        page_summary = self.text_extractor.get_page_summary(soup)
        grouped: Dict[str, List[Dict[str, Any]]] = {}

        for img in soup.find_all("img"):
            data = self._extract_image_data(img, url, page_title, page_summary)
            if not data: continue
            key = self._canonicalise_url(data["image_url"])
            w, h = self.parse_resolution_from_url(data["image_url"])
            data["_w"], data["_h"] = w, h
            grouped.setdefault(key, []).append(data)

        results = []
        for group in grouped.values():
            best = max(group, key=lambda d: d["_w"] * d["_h"])
            best.pop("_w", None)
            best.pop("_h", None)
            results.append(best)

        return results

    def extract_from_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        all_grouped: Dict[str, List[Dict[str, Any]]] = {}

        for url in urls:
            soup = self._fetch_page_content(url)
            if not soup:
                continue
            page_title = soup.title.get_text(strip=True) if soup.title else ""
            page_summary = self.text_extractor.get_page_summary(soup)
            for img in soup.find_all("img"):
                data = self._extract_image_data(img, url, page_title, page_summary)
                if not data: continue
                key = self._canonicalise_url(data["image_url"])
                w, h = self.parse_resolution_from_url(data["image_url"])
                data["_w"], data["_h"] = w, h
                all_grouped.setdefault(key, []).append(data)

        results = []
        for group in all_grouped.values():
            best = max(group, key=lambda d: d["_w"] * d["_h"])
            best.pop("_w", None)
            best.pop("_h", None)
            results.append(best)

        return results

### V2 html

In [173]:
# Example usage
if __name__ == "__main__":
    
    # Create extractor instance
    extractor = ImageMetadataExtractor()
    link = "https://www.timeout.com/singapore/attractions/the-best-singapore-attractions"
    
    
    results = extractor.extract_from_url(link)
    
    print(f"\nFound {len(results)}")


    print(results)

2025-07-05 02:43:39,911 INFO WDM - Get LATEST chromedriver version for google-chrome
2025-07-05 02:43:39,940 INFO WDM - Get LATEST chromedriver version for google-chrome
2025-07-05 02:43:39,965 INFO WDM - Driver [C:\Users\leeee\.wdm\drivers\chromedriver\win64\138.0.7204.92\chromedriver-win32/chromedriver.exe] found in cache



Found 48
[{'image_url': 'https://www.timeout.com/static/images/loading_icon.gif', 'page_url': 'https://www.timeout.com/singapore/attractions/the-best-singapore-attractions', 'page_title': '30 Best Singapore Attractions To Visit For Tourists and Locals', 'alt_text': 'Loading animation', 'title_attribute': '', 'raw_caption': 'We help you navigate a myriad of possibilities. Sign up for our newsletter for the best of the city. Déjà vu! We already have this email. Try another?', 'page_summary': "From Jewel Changi Airport to the Night Safari, there's no shortage of amazing things to do in this metropolis.", 'content_context': None, 'extracted_at': '2025-07-04T18:44:09.924132Z'}, {'image_url': 'https://media.timeout.com/images/106204458/1920/1440/image.webp', 'page_url': 'https://www.timeout.com/singapore/attractions/the-best-singapore-attractions', 'page_title': '30 Best Singapore Attractions To Visit For Tourists and Locals', 'alt_text': 'Singapore attractions', 'title_attribute': 'Singapo

  "extracted_at": datetime.utcnow().isoformat() + "Z",


## V3

In [175]:
# image_metadata_extractor_playwright.py
"""Extract high‑resolution image metadata from web pages using Playwright instead of Selenium.

Converted from a Selenium implementation to leverage Playwright's faster CDP
connection, smarter waits and cheaper parallel contexts.
"""
from __future__ import annotations

import logging
import re
import time
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlsplit, urlunsplit

from bs4 import BeautifulSoup, NavigableString
from playwright.sync_api import (  # type: ignore
    TimeoutError as PlaywrightTimeoutError,
    sync_playwright,
)

# -----------------------------------------------------------------------------
# Helper classes (TextExtractor & ImageValidator remain largely unchanged)
# -----------------------------------------------------------------------------


class TextExtractor:
    """Handles text extraction from HTML elements."""

    @staticmethod
    def normalize_text(text: str) -> str:
        """Normalize text by lower‑casing and collapsing whitespace."""
        return re.sub(r"\s+", " ", text.strip().lower())

    @staticmethod
    def get_page_summary(soup: BeautifulSoup) -> str:
        """Extract a short description from meta tags or the first paragraph."""
        meta_selectors = [
            ("description", {"name": "description"}),
            ("og:description", {"property": "og:description"}),
            ("twitter:description", {"name": "twitter:description"}),
        ]
        for _name, attrs in meta_selectors:
            tag = soup.find("meta", attrs)
            if tag and tag.get("content"):
                return tag["content"].strip()

        # Fallback: first non‑trivial <p>
        for p in soup.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 50:
                return text
        return ""

    @staticmethod
    def get_surrounding_text(img_tag, max_chars: int = 1000) -> str:
        """Grab figcaptions and nearby text (prev/next sibling) for context."""

        def nearby(start, direction):
            collected: List[str] = []
            current = start
            while current and len(" ".join(collected)) < max_chars // 2:
                current = (
                    current.find_previous_sibling()
                    if direction == "prev"
                    else current.find_next_sibling()
                )
                if not current:
                    parent = start.parent
                    if parent and parent.name not in ["html", "body"]:
                        start = parent
                        current = start
                        continue
                    break

                text = (
                    current.strip()
                    if isinstance(current, NavigableString)
                    else (
                        current.get_text(strip=True)
                        if current.name in [
                            "p",
                            "div",
                            "h1",
                            "h2",
                            "h3",
                            "h4",
                            "h5",
                            "h6",
                        ]
                        else ""
                    )
                )
                if text and len(text) > 10:
                    collected.append(text)
                    break
            return collected

        contexts: List[str] = []

        figure = img_tag.find_parent(["figure", "picture"])
        if figure:
            caption = figure.find("figcaption")
            if caption:
                contexts.append(caption.get_text(strip=True))

        for sib in img_tag.find_next_siblings(["p", "div", "span"], limit=3):
            txt = sib.get_text(strip=True)
            if txt and any(k in txt.lower() for k in ["caption", "image", "photo", "picture", "source"]):
                contexts.append(txt)
                break

        contexts = nearby(img_tag, "prev") + contexts + nearby(img_tag, "next")
        joined = re.sub(r"\s+", " ", " ".join(contexts))
        return joined[:max_chars] + ("…" if len(joined) > max_chars else "")


class ImageValidator:
    """Validates image URLs and filters out ads/placeholders."""

    AD_DOMAIN_PATTERNS = [
        re.compile(
            r"\.(doubleclick\.net|googlesyndication\.com|adservice\.google\.com|"
            r"adnetwork\.com|adnxs\.com|yieldmanager\.com|pubmatic\.com|rubiconproject\.com|"
            r"applovin\.com|taboola\.com|outbrain\.com|smartadserver\.com|zedo\.com|"
            r"pulse3d\.com|casalemedia\.com|lijit\.com|analytics\.google\.com|"
            r"connect\.facebook\.net|ads\.pinterest\.com|analytics\.twitter\.com|"
            r"bat\.bing\.com|cdn\.adsafeprotected\.com|scorecardresearch\.com|"
            r"quantserve\.com|moatads\.com)$",
            re.IGNORECASE,
        )
    ]

    GOOD_PATH_PATTERNS = [
        re.compile(r"\b(image|img|photo|picture|media|upload|content|wp-content)\b", re.IGNORECASE)
    ]
    BAD_PATH_PATTERNS = [
        re.compile(
            r"\b(placeholder|spinner|tracking|pixel|blank|spacer|clear\.gif|"
            r"transparent\.png|loading|1x1\.|\.svg$|data:image/svg)\b",
            re.IGNORECASE,
        )
    ]
    GOOD_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".gif"}

    @staticmethod
    def is_valid_image_url(url: str) -> bool:
        if not url:
            return False
        parsed = urlsplit(url)
        domain, path = parsed.netloc.lower(), parsed.path.lower()
        if any(p.search(domain) for p in ImageValidator.AD_DOMAIN_PATTERNS):
            return False
        base_path = path.split("?", 1)[0]
        if any(base_path.endswith(ext) for ext in ImageValidator.GOOD_EXTENSIONS):
            return True
        if any(p.search(base_path) for p in ImageValidator.GOOD_PATH_PATTERNS):
            if not any(bp.search(base_path) for bp in ImageValidator.BAD_PATH_PATTERNS):
                return True
        return False


# -----------------------------------------------------------------------------
# Main extractor class (Playwright‑based)
# -----------------------------------------------------------------------------


class ImageMetadataExtractor:
    """Extract image metadata from a list of URLs using Playwright."""

    def __init__(self):
        self.text_extractor = TextExtractor()
        self.image_validator = ImageValidator()
        self._seen_urls: set[str] = set()
        logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
        self.logger = logging.getLogger(__name__)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _canonicalise_url(url: str) -> str:
        parts = urlsplit(url)
        path = re.sub(r"/\d{2,4}/\d{2,4}/", "/", parts.path)
        path = re.sub(r"-\d{2,4}x\d{2,4}(?=\.\w+$)", "", path)
        query = re.sub(r"(\?|&)(w|width|h|height|size)=\d+", "", parts.query, flags=re.I)
        return urlunsplit((parts.scheme.lower(), parts.netloc.lower(), path.rstrip("/"), query, ""))

    # .................................................................
    # Playwright‑specific helpers
    # .................................................................

    def _click_load_more(self, page, timeout_ms: int = 3000, max_clicks: int = 5):
        """Click any “load more / show more” buttons until none remain."""
        clicks = 0
        selectors = [
            "button:has-text(\"load more\")",
            "button:has-text(\"show more\")",
            "a:has-text(\"load more\")",
            "a:has-text(\"show more\")",
        ]
        while clicks < max_clicks:
            found = False
            for sel in selectors:
                locator = page.locator(sel)
                try:
                    if locator.count() and locator.first.wait_for(state="visible", timeout=timeout_ms):
                        locator.first.scroll_into_view_if_needed()
                        locator.first.click()
                        clicks += 1
                        found = True
                        page.wait_for_timeout(1000)
                        self.logger.info("Playwright clicked a 'load more' button (%s/%s)", clicks, max_clicks)
                        break
                except PlaywrightTimeoutError:
                    continue
            if not found:
                break

    def _fetch_page_content(self, url: str) -> Optional[BeautifulSoup]:
        """Return BeautifulSoup of fully rendered page using Playwright."""
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-gpu"])
                context = browser.new_context(
                    viewport={"width": 1920, "height": 1080},
                    user_agent=(
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/115.0.0.0 Safari/537.36"
                    ),
                )
                page = context.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)

                # 1) Click expandable buttons
                self._click_load_more(page)

                # 2) Infinite scroll until height stabilises or max iterations
                last_height = -1
                for _ in range(20):  # safety cap
                    height = page.evaluate("document.body.scrollHeight")
                    if height == last_height:
                        break
                    last_height = height
                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    page.wait_for_timeout(1500)

                html = page.content()
                browser.close()

            return BeautifulSoup(html, "html.parser")
        except Exception as e:
            self.logger.error("Failed to fetch %s: %s", url, e)
            return None

    # .................................................................
    # Extraction logic (unchanged except for fetch implementation)
    # .................................................................

    def _extract_image_data(
        self, img_tag, page_url: str, page_title: str, page_summary: str
    ) -> Dict[str, Any]:
        """Extract metadata from a single <img>/<picture> element."""

        def parse_srcset(srcset: str) -> List[tuple[str, str]]:
            return [tuple(map(str.strip, entry.split(" "))) if " " in entry else (entry.strip(), "1x") for entry in srcset.split(",")]

        def pick_best(candidates: List[tuple[str, str]]) -> Optional[str]:
            if not candidates:
                return None

            def score(descriptor: str) -> int:
                m = re.match(r"(\d+)(w|x)", descriptor)
                return int(m.group(1)) if m else 1

            return max(candidates, key=lambda c: score(c[1]))[0]

        picture = img_tag.find_parent("picture")
        image_url: Optional[str] = None

        # Prefer highest‑res <source srcset="…"> inside <picture>
        if picture:
            for src in picture.find_all("source"):
                ss = src.get("srcset")
                if ss:
                    best = pick_best(parse_srcset(ss))
                    if best:
                        image_url = urljoin(page_url, best)
                        break

        # Fallback: srcset on <img>
        if not image_url and img_tag.get("srcset"):
            best = pick_best(parse_srcset(img_tag["srcset"]))
            if best:
                image_url = urljoin(page_url, best)

        # Fallback: plain src
        if not image_url:
            src = img_tag.get("src")
            if not src:
                return {}
            image_url = urljoin(page_url, src)

        if not self.image_validator.is_valid_image_url(image_url):
            return {}

        return {
            "image_url": image_url,
            "page_url": page_url,
            "page_title": page_title,
            "alt_text": img_tag.get("alt", "").strip(),
            "title_attribute": img_tag.get("title", "").strip(),
            "raw_caption": self.text_extractor.get_surrounding_text(img_tag),
            "page_summary": page_summary,
            "content_context": None,
            "extracted_at": datetime.utcnow().isoformat() + "Z",
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_resolution_from_url(url: str) -> tuple[int, int]:
        m = re.search(r"/(\d{2,4})/(\d{2,4})/", url)
        if m:
            return int(m.group(1)), int(m.group(2))
        m = re.search(r"-([1-9]\d{2,4})x([1-9]\d{2,4})(?=\.\w+$)", url)
        if m:
            return int(m.group(1)), int(m.group(2))
        return 0, 0

    def extract_from_url(self, url: str) -> List[Dict[str, Any]]:
        soup = self._fetch_page_content(url)
        if not soup:
            return []
        page_title = soup.title.get_text(strip=True) if soup.title else ""
        page_summary = self.text_extractor.get_page_summary(soup)
        grouped: Dict[str, List[Dict[str, Any]]] = {}

        for img in soup.find_all("img"):
            data = self._extract_image_data(img, url, page_title, page_summary)
            if not data:
                continue
            key = self._canonicalise_url(data["image_url"])
            w, h = self._parse_resolution_from_url(data["image_url"])
            data["_w"], data["_h"] = w, h
            grouped.setdefault(key, []).append(data)

        results: List[Dict[str, Any]] = []
        for group in grouped.values():
            best = max(group, key=lambda d: d["_w"] * d["_h"])
            best.pop("_w", None)
            best.pop("_h", None)
            results.append(best)
        return results

    def extract_from_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        all_grouped: Dict[str, List[Dict[str, Any]]] = {}
        for url in urls:
            soup = self._fetch_page_content(url)
            if not soup:
                continue
            page_title = soup.title.get_text(strip=True) if soup.title else ""
            page_summary = self.text_extractor.get_page_summary(soup)
            for img in soup.find_all("img"):
                data = self._extract_image_data(img, url, page_title, page_summary)
                if not data:
                    continue
                key = self._canonicalise_url(data["image_url"])
                w, h = self._parse_resolution_from_url(data["image_url"])
                data["_w"], data["_h"] = w, h
                all_grouped.setdefault(key, []).append(data)

        results: List[Dict[str, Any]] = []
        for group in all_grouped.values():
            best = max(group, key=lambda d: d["_w"] * d["_h"])
            best.pop("_w", None)
            best.pop("_h", None)
            results.append(best)
        return results


# -----------------------------------------------------------------------------
# Basic usage example (run directly)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    extractor = ImageMetadataExtractor()
    images = extractor.extract_from_url("https://www.timeout.com/singapore/attractions/the-best-singapore-attractions")
    for img in images:
        print(img["image_url"], "→", img.get("raw_caption", "[no caption]"))


2025-07-05 02:49:45,802 ERROR __main__ - Failed to fetch https://www.timeout.com/singapore/attractions/the-best-singapore-attractions: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.
