In [1]:
import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup, Tag
from dateutil import parser as dateparser
from typing import Any, Dict, List, Optional

### Class to scrape the sitemap XML feeds and collate the article URLs

In [None]:
# from __future__ import annotations

# import asyncio
# import re
# from dataclasses import dataclass
# from datetime import datetime, timezone
# from pathlib import Path
# from typing import Iterable, List

# import httpx                     # pip install httpx
# from bs4 import BeautifulSoup     # pip install beautifulsoup4 lxml

# MONTH_FEED_RE = re.compile(r"/(\d{4})/(\d{2})/feeds\.xml$")   # keep only YYYY/MM feeds

# @dataclass
# class LinkExtractor:
#     index_url: str
#     out_dir: Path | str = "/home/leeeefun681/volume/eefun/webscraping/sitemap/sitemap_scrape/data/straitsTimes/st_sitemaps"
#     timeout: float = 15.0
#     polite_delay: float = 1.0
#     max_concurrency: int = 5

#     # ────────────────────────── public helpers ──────────────────────────────
#     async def dump_async(self) -> None:
#         """Asynchronously download every past month and save to .txt files."""
#         self.out_dir = Path(self.out_dir)
#         self.out_dir.mkdir(parents=True, exist_ok=True)

#         async with httpx.AsyncClient(timeout=self.timeout) as client:
#             month_feeds = await self._sitemap_links(client)

#             if not month_feeds:
#                 print("No month feeds found (or all filtered out).")
#                 return

#             sem = asyncio.Semaphore(self.max_concurrency)
#             tasks = [
#                 asyncio.create_task(self._process_month(feed_url, client, sem))
#                 for feed_url in month_feeds
#             ]
#             await asyncio.gather(*tasks)

#     def dump(self):
#         try:
#             loop = asyncio.get_running_loop()
#         except RuntimeError:           # no loop → we're in a vanilla script
#             loop = None

#         if loop and loop.is_running():
#             # notebook / web-server context → create and return a Task
#             return asyncio.create_task(self.dump_async())
#         else:
#             # classic script → safe to spin up a fresh loop
#             asyncio.run(self.dump_async())

#     # ────────────────────────── internals ───────────────────────────────────
#     async def _sitemap_links(self, client: httpx.AsyncClient) -> List[str]:
#         """Return monthly feeds, filtering out current month & sections.xml."""
#         r = await client.get(self.index_url)
#         r.raise_for_status()

#         soup = BeautifulSoup(r.content, "xml")
#         raw_links = [
#             loc.get_text(strip=True)
#             for loc in soup.find_all("loc")
#             if loc.parent.name == "sitemap"
#         ]

#         # figure out YYYY/MM for 'today' (Singapore time is irrelevant for month test)
#         y_now, m_now = datetime.now(timezone.utc).year, datetime.now(timezone.utc).month

#         feeds: list[str] = []
#         for link in raw_links:
#             m = MONTH_FEED_RE.search(link)
#             if not m:                          # skips sections.xml & anything odd
#                 continue
#             yr, mo = int(m.group(1)), int(m.group(2))
#             if (yr, mo) == (y_now, m_now):     # skip current month
#                 continue
#             feeds.append(link)

#         return feeds

#     async def _month_urls(
#         self, feed_url: str, client: httpx.AsyncClient
#     ) -> Iterable[str]:
#         """Return every <loc> article URL from a single feeds.xml."""
#         r = await client.get(feed_url)
#         r.raise_for_status()
#         soup = BeautifulSoup(r.content, "xml")
#         return (loc.get_text(strip=True) for loc in soup.find_all("loc"))

#     async def _process_month(
#         self,
#         feed_url: str,
#         client: httpx.AsyncClient,
#         sem: asyncio.Semaphore,
#     ) -> None:
#         """Download one month feed, write out its TXT file."""
#         async with sem:                 # limit concurrent requests
#             try:
#                 urls = list(await self._month_urls(feed_url, client))
#             except httpx.HTTPError as e:
#                 print("   ERR ·", feed_url, "→", e)
#                 return

#             if not urls:
#                 print("   0   · (empty) ·", feed_url)
#                 return

#             # derive filename st_YYYY_MM.txt from the URL
#             m = MONTH_FEED_RE.search(feed_url)
#             fname = f"st_{m.group(1)}_{m.group(2)}.txt"
#             outpath = self.out_dir / fname
#             outpath.write_text("\n".join(urls), encoding="utf-8")

#             print(f"{len(urls):5d} · {outpath.relative_to(self.out_dir)}")
#             await asyncio.sleep(self.polite_delay)  # respectful crawl pace

In [None]:
# extractor = LinkExtractor(
#     index_url="https://www.straitstimes.com/sitemap.xml",
#     timeout=10,            # adjust as desired
#     polite_delay=0.5,      # half-second between requests
#     max_concurrency=8,     # higher = faster, but stay polite!
# )
# extractor.dump()           # sync call; runs an async loop under the hood

### Class to scrape a single article

In [None]:
import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup, Tag
from dateutil import parser as dateparser
from typing import Any, Dict, List, Optional


class ArticleScraper:
    """Scrape the ``<article>`` element of a page for its images and captions.

    Each image is returned as its own JSON-serialisable dict with the keys
    ``site_title``, ``publish_date``, ``image_url``, ``alt_text`` and
    ``caption``.
    """

    # Attributes that may hold a single image URL, in priority order.
    _SRC_ATTRS = ("src", "data-src", "data-original")
    # Attributes that may hold a comma-separated list of "URL descriptor" pairs.
    _SRCSET_ATTRS = ("srcset", "data-srcset")
    # Class names that suggest an element holds a caption or photo credit.
    _CAPTION_CLASS_RE = re.compile(r"(caption|credit)", re.I)

    # NOTE: ``Tag`` annotations are quoted so they are not evaluated when the
    # class body is executed.

    def _extract_image_src(self, img_tag: "Tag", page_url: str) -> Optional[str]:
        """Return the absolute URL of the image referenced by ``img_tag``.

        Args:
            img_tag: The ``<img>`` element (anything exposing ``.get(attr)``).
            page_url: URL of the page, used to resolve relative references.

        Returns:
            The resolved image URL, or ``None`` when the tag carries no usable
            source attribute.
        """
        candidates: List[str] = []

        for attr in self._SRC_ATTRS:
            value = (img_tag.get(attr) or "").strip()
            if value:
                candidates.append(value)

        for attr in self._SRCSET_ATTRS:
            value = (img_tag.get(attr) or "").strip()
            if not value:
                continue
            # Take the URL of the first srcset entry ("url 480w, url2 1200w").
            first_entry = value.split(",")[0].split()
            # A data: URI contains commas, so a split one would be truncated;
            # never use those as candidates.
            if first_entry and not first_entry[0].lower().startswith("data:"):
                candidates.append(first_entry[0])

        if not candidates:
            return None

        # Lazy-loading pages often put an inline ``data:`` placeholder in
        # ``src`` and the real location in ``data-src``; prefer the real one,
        # but keep the placeholder if it is all we have.
        chosen = next(
            (c for c in candidates if not c.lower().startswith("data:")),
            candidates[0],
        )
        return urljoin(page_url, chosen)

    def _extract_caption(self, img_tag: "Tag") -> Optional[str]:
        """Return the best-guess caption for ``img_tag``.

        Heuristics are tried from most to least reliable and the first
        non-empty text wins:

        1. ``<figcaption>`` inside the enclosing ``<figure>``.
        2. The immediately adjacent sibling tag (``figcaption``/``p``/``span``).
        3. Any ancestor (below ``<article>``/``<body>``) whose class name
           mentions "caption" or "credit".
        4. Nearby block-level siblings, forward then backward.
        5. The image's ``alt`` text.

        Returns:
            The caption text, or ``None`` when nothing suitable is found.
        """
        # 1. Ideal case: <figure> -> <figcaption>
        figure = img_tag.find_parent("figure")
        if figure is not None:
            figcaption = figure.find("figcaption")
            if figcaption is not None:
                text = figcaption.get_text(" ", strip=True)
                if text:
                    return text

        # 2. Immediate sibling tags (<figcaption>, <p>, <span>)
        for sibling in (img_tag.find_next_sibling(), img_tag.find_previous_sibling()):
            if sibling is not None and sibling.name in {"figcaption", "p", "span"}:
                text = sibling.get_text(" ", strip=True)
                if text:
                    return text

        # 3. Any ancestor with a caption-like class
        ancestor = img_tag.parent
        while ancestor is not None and ancestor.name not in {"article", "body"}:
            classes = ancestor.get("class") or []
            if isinstance(classes, str):
                # Some parsers return the raw attribute string, not a list.
                classes = classes.split()
            if any(self._CAPTION_CLASS_RE.search(name) for name in classes):
                text = ancestor.get_text(" ", strip=True)
                if text:
                    return text
            ancestor = ancestor.parent

        # 4. Block-level siblings, giving up in each direction after three
        #    candidate tags turned out to be empty.
        for siblings in (img_tag.next_siblings, img_tag.previous_siblings):
            empty_seen = 0
            for sibling in siblings:
                # Siblings also include bare text nodes; only inspect tags.
                if isinstance(sibling, Tag) and sibling.name in {
                    "p",
                    "div",
                    "figcaption",
                    "span",
                }:
                    text = sibling.get_text(" ", strip=True)
                    if text:
                        return text
                    empty_seen += 1
                    if empty_seen == 3:
                        break

        # 5. Last resort: alt text
        alt = (img_tag.get("alt") or "").strip()
        return alt or None

    @staticmethod
    def _parse_date(raw: Optional[str]) -> Optional[str]:
        """Parse ``raw`` into an ISO-8601 string, or ``None`` if unparsable."""
        if not raw or not raw.strip():
            return None
        try:
            return dateparser.parse(raw).isoformat()
        except (ValueError, OverflowError):
            # dateutil's ParserError subclasses ValueError; a malformed date
            # must not abort the whole scrape.
            return None

    def _extract_publish_date(self, article: "Tag", soup: "BeautifulSoup") -> Optional[str]:
        """Return the article's publication date as ISO-8601, if discoverable.

        Sources are tried in order: ``<time datetime=...>``, the text of the
        ``<time>`` tag, then ``<meta property="article:published_time">``.
        """
        raw_values: List[Optional[str]] = []

        time_tag = article.find("time")
        if time_tag is not None:
            raw_values.append(time_tag.get("datetime"))
            raw_values.append(time_tag.get_text(strip=True))

        meta = soup.find("meta", {"property": "article:published_time"})
        if meta is not None:
            raw_values.append(meta.get("content"))

        for raw in raw_values:
            iso = self._parse_date(raw)
            if iso:
                return iso
        return None

    def scrape(self, url: str, timeout: float = 15) -> List[Dict[str, Any]]:
        """Download ``url`` and return one record per image in its ``<article>``.

        Args:
            url: Address of the article page.
            timeout: Seconds to wait for the HTTP response (default 15).

        Returns:
            A list of dicts with the keys ``site_title``, ``publish_date``,
            ``image_url``, ``alt_text`` and ``caption``. Images without a
            usable source are skipped.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            RuntimeError: If the page contains no ``<article>`` element.
        """
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        article = soup.find("article")
        if not article:
            raise RuntimeError("No <article> tag found")

        # Title: article <h1>, else the page <title>, else a placeholder.
        h1 = article.find("h1")
        title = h1.get_text(strip=True) if h1 else ""
        if not title and soup.title:
            # get_text() is used instead of .string, which is None when the
            # <title> is empty or contains nested markup.
            title = soup.title.get_text(strip=True)
        title = title or "(untitled)"

        pub_date = self._extract_publish_date(article, soup)

        # Collect images
        results: List[Dict[str, Any]] = []
        for img in article.find_all("img"):
            src = self._extract_image_src(img, url)
            if not src:
                continue

            results.append(
                {
                    "site_title": title,
                    "publish_date": pub_date,
                    "image_url": src,
                    "alt_text": (img.get("alt") or "").strip() or None,
                    "caption": self._extract_caption(img),
                }
            )

        return results

In [None]:
# Demo run: scrape one known article and print each image record on its own line.
ARTICLE_URL = (
    "https://www.straitstimes.com/opinion/budget-2015-beware-the-trust-fund-kids-mindset"
)

scraper = ArticleScraper()
images = scraper.scrape(ARTICLE_URL)
for img in images:
    print(img)