In [None]:
# If needed (Colab, fresh venv), uncomment:
# !pip install -q requests beautifulsoup4 tqdm pandas

from __future__ import annotations
import os, re, time, sys, urllib.parse, dataclasses, typing as T
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.auto import tqdm

BASE_INDEX = "https://elibrary.judiciary.gov.ph/philippinereports"
DEFAULT_HEADERS = {
    "User-Agent": "PhilReportsEPUBFetcher/1.0 (research; respectful-crawl)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

VOL_PATTERNS = [
    re.compile(r"\bVOL\.?\s*(\d{1,4})\b", re.I),
    re.compile(r"\bVol(?:ume)?\s*(\d{1,4})\b", re.I),
    re.compile(r"/philippinereports/(\d{1,4})\b", re.I),
]

@dataclasses.dataclass
class VolumeLink:
    volume_no: T.Optional[int]
    url: str
    text: str

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
def _sleep(throttle: float):
    if throttle and throttle > 0:
        time.sleep(throttle)

def fetch(url: str, session: requests.Session, timeout: float = 25.0, retries: int = 4):
    last_err = None
    for attempt in range(retries):
        try:
            resp = session.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
            if resp.status_code >= 500:
                _sleep(1.5 * (attempt + 1))
                continue
            return resp
        except requests.RequestException as e:
            last_err = e
            _sleep(1.5 * (attempt + 1))
    # final try (will raise if it fails)
    return session.get(url, headers=DEFAULT_HEADERS, timeout=timeout)

def guess_volume_no(text: str, href: str) -> T.Optional[int]:
    for hay in (text or "", href or ""):
        for rx in VOL_PATTERNS:
            m = rx.search(hay)
            if m:
                try:
                    return int(m.group(1))
                except:
                    pass
    return None

def find_volume_links(index_html: str, index_url: str, max_items: int = 0) -> list[VolumeLink]:
    soup = BeautifulSoup(index_html, "html.parser")
    links: list[VolumeLink] = []
    for a in soup.find_all("a", href=True):
        href = urllib.parse.urljoin(index_url, a["href"])
        txt = " ".join((a.get_text(strip=True) or "").split())
        if "/philippinereports" in href.lower() or "philippine reports" in txt.lower():
            vol = guess_volume_no(txt, href)
            links.append(VolumeLink(vol, href, txt))
    # dedupe by URL
    dedup = {l.url: l for l in links}
    links = list(dedup.values())
    links.sort(key=lambda x: (x.volume_no is None, x.volume_no or 10**9, x.url))
    if max_items and max_items > 0:
        links = links[:max_items]
    return links

def find_pdf_links(vol_html: str, vol_url: str) -> list[str]:
    """Find all absolute PDF links within a volume page."""
    soup = BeautifulSoup(vol_html, "html.parser")
    found = set()
    for a in soup.find_all("a", href=True):
        if ".pdf" in a["href"].lower():
            found.add(urllib.parse.urljoin(vol_url, a["href"]))
    for tag in soup.find_all(True):
        for k, v in tag.attrs.items():
            if isinstance(v, str) and ".pdf" in v.lower():
                found.add(urllib.parse.urljoin(vol_url, v))
    return sorted(found)

def sanitize(s: str) -> str:
    return re.sub(r'[\\/*?":<>|]', "_", s)

def out_filename(pdf_url: str, volume_no: T.Optional[int]) -> str:
    base = os.path.basename(urllib.parse.urlparse(pdf_url).path)
    stem = base[:-4] if base.lower().endswith(".pdf") else base
    prefix = f"PR_Vol_{volume_no}_" if volume_no is not None else "PR_Vol_unknown_"
    return sanitize(prefix + (stem or "philippine_reports")) + ".pdf"


In [27]:
def preview_index(base_url: str, throttle: float = 2.0, max_items: int = 0) -> pd.DataFrame:
    """Return candidate Philippine Reports volume links (no download)."""
    with requests.Session() as s:
        _sleep(throttle)
        r = fetch(base_url, s)
        r.raise_for_status()
        vols = find_volume_links(r.text, r.url, max_items=max_items)
    df = pd.DataFrame([dataclasses.asdict(v) for v in vols])
    if not df.empty:
        df = df.sort_values(by=["volume_no", "url"], na_position="last").reset_index(drop=True)
    return df

def filter_volumes(df: pd.DataFrame, min_vol: int | None = None, max_vol: int | None = None, only_regex: str | None = None) -> pd.DataFrame:
    cur = df.copy()
    if min_vol is not None:
        cur = cur[(cur["volume_no"].isna()) | (cur["volume_no"] >= min_vol)]
    if max_vol is not None:
        cur = cur[(cur["volume_no"].isna()) | (cur["volume_no"] <= max_vol)]
    if only_regex:
        rx = re.compile(only_regex)
        cur = cur[cur["volume_no"].astype("Int64").astype(str).str.fullmatch(rx.pattern, na=False)]
    return cur.reset_index(drop=True)

def collect_pdfs(volume_rows: pd.DataFrame, throttle: float = 2.0) -> pd.DataFrame:
    """Visit each volume page and list PDF links (no download)."""
    rows = []
    with requests.Session() as s:
        for _, r in tqdm(volume_rows.iterrows(), total=len(volume_rows), desc="Scanning volume pages"):
            _sleep(throttle)
            resp = fetch(r["url"], s)
            if resp.status_code != 200:
                rows.append({"volume_no": r["volume_no"], "volume_url": r["url"], "pdf_url": None, "status": resp.status_code})
                continue
            pdfs = find_pdf_links(resp.text, resp.url)
            if not pdfs:
                rows.append({"volume_no": r["volume_no"], "volume_url": r["url"], "pdf_url": None, "status": 404})
            else:
                for p in pdfs:
                    rows.append({"volume_no": r["volume_no"], "volume_url": r["url"], "pdf_url": p, "status": 200})
    return pd.DataFrame(rows)

def download_pdfs(pdf_rows: pd.DataFrame, out_dir: str = "./pdfs", throttle: float = 2.0) -> pd.DataFrame:
    os.makedirs(out_dir, exist_ok=True)
    results = []
    with requests.Session() as s:
        valid_rows = pdf_rows.dropna(subset=["pdf_url"])
        for _, row in tqdm(valid_rows.iterrows(), total=len(valid_rows), desc="Downloading PDFs"):
            url = row["pdf_url"]
            vol = row["volume_no"]
            fname = out_filename(url, vol)
            path = os.path.join(out_dir, fname)

            if os.path.exists(path) and os.path.getsize(path) > 0:
                results.append({"pdf_url": url, "path": path, "status": "skip_exists"})
                continue

            _sleep(throttle)
            try:
                with s.get(url, headers=DEFAULT_HEADERS, stream=True, timeout=60) as r:
                    r.raise_for_status()
                    tmp = path + ".part"
                    with open(tmp, "wb") as f:
                        for chunk in r.iter_content(chunk_size=1 << 15):
                            if chunk:
                                f.write(chunk)
                    os.replace(tmp, path)
                results.append({"pdf_url": url, "path": path, "status": "ok"})
            except Exception as e:
                results.append({"pdf_url": url, "path": path, "status": f"error: {e}"})
    return pd.DataFrame(results)

In [28]:
# 1) Preview the index (no downloads yet)
df_index = preview_index(BASE_INDEX, throttle=2.0, max_items=0)  # max_items=0 => no limit
print(f"Detected {len(df_index)} candidate volume links")
df_index.head(10)

Detected 2 candidate volume links


Unnamed: 0,volume_no,url,text
0,910.0,https://elibrary.judiciary.gov.ph/assets/pdf/p...,"PHILIPPINE REPORTS VOL. 910 (SEPTEMBER 13, 202..."
1,,https://elibrary.judiciary.gov.ph/philippinere...,A++


In [29]:
# 2) Filter (examples)
#   a) Limit to volumes 900–999
df_vols = filter_volumes(df_index, min_vol=900, max_vol=999)

#   b) Or use a regex: only volumes in the 960s
# df_vols = filter_volumes(df_index, only_regex=r"^96\d$")

df_vols.head(10)


Unnamed: 0,volume_no,url,text
0,910.0,https://elibrary.judiciary.gov.ph/assets/pdf/p...,"PHILIPPINE REPORTS VOL. 910 (SEPTEMBER 13, 202..."
1,,https://elibrary.judiciary.gov.ph/philippinere...,A++


In [30]:
# 3) Collect EPUB links (no downloads yet) — good for verification
df_pdfs = collect_pdfs(df_vols, throttle=2.0)
print("PDF link candidates:", len(df_pdfs))
df_pdfs.head(10)


Scanning volume pages: 100%|██████████| 2/2 [00:12<00:00,  6.16s/it]

EPUB link candidates: 827





Unnamed: 0,volume_no,volume_url,pdf_url,status
0,910.0,https://elibrary.judiciary.gov.ph/assets/pdf/p...,,404
1,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
2,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
3,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
4,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
5,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
6,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
7,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
8,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200
9,,https://elibrary.judiciary.gov.ph/philippinere...,https://elibrary.judiciary.gov.ph/assets/pdf/p...,200


In [31]:
df_pdfs_first5 = df_pdfs[df_pdfs["status"] == 200].head(5)

In [None]:
# 4️⃣ Download those EPUBs
dl_first5 = download_pdfs(df_pdfs_first5[df_pdfs_first5["status"] == 200],
                           out_dir="./phil_reports_pdf_first5",
                           throttle=2.0)
dl_first5.value_counts("status")


Downloading PDFs:   0%|          | 0/5 [00:00<?, ?it/s]