## Imports

In [1]:
import json
import re
import time
import hashlib
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

## Setup

In [5]:
# Output directory, created up front; the page cache lives underneath it.
OUT_DIR = Path("out_afp_verificat")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Site root and the listing pages that crawling starts from.
BASE = "https://verificat.afp.com"
SEED_LISTS = [f"{BASE}/list/Romania"]

# Pacing and resilience knobs for HTTP requests.
REQUEST_DELAY_SEC = 2.8  # seconds
MAX_RETRIES = 4          # attempts per URL
TIMEOUT = 30             # passed as the `timeout` of each request

# Browser-like request headers applied to every call made through SESSION.
# NOTE(review): advertising "br" means the server may answer with Brotli;
# requests can only decode that when a Brotli package is installed -- confirm
# the environment has one, or drop "br" from Accept-Encoding.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "ro-RO,ro;q=0.9,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": f"{BASE}/",
}

# One shared session so connections and headers are reused across requests.
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

## Utils

In [3]:
def sha1(s: str) -> str:
    """Return the hex SHA-1 digest of *s* encoded as UTF-8.

    A falsy *s* (empty string or ``None``) is hashed as the empty string.
    """
    text = s if s else ""
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def md5(s: str) -> str:
    """Return the hex MD5 digest of *s* encoded as UTF-8.

    A falsy *s* (empty string or ``None``) is hashed as the empty string.
    """
    text = s if s else ""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def fetch_html(url: str, cache_dir: Path = OUT_DIR / "cache") -> str:
    """Return the HTML for *url*, backed by an on-disk cache.

    The cache file name is the SHA-1 of the URL. A cached copy is returned
    without touching the network. Otherwise the page is requested through
    ``SESSION`` up to ``MAX_RETRIES`` times, pausing ``REQUEST_DELAY_SEC * n``
    seconds after the n-th failed attempt (but not after the last one). A
    successful body is written to the cache, then the function pauses
    ``REQUEST_DELAY_SEC`` seconds before returning.

    Args:
        url: URL to download.
        cache_dir: Directory holding cached pages; created if missing.

    Returns:
        The response body as text.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
        OSError: If the cache file cannot be written (not retried, because a
            disk problem is not fixed by re-downloading).
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / f"{sha1(url)}.html"
    if cache_path.exists():
        return cache_path.read_text(encoding="utf-8", errors="ignore")

    last_err: Optional[Exception] = None
    for attempt in range(1, MAX_RETRIES + 1):
        # Only the network round-trip is guarded; anything else should surface.
        try:
            r = SESSION.get(url, timeout=TIMEOUT)
            if r.status_code == 403:
                raise RuntimeError(f"403 Forbidden for {url}")
            r.raise_for_status()
        except (requests.RequestException, RuntimeError) as e:
            last_err = e
            # Back off before the next try, but do not stall before giving up.
            if attempt < MAX_RETRIES:
                time.sleep(REQUEST_DELAY_SEC * attempt)
            continue

        html = r.text
        # Write to a sibling temp file and rename, so an interrupted write
        # never leaves a truncated file that later counts as a cache hit.
        tmp_path = cache_path.with_name(cache_path.name + ".tmp")
        tmp_path.write_text(html, encoding="utf-8")
        tmp_path.replace(cache_path)
        time.sleep(REQUEST_DELAY_SEC)
        return html

    raise RuntimeError(
        f"Failed to fetch {url} after {MAX_RETRIES} retries. Last error: {last_err}"
    ) from last_err

# Matches a full article URL of the form
# http(s)://verificat.afp.com/doc.afp.com.<alphanumeric id>, case-insensitively.
DOC_URL_RE = re.compile(
    r"^https?://verificat\.afp\.com/doc\.afp\.com\.[A-Za-z0-9]+$",
    flags=re.IGNORECASE,
)

def normalize_url(href: str) -> str:
    """Turn a raw ``href`` value into a fragment-free URL string.

    Surrounding whitespace is removed. Protocol-relative links (``//host/...``)
    get an ``https:`` scheme, root-relative links (``/path``) are prefixed with
    ``BASE``, and anything else is left as written. Everything from the first
    ``#`` onward is dropped. An empty or ``None`` input yields ``""``.
    """
    cleaned = href.strip() if href else ""
    if cleaned == "":
        return ""

    if cleaned[:2] == "//":
        cleaned = f"https:{cleaned}"
    elif cleaned[:1] == "/":
        cleaned = f"{BASE}{cleaned}"

    without_fragment, _, _ = cleaned.partition("#")
    return without_fragment

def extract_doc_links(html: str) -> List[str]:
    """Return the article URLs linked from *html*, de-duplicated in page order.

    Every ``<a href>`` is normalised with ``normalize_url`` and kept only when
    the result matches ``DOC_URL_RE``.
    """
    soup = BeautifulSoup(html, "lxml")
    normalized = (
        normalize_url(anchor["href"]) for anchor in soup.find_all("a", href=True)
    )
    # dict keys keep first-seen order, so this drops repeats without reordering.
    unique = dict.fromkeys(u for u in normalized if DOC_URL_RE.match(u))
    return list(unique)

def find_see_more_url(html: str) -> Optional[str]:
    """Locate the link leading to the next batch of results in *html*.

    The first ``<a href>`` whose whitespace-collapsed, lower-cased text contains
    "vezi mai mult" or "see more" wins. Failing that, the first ``<a href>``
    with a ``rel`` value containing "next" is used. The chosen href is passed
    through ``normalize_url``. Returns ``None`` when neither kind of link exists.
    """
    soup = BeautifulSoup(html, "lxml")
    markers = ("vezi mai mult", "see more")

    for anchor in soup.find_all("a", href=True):
        # split/join collapses all runs of whitespace to single spaces.
        label = " ".join((anchor.get_text() or "").split()).lower()
        if any(marker in label for marker in markers):
            return normalize_url(anchor["href"])

    def _rel_has_next(value):
        return value and "next" in value

    next_anchor = soup.find("a", attrs={"rel": _rel_has_next}, href=True)
    return normalize_url(next_anchor["href"]) if next_anchor else None

def discover_urls(seed_url: str, max_steps: int = 80) -> List[str]:
    """Walk a listing from *seed_url* and collect the article URLs it links to.

    Each page is fetched with ``fetch_html``, its article links are gathered
    with ``extract_doc_links``, and the walk continues to the URL returned by
    ``find_see_more_url``. It stops when there is no next URL, when the next URL
    was already visited, or after *max_steps* pages.

    Returns:
        The collected URLs, de-duplicated in first-seen order.
    """
    visited = set()
    collected: List[str] = []
    current = seed_url
    steps_left = max_steps

    while steps_left > 0 and current and current not in visited:
        steps_left -= 1
        visited.add(current)

        page = fetch_html(current)
        collected += extract_doc_links(page)

        current = find_see_more_url(page)
        if not current:
            break
        # Short extra pause between listing pages.
        time.sleep(0.4)

    return list(dict.fromkeys(collected))

## URL Discovery

In [6]:
# Crawl every seed listing; a failing seed is reported and skipped so the
# remaining seeds still run.
discovered = []
for seed in SEED_LISTS:
    try:
        discovered += discover_urls(seed, max_steps=80)
    except Exception as e:
        print(f"[WARN] seed failed: {seed} -> {e}")

# Drop repeats across seeds while keeping first-seen order.
discovered = list(dict.fromkeys(discovered))
# Last expression of the cell: shows the count and a ten-item preview.
len(discovered), discovered[:10]

[WARN] seed failed: https://verificat.afp.com/list/Romania -> Failed to fetch https://verificat.afp.com/list/Romania after 4 retries. Last error: 403 Forbidden for https://verificat.afp.com/list/Romania


(0, [])