In [0]:
import urllib.robotparser

# robots.txt check for rabobank.jobs: may a generic crawler ("*") fetch the jobs URL?
robots_url = "https://rabobank.jobs/robots.txt"
target_url = "https://rabobank.jobs/en/jobs/"

# Handing the URL to the constructor is the same as calling set_url() afterwards.
rp = urllib.robotparser.RobotFileParser(robots_url)
rp.read()  # network call: downloads and parses robots.txt

print("robots.txt URL:", robots_url)
print(
    "Allowed to fetch job detail path?",
    rp.can_fetch("*", target_url),
)


In [0]:
import urllib.robotparser

# robots.txt check for werkenbij.devolksbank.nl: may a generic crawler ("*") fetch the search page?
robots_url = "https://werkenbij.devolksbank.nl/robots.txt"
target_url = "https://werkenbij.devolksbank.nl/nl/nl/search-results"

# Handing the URL to the constructor is the same as calling set_url() afterwards.
rp = urllib.robotparser.RobotFileParser(robots_url)
rp.read()  # network call: downloads and parses robots.txt

print("robots.txt URL:", robots_url)
print("Target URL:", target_url)
print(
    "Allowed?",
    rp.can_fetch("*", target_url),
)


In [0]:
# Databricks Notebook — De Volksbank vacancy scraper (shares its output schema with the ABN AMRO and ING scrapers)
# Filter: Data Engineer (keyword patterns, EN + NL)
# Output schema (same as ABN AMRO): vacancy_id, title, location_guess, url, sections_text, description_text, date_taken_utc
# Saves:
# 1) per-source CSV: devolksbank_data_engineerYYYYMMDD.csv
# 2) no combined CSV is written by the cells below
# Volume target:
# /Volumes/skills_intelligence/00_job_posting_landing_zone/financial_sector/data_engineer/

%pip install -q requests beautifulsoup4 pandas lxml


In [0]:
%restart_python

In [0]:
import re
import time
import json
import html as ihtml
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timezone

# =========================
# CONFIG
# =========================
# Site root; fetch_vacancy() builds "<BASE>/nl/nl/job/<id>/" URLs from it.
BASE = "https://werkenbij.devolksbank.nl"
# User-Agent sent with every request.
# NOTE(review): the contact address looks like a placeholder — replace it with a real one before running.
UA = "MarkusJobScraper/1.0 (+contact: youremail@example.com)"

# Default headers installed on the shared requests session below (Dutch preferred, English as fallback).
HEADERS = {
    "User-Agent": UA,
    "Accept-Language": "nl-NL,nl;q=0.9,en;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# We'll scan numeric job IDs like: /nl/nl/job/6217/Some-Title
# Both bounds are inclusive (the scan loop iterates range(START_ID, END_ID + 1)).
START_ID = 6000
END_ID   = 8500

# Pause between requests, in seconds (passed to time.sleep in the scan loop).
SLEEP_SEC = 0.35
# The scan loop stops early after this many misses in a row.
MAX_CONSECUTIVE_MISSES = 200

# =========================
# FILTER: DATA ENGINEER (EN + NL)
# =========================
# Regexes evaluated case-insensitively by is_relevant() against "<title> <description>".
# One match is enough, so the stack-signal patterns (cloud vendors, tools) also accept
# postings that merely mention such a technology anywhere in the description.
KEYWORD_PATTERNS = [
    # core titles
    r"\bdata engineer\b",
    r"\bsenior data engineer\b",
    r"\blead data engineer\b",
    r"\bprincipal data engineer\b",
    r"\bdata engineering\b",
    r"\banalytics engineer\b",
    r"\bdata platform engineer\b",

    # NL-ish variants (some companies still mix English title with NL posting)
    # This pattern also covers the plain "data engineer" spelling above, plus the hyphenated form.
    r"\bdata[- ]engineer\b",

    # responsibilities / stack signals
    r"\betl\b|\belt\b",
    r"\bdata pipeline(s)?\b",
    r"\bdata platform\b",
    r"\bdata warehouse\b|\bdwh\b",
    r"\blakehouse\b|\bdelta lake\b",
    r"\bspark\b|\bpyspark\b",
    r"\bdatabricks\b",
    r"\bairflow\b",
    r"\bkafka\b",
    r"\bdbt\b",
    r"\bazure\b|\baws\b|\bgcp\b",
    r"\bsynapse\b|\bfabric\b",
    r"\bsnowflake\b|\bbigquery\b",
    r"\bterraform\b",
]

# =========================
# OUTPUT (Unity Catalog Volume)
# =========================
VOLUME_DIR = "/Volumes/skills_intelligence/00_job_posting_landing_zone/financial_sector/data_engineer"
# Date stamp (UTC, YYYYMMDD) taken once when this cell runs; it becomes part of the file name.
FILE_DATE = datetime.now(timezone.utc).strftime("%Y%m%d")
OUT_CSV_VOLUME = f"{VOLUME_DIR}/devolksbank_data_engineer{FILE_DATE}.csv"

print("CSV target:", OUT_CSV_VOLUME)

# =========================
# SESSION
# =========================
# One shared session so every request carries HEADERS.
session = requests.Session()
session.headers.update(HEADERS)

def norm(s: str) -> str:
    """Collapse every whitespace run to a single space and trim both ends.

    Falsy input (None, "") yields an empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def is_relevant(title: str, description: str) -> bool:
    """Tell whether a vacancy matches the keyword filter.

    The title and description are joined with a single space and every regex in
    KEYWORD_PATTERNS is tried case-insensitively; the first hit decides.
    """
    haystack = f"{title} {description}"
    for pattern in KEYWORD_PATTERNS:
        if re.search(pattern, haystack, flags=re.I):
            return True
    return False

def strip_html_to_text(html_str: str) -> str:
    """Turn an HTML fragment into whitespace-normalised plain text.

    Entities are unescaped first, then the result is parsed with the lxml
    parser and its text nodes are joined with spaces. Falsy input returns "".
    """
    if html_str:
        decoded = ihtml.unescape(html_str)
        parsed = BeautifulSoup(decoded, "lxml")
        return norm(parsed.get_text(" "))
    return ""

def get_jobposting_ldjson(soup: BeautifulSoup) -> dict | None:
    """Find JobPosting JSON-LD (best for JS-heavy pages)."""
    scripts = soup.find_all("script", attrs={"type": "application/ld+json"})
    for sc in scripts:
        raw = sc.string or sc.get_text() or ""
        raw = raw.strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except Exception:
            continue

        candidates = data if isinstance(data, list) else [data]
        for item in candidates:
            if isinstance(item, dict) and item.get("@type") == "JobPosting":
                return item
    return None

def location_from_jobposting(job: dict) -> str:
    """Build a human-readable location string from a JobPosting's jobLocation.

    When jobLocation is a list only its first entry is used. The address may be:
      * a dict (PostalAddress): streetAddress, addressLocality, addressRegion,
        postalCode and addressCountry are joined with ", " in that order,
        skipping empty values. A component that is itself an object (for
        example addressCountry given as a Country node) contributes its
        "name" instead of a raw dict repr;
      * a plain string: returned as is (whitespace-normalised).

    Args:
        job: JobPosting dict as parsed from JSON-LD.

    Returns:
        The normalised location, or "" when nothing usable is present.
    """

    def _as_text(value) -> str:
        # Nested objects carry their label under "name".
        if isinstance(value, dict):
            value = value.get("name")
        return str(value) if value else ""

    jl = job.get("jobLocation")
    if isinstance(jl, list) and jl:
        jl = jl[0]
    if not isinstance(jl, dict):
        return ""

    addr = jl.get("address")
    if isinstance(addr, str):
        return norm(addr)
    if not isinstance(addr, dict):
        return ""

    keys = ["streetAddress", "addressLocality", "addressRegion", "postalCode", "addressCountry"]
    parts = [_as_text(addr.get(k)) for k in keys]
    return norm(", ".join(p for p in parts if p))

def fetch_vacancy(vac_id: int) -> dict | None:
    """
    Fetch one vacancy page by numeric ID and map it onto the shared output schema.

    Return dict schema IDENTICAL to ABN/ING:
    vacancy_id, title, location_guess, url, sections_text, description_text, date_taken_utc

    Title, description and location come from the page's JobPosting JSON-LD when
    present; otherwise the first <h1> and the visible text of <main> (or of the
    whole document) are used.

    Args:
        vac_id: numeric job ID inserted into "<BASE>/nl/nl/job/<id>/".

    Returns:
        The record dict, or None when the response status is not 200 or no
        title could be determined.

    Raises:
        Whatever ``session.get`` raises (timeouts, connection errors); those
        are not handled here.
    """
    date_taken_utc = datetime.now(timezone.utc)

    url = f"{BASE}/nl/nl/job/{vac_id}/"
    r = session.get(url, allow_redirects=True, timeout=30)
    if r.status_code != 200:
        return None

    soup = BeautifulSoup(r.text, "lxml")
    # Content container used by both the text fallback and the section split.
    main = soup.find("main") or soup

    # Prefer JSON-LD JobPosting
    job = get_jobposting_ldjson(soup)

    title = ""
    description_text = ""
    location_guess = ""

    if job:
        title = norm(job.get("title", ""))
        description_text = strip_html_to_text(job.get("description", ""))
        location_guess = location_from_jobposting(job)

    # Fallback: use visible text
    if not title:
        h1 = soup.find("h1")
        title = norm(h1.get_text(" ")) if h1 else ""
    if not description_text:
        description_text = norm(main.get_text(" "))

    if not title:
        return None

    # sections_text: best-effort split by headings
    sections = []
    for hdr in main.find_all(["h2", "h3"]):
        header = norm(hdr.get_text(" "))
        if not header:
            continue
        parts = []
        captured = set()  # id() of blocks already added under this heading
        # find_all_next() walks the rest of the document in order, descendants included.
        for sib in hdr.find_all_next():
            if sib.name in ["h2", "h3"]:
                break
            if sib.name not in ["p", "ul", "ol"]:
                continue
            ancestors = list(sib.parents)
            # Stop once the walk has left the content container, so trailing
            # page chrome (footer, cookie text) is not glued to the last section.
            if not any(anc is main for anc in ancestors):
                break
            # A block nested in one we already captured (e.g. <ul><li><p>) would
            # repeat text that the outer block's get_text() already included.
            if any(id(anc) in captured for anc in ancestors):
                continue
            captured.add(id(sib))
            parts.append(norm(sib.get_text(" ")))
        content = norm(" ".join([p for p in parts if p]))
        # Very short bodies are dropped as noise.
        if len(content) >= 80:
            sections.append(f"{header}: {content}")
    sections_text = "\n\n".join(sections)

    # Heuristic location if still empty
    if not location_guess:
        txt = description_text[:1500].lower()
        for city in [
            "utrecht", "amsterdam", "den haag", "the hague", "rotterdam", "eindhoven",
            "hybride", "hybrid", "nederland", "netherlands"
        ]:
            if city in txt:
                location_guess = city.title()
                break

    return {
        "vacancy_id": str(vac_id),
        "title": title,
        "location_guess": location_guess,
        "url": r.url,  # final URL after redirects
        "sections_text": sections_text,
        "description_text": description_text,
        "date_taken_utc": date_taken_utc,
    }

# =========================
# RUN SCRAPE (ID SCAN)
# =========================
# Walks START_ID..END_ID (inclusive), keeps vacancies accepted by is_relevant(),
# and stops early after MAX_CONSECUTIVE_MISSES misses in a row.

# Exact column order (same as ABN/ING)
OUTPUT_COLUMNS = [
    "vacancy_id",
    "title",
    "location_guess",
    "url",
    "sections_text",
    "description_text",
    "date_taken_utc",
]
MAX_ERROR_LOGS = 10  # only the first few errors are printed, to keep the output readable

rows = []
misses = 0        # consecutive misses; reset whenever a valid page is found
scanned = 0
valid_pages = 0
errors = 0        # fetches that raised (network or parsing), as opposed to plain misses

print(f"\nScanning De Volksbank job IDs {START_ID}..{END_ID} for Data Engineer ...\n")

for vac_id in range(START_ID, END_ID + 1):
    scanned += 1
    try:
        rec = fetch_vacancy(vac_id)
    except Exception as exc:
        # Still counted as a miss so the scan keeps going, but reported: otherwise
        # an outage cannot be told apart from "ID does not exist".
        rec = None
        errors += 1
        if errors <= MAX_ERROR_LOGS:
            print(f"⚠️ error on ID {vac_id}: {type(exc).__name__}: {exc}")

    if rec is None:
        misses += 1
        if misses % 50 == 0:
            print(f"… up to ID {vac_id} | consecutive misses={misses} | matches={len(rows)}")
        if misses >= MAX_CONSECUTIVE_MISSES:
            print(f"Stopping early: {misses} consecutive misses (likely outside active ID range).")
            break
    else:
        valid_pages += 1
        misses = 0

        if is_relevant(rec["title"], rec["description_text"]):
            print(f"✅ MATCH {vac_id}: {rec['title']}")
            rows.append(rec)
        else:
            print(f"skip {vac_id}: {rec['title']}")

    time.sleep(SLEEP_SEC)

# Passing `columns` fixes the order and keeps the schema even when nothing matched.
df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)

display(df)
print("\nDone.")
print(f"- Scanned IDs: {scanned}")
print(f"- Valid pages found: {valid_pages}")
print(f"- Matches (Data Engineer): {len(df)}")
print(f"- Fetch errors (counted as misses): {errors}")

# =========================
# SAVE CSV TO VOLUME
# =========================
# Writes the matches to OUT_CSV_VOLUME; nothing is written when there are none.
if df is None or df.empty:
    print("No matches found — nothing to save.")
else:
    try:
        dbutils.fs.mkdirs(VOLUME_DIR)
    except Exception as exc:
        # Best effort: the directory may already exist. Report the failure so that
        # a later to_csv error can be traced back to its cause.
        print(f"⚠️ Could not create {VOLUME_DIR}: {type(exc).__name__}: {exc}")

    df.to_csv(OUT_CSV_VOLUME, index=False)
    print("✅ Saved CSV to Volume:", OUT_CSV_VOLUME)
    display(dbutils.fs.ls(VOLUME_DIR))