In [0]:
import urllib.robotparser
from urllib.parse import urljoin

# Site root, the listing page we want to crawl, and the User-Agent we announce.
BASE = "https://www.werkenbijabnamro.nl/"
TARGET = "https://www.werkenbijabnamro.nl/vacatures"
UA = "MarkusJobScraper/1.0 (+contact: youremail@example.com)"

# Build the robots.txt URL from the site root, hand it to the parser at
# construction time, then download and parse the rules.
robots_url = urljoin(BASE, "robots.txt")
rp = urllib.robotparser.RobotFileParser(robots_url)
rp.read()

# Show which rules file was consulted and whether UA may fetch TARGET.
print("robots.txt:", robots_url)
print("allowed?", rp.can_fetch(UA, TARGET))

In [0]:
import asyncio
import re
import time
import pandas as pd
from urllib.parse import urlparse
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

# Listing page where the crawl starts.
START_URL = "https://www.werkenbijabnamro.nl/vacatures"
KEYWORD = "data analyst"          # can be changed, e.g. "data", "analytics", etc.
MAX_PAGES = 50                    # safety cap on the number of listing pages visited
DELAY_SEC = 0.8                   # politeness: pause between page/detail requests

def normalize_space(s: str) -> str:
    """Collapse each whitespace run in ``s`` to one space and trim both ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not s:
        return ""
    squeezed = re.sub(r"\s+", " ", s)
    return squeezed.strip()

def same_domain(url: str, domain: str) -> bool:
    """Return True when the host of ``url`` is ``domain`` or one of its subdomains.

    The check uses the parsed hostname (lower-cased, without port or
    credentials) and requires a label boundary, so a look-alike host such as
    ``evilexample.com`` is not treated as part of ``example.com``.

    Args:
        url: Absolute URL to test.
        domain: Domain to match, e.g. ``"werkenbijabnamro.nl"``.

    Returns:
        True on a match. False for unparsable input, URLs without a host,
        or an empty ``domain``.
    """
    try:
        wanted = domain.strip().lower().strip(".")
        host = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # Malformed input (e.g. a broken IPv6 literal, non-string arguments)
        # is simply "not our domain" rather than an error for the caller.
        return False

    if not wanted or not isinstance(host, str) or not host:
        return False

    host = host.rstrip(".")
    return host == wanted or host.endswith("." + wanted)

async def extract_job_detail(page, url: str, min_desc_chars: int = 200) -> dict:
    """Open a vacancy page and scrape its title, description and a location hint.

    Args:
        page: Playwright page object used for the navigation and queries.
        url: Absolute URL of the vacancy detail page.
        min_desc_chars: Minimum normalized text length for a container to be
            accepted as the description. Containers are tried from most to
            least specific; if none reaches this length, the longest text
            seen is used instead.

    Returns:
        Dict with keys ``url``, ``title``, ``location_guess`` and
        ``description``. Fields that could not be extracted are empty strings.

    Raises:
        Whatever ``page.goto`` / ``page.wait_for_timeout`` raise (for example a
        navigation timeout). Failures while extracting individual fields are
        swallowed on purpose (best effort).
    """
    await page.goto(url, wait_until="networkidle", timeout=60000)
    await page.wait_for_timeout(int(DELAY_SEC * 1000))

    # Title: the first <h1> on the page, if any.
    # `except Exception` (not a bare except) so task cancellation and
    # KeyboardInterrupt are not swallowed inside this coroutine.
    title = ""
    try:
        h1 = await page.query_selector("h1")
        if h1:
            title = normalize_space(await h1.inner_text())
    except Exception:
        pass

    # Description: selectors are ordered from most to least specific. The
    # first one with enough text wins, so "body" (which also contains the
    # menu and footer) is only used when nothing more specific qualifies.
    desc = ""
    longest_seen = ""
    for sel in ["main article", "main", "article", "[data-testid*='description']", ".vacancy", ".job", "body"]:
        try:
            el = await page.query_selector(sel)
            if not el:
                continue
            txt = normalize_space(await el.inner_text())
        except Exception:
            continue
        if len(txt) >= min_desc_chars:
            desc = txt
            break
        if len(txt) > len(longest_seen):
            longest_seen = txt
    if not desc:
        desc = longest_seen

    # Optional: best-effort location detection. Take the first short element
    # under <main> whose text mentions a common location word.
    location_hints = ("amsterdam", "utrecht", "rotterdam", "the hague", "den haag", "hybrid", "netherlands", "nederland")
    location = ""
    try:
        candidates = await page.query_selector_all("main *")
        for c in candidates[:200]:  # cap the scan to keep it fast
            t = normalize_space(await c.inner_text())
            lowered = t.lower()
            if any(k in lowered for k in location_hints) and 5 < len(t) < 120:
                location = t
                break
    except Exception:
        pass

    return {
        "url": url,
        "title": title,
        "location_guess": location,
        "description": desc,
    }

async def collect_job_links(page) -> set[str]:
    """Return the unique link targets on ``page`` that contain ``/vacatures/``.

    The filtering runs in the browser: every ``a[href]`` element is mapped to
    its ``href`` property and only values containing ``/vacatures/`` (usually
    vacancy detail pages) are returned to Python.
    """
    hrefs = await page.eval_on_selector_all(
        "a[href]",
        """els => els
            .map(e => e.href)
            .filter(h => h && h.includes('/vacatures/'))""",
    )
    # De-duplicate by unpacking into a set literal.
    return {*hrefs}

async def click_next_if_exists(page) -> bool:
    """Click the first usable "next page" control and wait for the page to settle.

    Several selector variants are tried in order. Exactly one control is
    clicked per call: once a click has gone through, the function returns True
    even if the follow-up wait fails, so a slow page can never cause a second
    "next" candidate to be clicked (which would skip a listing page).

    Args:
        page: Playwright page object showing a listing page.

    Returns:
        True if a control was clicked, False if no selector produced a
        visible, enabled element that could be clicked.
    """
    # Several possible shapes of the "next" control.
    selectors = [
        "a[rel='next']",
        "a[aria-label*='Next' i]",
        "button[aria-label*='Next' i]",
        "a:has-text('Next')",
        "button:has-text('Next')",
        "a:has-text('Volgende')",
        "button:has-text('Volgende')",
        "a:has-text('>')",
    ]
    for sel in selectors:
        try:
            el = await page.query_selector(sel)
            # Skip hidden/disabled controls up front (e.g. a greyed-out "next"
            # on the last page) instead of letting click() wait on them.
            if not el or not await el.is_visible() or not await el.is_enabled():
                continue
            await el.click()
        except Exception:
            # Best effort: a selector that fails is skipped and the next one
            # is tried. `Exception` (not bare) keeps cancellation working.
            continue

        # The click succeeded; failures while waiting must not fall through
        # to the remaining selectors.
        try:
            await page.wait_for_load_state("networkidle")
            await page.wait_for_timeout(int(DELAY_SEC * 1000))
        except Exception:
            pass
        return True
    return False

async def main():
    """Crawl the vacancy listing, scrape each vacancy and save the keyword matches.

    Steps:
      1. Walk the paginated listing starting at START_URL (at most MAX_PAGES
         pages) and collect every same-domain link containing ``/vacatures/``.
      2. Open each collected link, extract its details and keep the rows whose
         title or description contains KEYWORD (case-insensitive).
      3. Write the matches to ``abnamro_data_analyst_jobs.csv`` and print a
         preview of the first 20 titles/URLs.

    Timeouts and other per-vacancy errors are reported and skipped. When
    nothing matches, a CSV containing only the header row is written.
    """
    domain = "werkenbijabnamro.nl"
    keyword = KEYWORD.lower()
    # Fixed schema so the DataFrame has these columns even with zero matches.
    columns = ["url", "title", "location_guess", "description"]
    rows = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="MarkusJobScraper/1.0 (+contact: youremail@example.com)"
            )
            page = await context.new_page()

            # 1) Crawl listing pages -> collect all detail-page links
            await page.goto(START_URL, wait_until="networkidle", timeout=60000)
            await page.wait_for_timeout(int(DELAY_SEC * 1000))

            all_links = set()
            for i in range(1, MAX_PAGES + 1):
                links = await collect_job_links(page)
                links = {u for u in links if same_domain(u, domain)}
                before = len(all_links)
                all_links |= links

                print(f"[Listing page {i}] got {len(links)} links, total unique {len(all_links)}")

                # Stop when there is no "next" control, or (heuristic) when a
                # page past the first added no new links.
                clicked = await click_next_if_exists(page)
                if not clicked:
                    break
                if len(all_links) == before and i >= 2:
                    # No new links: pagination is probably exhausted or looping.
                    break

            # 2) Open each job page -> extract details and filter on keyword
            detail_page = await context.new_page()
            for idx, url in enumerate(sorted(all_links), start=1):
                try:
                    data = await extract_job_detail(detail_page, url)

                    haystack = f"{data.get('title','')} {data.get('description','')}".lower()
                    if keyword in haystack:
                        print(f"✅ [{idx}/{len(all_links)}] MATCH: {data.get('title','(no title)')}")
                        rows.append(data)
                    else:
                        print(f"… [{idx}/{len(all_links)}] skip")
                except PlaywrightTimeoutError:
                    print(f"⚠️ timeout: {url}")
                except Exception as e:
                    print(f"⚠️ error: {url} -> {e}")
        finally:
            # Close the browser even if the crawl aborted half-way.
            await browser.close()

    # 3) Save
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("abnamro_data_analyst_jobs.csv", index=False)
    print("\nSaved: abnamro_data_analyst_jobs.csv")
    print(df[["title", "url"]].head(20))

if __name__ == "__main__":
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # Plain script: no event loop is running, so asyncio.run() can own one.
        asyncio.run(main())
    else:
        # Notebook kernels already run an event loop in this thread, where
        # asyncio.run() raises RuntimeError. Run the crawl on a fresh loop in
        # a worker thread and block until it has finished.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as _pool:
            _pool.submit(asyncio.run, main()).result()

In [0]:
%pip install -q requests beautifulsoup4 pandas lxml

In [0]:
%restart_python

In [0]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Site root (no trailing slash; paths are appended with f-strings) and the
# User-Agent string sent with every request.
BASE = "https://www.werkenbijabnamro.nl"
UA = "MarkusJobScraper/1.0 (+contact: youremail@example.com)"

# Default headers installed on the shared session below.
HEADERS = {
    "User-Agent": UA,
    "Accept-Language": "en-GB,en;q=0.9,nl;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# >>> TUNING PARAMS <<<
# Inclusive range of vacancy IDs to probe, and the pause (seconds) after each probe.
START_ID = 8300
END_ID   = 9500
SLEEP_SEC = 0.6

# stop early once there have been this many consecutive misses
MAX_CONSECUTIVE_MISSES = 250

# filter: title/description must contain at least one of these
# (compared against lower-cased text, so keep the entries lower-case)
KEYWORDS = [
    "data analyst",
    "senior data analyst",
    "analytics analyst",
    "bi analyst",
    "reporting analyst",
]

# One shared HTTP session so HEADERS apply to every request.
session = requests.Session()
session.headers.update(HEADERS)

def norm(s: str) -> str:
    """Collapse whitespace runs in ``s`` to single spaces and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def fetch_vacancy_by_id(vac_id: int) -> dict | None:
    """Fetch and parse the vacancy page for ``vac_id``.

    Requests ``/en/vacancy/<id>`` (without a slug) and follows redirects; the
    site is expected to redirect to ``/en/vacancy/<id>/<slug>``.

    Args:
        vac_id: Numeric vacancy ID to probe.

    Returns:
        None when the final response status is not 200. Otherwise a dict with
        keys ``vacancy_id``, ``url`` (final URL after redirects), ``title``
        (empty if the page has no ``<h1>``), ``sections_text`` and
        ``description_text``.

    Raises:
        Any exception raised by the HTTP request or the HTML parser.
    """
    url = f"{BASE}/en/vacancy/{vac_id}"
    r = session.get(url, allow_redirects=True, timeout=30)

    if r.status_code != 200:
        return None

    final_url = r.url
    soup = BeautifulSoup(r.text, "lxml")

    h1 = soup.find("h1")
    title = norm(h1.get_text(" ")) if h1 else ""

    # Prefer <main>; fall back to the whole document when it is missing.
    main = soup.find("main") or soup
    full_text = norm(main.get_text(" "))

    # Collect one "header: content" entry per h2/h3, if any exist.
    sections = []
    for hdr in main.find_all(["h2", "h3"]):
        header = norm(hdr.get_text(" "))
        if not header:
            continue

        parts = []
        captured = set()  # id()s of blocks already taken for this section
        for el in hdr.find_all_next():
            if el.name in ("h2", "h3"):
                break
            ancestors = list(el.parents)
            # find_all_next() keeps walking the rest of the document; stop at
            # the end of the container so the last section does not absorb
            # footer content.
            if not any(a is main for a in ancestors):
                break
            if el.name not in ("p", "ul", "ol"):
                continue
            # A block nested inside an already-captured block (e.g. a <p> in
            # a list item) is part of that block's text; skip the duplicate.
            if any(id(a) in captured for a in ancestors):
                continue
            captured.add(id(el))
            parts.append(norm(el.get_text(" ")))

        content = norm(" ".join(p for p in parts if p))
        # Keep only sections with a meaningful amount of text.
        if len(content) > 80:
            sections.append(f"{header}: {content}")

    return {
        "vacancy_id": vac_id,
        "url": final_url,
        "title": title,
        "sections_text": "\n\n".join(sections),
        "description_text": full_text,
    }

def is_relevant(rec: dict) -> bool:
    """Tell whether a vacancy record mentions any entry of KEYWORDS.

    The record's ``title`` and ``description_text`` (missing keys count as
    empty strings) are joined and lower-cased, then searched for each keyword
    as a plain substring.
    """
    title = rec.get('title', '')
    body = rec.get('description_text', '')
    hay = f"{title} {body}".lower()
    for needle in KEYWORDS:
        if needle in hay:
            return True
    return False

# Matching vacancy records, and the current streak of IDs without a usable page.
rows = []
misses = 0

for vac_id in range(START_ID, END_ID + 1):
    try:
        rec = fetch_vacancy_by_id(vac_id)
    except Exception:
        # treat error as miss (network hiccup)
        rec = None

    if rec is None or not rec.get("title"):
        misses += 1
        if misses % 50 == 0:
            print(f"… scanned up to {vac_id}, consecutive misses={misses}")
        if misses >= MAX_CONSECUTIVE_MISSES:
            print(f"Stopping early: {misses} consecutive misses (likely passed current ID range).")
            break
    else:
        # A valid page resets the streak.
        misses = 0
        if is_relevant(rec):
            print(f"✅ MATCH {vac_id}: {rec['title']}")
            rows.append(rec)
        else:
            print(f"skip {vac_id}: {rec['title']}")

    # Be polite: pause between requests.
    time.sleep(SLEEP_SEC)

# Explicit columns keep the schema stable, so the column selection below does
# not raise KeyError when no vacancy matched.
df = pd.DataFrame(
    rows,
    columns=["vacancy_id", "url", "title", "sections_text", "description_text"],
)
display(df[["vacancy_id","title","url"]])
print(f"Done. Matches: {len(df)}")


In [0]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

# =========================
# CONFIG
# =========================
# Site root (no trailing slash) and the User-Agent string sent with every request.
BASE = "https://www.werkenbijabnamro.nl"
UA = "MarkusJobScraper/1.0 (+contact: youremail@example.com)"

# Default headers installed on the shared session below.
HEADERS = {
    "User-Agent": UA,
    "Accept-Language": "en-GB,en;q=0.9,nl;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# Vacancy IDs (tune for speed; if you don't know, start with 8500–9500)
# The range is scanned inclusively on both ends.
START_ID = 8500
END_ID   = 9500

SLEEP_SEC = 0.35                 # be polite: seconds to pause after each scanned ID
MAX_CONSECUTIVE_MISSES = 200      # auto-stop if we've likely passed valid range

# Filter: Data Analyst / BI Analyst (EN + NL)
# Regular expressions searched case-insensitively in "<title> <description>";
# one hit is enough for a vacancy to count as relevant.
KEYWORD_PATTERNS = [
    r"\bdata analyst\b",
    r"\bsenior data analyst\b",
    r"\bbi analyst\b",
    r"\bbusiness intelligence analyst\b",
    r"\bdata analist\b",
    r"\bbi analist\b",
]

# Output paths
# NOTE(review): neither path is referenced anywhere else in this cell — confirm
# whether a later cell is meant to write to them.
OUT_CSV_DBFS = "/dbfs/tmp/abnamro_data_bi_analyst_jobs.csv"
OUT_DELTA_PATH = "/tmp/abnamro_data_bi_analyst_jobs"  # DBFS path for Delta

# =========================
# HELPERS
# =========================
# One shared HTTP session so HEADERS apply to every request.
session = requests.Session()
session.headers.update(HEADERS)

def norm(s: str) -> str:
    """Collapse whitespace runs in ``s`` to single spaces and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def is_relevant(title: str, description: str) -> bool:
    """Tell whether the vacancy text matches any pattern in KEYWORD_PATTERNS.

    ``title`` and ``description`` are joined with a single space and each
    pattern is searched in the result, ignoring case. The first hit decides.
    """
    text = f"{title} {description}"
    for pat in KEYWORD_PATTERNS:
        if re.search(pat, text, flags=re.I) is not None:
            return True
    return False

def fetch_vacancy(vac_id: int) -> dict | None:
    """Fetch and parse the vacancy page for ``vac_id``.

    Tries ``/en/vacancy/<id>`` (which often redirects to
    ``/en/vacancy/<id>/<slug>``).

    Args:
        vac_id: Numeric vacancy ID to probe.

    Returns:
        None when the final response status is not 200 or the page has no
        non-empty ``<h1>``. Otherwise a dict with keys ``vacancy_id``, ``url``
        (final URL after redirects), ``title``, ``location_guess``,
        ``sections_text`` and ``description_text``.

    Raises:
        Any exception raised by the HTTP request or the HTML parser.
    """
    url = f"{BASE}/en/vacancy/{vac_id}"
    r = session.get(url, allow_redirects=True, timeout=30)
    if r.status_code != 200:
        return None

    soup = BeautifulSoup(r.text, "lxml")
    # Prefer <main>; fall back to the whole document when it is missing.
    main = soup.find("main") or soup

    # title
    h1 = soup.find("h1")
    title = norm(h1.get_text(" ")) if h1 else ""
    if not title:
        return None

    # full text (for keyword/NLP)
    full_text = norm(main.get_text(" "))

    # structured sections (optional, nice for portfolio)
    sections = []
    for hdr in main.find_all(["h2", "h3"]):
        header = norm(hdr.get_text(" "))
        if not header:
            continue

        parts = []
        captured = set()  # id()s of blocks already taken for this section
        for el in hdr.find_all_next():
            if el.name in ("h2", "h3"):
                break
            ancestors = list(el.parents)
            # find_all_next() keeps walking the rest of the document; stop at
            # the end of the container so the last section does not absorb
            # footer content.
            if not any(a is main for a in ancestors):
                break
            if el.name not in ("p", "ul", "ol"):
                continue
            # A block nested inside an already-captured block (e.g. a <p> in
            # a list item) is part of that block's text; skip the duplicate.
            if any(id(a) in captured for a in ancestors):
                continue
            captured.add(id(el))
            parts.append(norm(el.get_text(" ")))

        content = norm(" ".join(p for p in parts if p))
        # Keep only sections with a meaningful amount of text.
        if len(content) >= 80:
            sections.append(f"{header}: {content}")

    # lightweight "location guess" (best effort): first known place name
    # found in the first 1000 characters of the page text
    location_guess = ""
    top_text = full_text[:1000].lower()
    for city in ["amsterdam", "utrecht", "rotterdam", "the hague", "den haag", "diemen", "amstelveen", "netherlands", "nederland", "hybrid"]:
        if city in top_text:
            location_guess = city.title()
            break

    return {
        "vacancy_id": vac_id,
        "url": r.url,  # final url after redirect
        "title": title,
        "location_guess": location_guess,
        "sections_text": "\n\n".join(sections),
        "description_text": full_text,
    }

# =========================
# RUN
# =========================
# rows: matching records; misses: current streak of IDs without a valid page;
# scanned: IDs probed so far; found_pages: IDs that returned a valid page.
rows, misses, scanned, found_pages = [], 0, 0, 0

print(f"Scanning vacancy IDs {START_ID}..{END_ID} for Data Analyst / BI Analyst ...")

for scanned, vac_id in enumerate(range(START_ID, END_ID + 1), start=1):
    # Any failure while fetching or parsing is handled like a missing page.
    try:
        rec = fetch_vacancy(vac_id)
    except Exception:
        rec = None

    if rec is not None:
        # Valid page: reset the miss streak and apply the keyword filter.
        found_pages += 1
        misses = 0

        matched = is_relevant(rec["title"], rec["description_text"])
        if not matched:
            print(f"skip {vac_id}: {rec['title']}")
        else:
            print(f"✅ MATCH {vac_id}: {rec['title']}")
            rows.append(rec)
    else:
        misses += 1
        if misses % 50 == 0:
            print(f"… up to ID {vac_id} | consecutive misses={misses} | total matches={len(rows)}")
        if misses >= MAX_CONSECUTIVE_MISSES:
            print(f"Stopping early: {misses} consecutive misses (likely passed active ID range).")
            break

    # Pause between requests.
    time.sleep(SLEEP_SEC)

df = pd.DataFrame(rows)

# Put the columns in a readable order (only possible when there are matches).
if rows:
    df = df[[
        "vacancy_id",
        "title",
        "location_guess",
        "url",
        "sections_text",
        "description_text",
    ]]

display(df)
print("\nDone.")
print(f"- Scanned IDs: {scanned}")
print(f"- Valid pages found: {found_pages}")
print(f"- Matches (Data/BI Analyst): {len(df)}")

In [0]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timezone

# =========================
# CONFIG
# =========================
# Site root (no trailing slash) and the User-Agent string sent with every request.
BASE = "https://www.werkenbijabnamro.nl"
UA = "MarkusJobScraper/1.0 (+contact: youremail@example.com)"

# Default headers installed on the shared session below.
HEADERS = {
    "User-Agent": UA,
    "Accept-Language": "en-GB,en;q=0.9,nl;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# Vacancy ID scan range (tune as needed); scanned inclusively on both ends.
START_ID = 8500
END_ID   = 9500

# Seconds to pause after each scanned ID, and the number of consecutive
# misses after which the scan stops early.
SLEEP_SEC = 0.35
MAX_CONSECUTIVE_MISSES = 200

# =========================
# FILTER: DATA ENGINEER (EN + NL)
# =========================
# Regular expressions searched case-insensitively in "<title> <description>";
# a single hit marks the vacancy as relevant.
# NOTE(review): the stack-keyword patterns (e.g. azure/aws/gcp, "platform
# engineer") match on their own, so any posting that merely mentions them is
# kept — confirm this breadth is intended.
KEYWORD_PATTERNS = [
    # core titles
    r"\bdata engineer\b",
    r"\bsenior data engineer\b",
    r"\blead data engineer\b",
    r"\bprincipal data engineer\b",
    r"\bdata engineering\b",
    r"\bdata platform engineer\b",
    r"\bplatform engineer\b",

    # NL variants
    r"\bdata engineer\b",            # same in NL
    r"\bdata engineer(s)?\b",
    r"\bdata[- ]engineer\b",

    # adjacent titles
    r"\banalytics engineer\b",
    r"\bdata ops\b|\bdataops\b",
    r"\bmlops\b",
    r"\bdevops\b.*\bdata\b",
    r"\bcloud engineer\b.*\bdata\b",

    # common stack keywords (to catch postings that don't say "data engineer" in title)
    r"\betl\b|\belt\b",
    r"\bdata pipeline(s)?\b",
    r"\bdata warehouse\b|\bdwh\b",
    r"\bdelta lake\b|\blakehouse\b",
    r"\bspark\b|\bpyspark\b",
    r"\bdatabricks\b",
    r"\bkafka\b",
    r"\bairflow\b",
    r"\bdbt\b",
    r"\bazure\b|\baws\b|\bgcp\b",
    r"\bsynapse\b|\bfabric\b",
    r"\bbigquery\b|\bsnowflake\b",
    r"\bterraform\b",
]

# Output folder (Unity Catalog Volume)
VOLUME_DIR = "/Volumes/skills_intelligence/00_job_posting_landing_zone/financial_sector/data_engineer"
# UTC date stamp (YYYYMMDD) appended directly to the file name, with no separator.
FILE_DATE = datetime.now(timezone.utc).strftime("%Y%m%d")
OUT_CSV_VOLUME = f"{VOLUME_DIR}/abnamro_data_engineer{FILE_DATE}.csv"

print("CSV target:", OUT_CSV_VOLUME)

# =========================
# HELPERS
# =========================
# One shared HTTP session so HEADERS apply to every request.
session = requests.Session()
session.headers.update(HEADERS)

def norm(s: str) -> str:
    """Collapse whitespace runs in ``s`` to single spaces and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def is_relevant(title: str, description: str) -> bool:
    """Tell whether the vacancy text matches any pattern in KEYWORD_PATTERNS.

    ``title`` and ``description`` are joined with a single space and each
    pattern is searched in the result, ignoring case. The first hit decides.
    """
    text = f"{title} {description}"
    for pat in KEYWORD_PATTERNS:
        if re.search(pat, text, flags=re.I) is not None:
            return True
    return False

def fetch_vacancy(vac_id: int) -> dict | None:
    """Fetch and parse the vacancy page for ``vac_id``.

    Tries ``/en/vacancy/<id>`` (which often redirects to
    ``/en/vacancy/<id>/<slug>``).

    Args:
        vac_id: Numeric vacancy ID to probe.

    Returns:
        None when the final response status is not 200 or the page has no
        non-empty ``<h1>``. Otherwise a dict with keys ``vacancy_id``, ``url``
        (final URL after redirects), ``title``, ``location_guess``,
        ``sections_text``, ``description_text`` and ``date_taken_utc`` (a
        timezone-aware UTC timestamp taken just before the request).

    Raises:
        Any exception raised by the HTTP request or the HTML parser.
    """
    date_taken_utc = datetime.now(timezone.utc)

    url = f"{BASE}/en/vacancy/{vac_id}"
    r = session.get(url, allow_redirects=True, timeout=30)
    if r.status_code != 200:
        return None

    soup = BeautifulSoup(r.text, "lxml")
    # Prefer <main>; fall back to the whole document when it is missing.
    main = soup.find("main") or soup

    # Title
    h1 = soup.find("h1")
    title = norm(h1.get_text(" ")) if h1 else ""
    if not title:
        return None

    # Full text
    full_text = norm(main.get_text(" "))

    # Structured sections
    sections = []
    for hdr in main.find_all(["h2", "h3"]):
        header = norm(hdr.get_text(" "))
        if not header:
            continue

        parts = []
        captured = set()  # id()s of blocks already taken for this section
        for el in hdr.find_all_next():
            if el.name in ("h2", "h3"):
                break
            ancestors = list(el.parents)
            # find_all_next() keeps walking the rest of the document; stop at
            # the end of the container so the last section does not absorb
            # footer content.
            if not any(a is main for a in ancestors):
                break
            if el.name not in ("p", "ul", "ol"):
                continue
            # A block nested inside an already-captured block (e.g. a <p> in
            # a list item) is part of that block's text; skip the duplicate.
            if any(id(a) in captured for a in ancestors):
                continue
            captured.add(id(el))
            parts.append(norm(el.get_text(" ")))

        content = norm(" ".join(p for p in parts if p))
        # Keep only sections with a meaningful amount of text.
        if len(content) >= 80:
            sections.append(f"{header}: {content}")

    # Best-effort location guess: first known place name found in the first
    # 1200 characters of the page text
    location_guess = ""
    top_text = full_text[:1200].lower()
    for city in [
        "amsterdam", "utrecht", "rotterdam", "the hague",
        "den haag", "diemen", "amstelveen",
        "netherlands", "nederland", "hybrid"
    ]:
        if city in top_text:
            location_guess = city.title()
            break

    return {
        "vacancy_id": vac_id,
        "url": r.url,
        "title": title,
        "location_guess": location_guess,
        "sections_text": "\n\n".join(sections),
        "description_text": full_text,
        "date_taken_utc": date_taken_utc,
    }

# =========================
# RUN SCRAPE
# =========================
# rows: matching records; misses: current streak of IDs without a valid page;
# scanned: IDs probed so far; valid_pages: IDs that returned a valid page.
rows = []
misses = 0
scanned = 0
valid_pages = 0

print(f"\nScanning vacancy IDs {START_ID}..{END_ID} for Data Engineer ...\n")

for vac_id in range(START_ID, END_ID + 1):
    scanned += 1
    try:
        rec = fetch_vacancy(vac_id)
    except Exception:
        # Request/parse failures are deliberately counted as misses.
        rec = None

    if rec is None:
        misses += 1
        if misses % 50 == 0:
            print(f"… up to ID {vac_id} | consecutive misses={misses} | matches={len(rows)}")
        if misses >= MAX_CONSECUTIVE_MISSES:
            print(f"Stopping early: {misses} consecutive misses (likely past active ID range).")
            break
    else:
        # A valid page resets the streak.
        valid_pages += 1
        misses = 0

        if is_relevant(rec["title"], rec["description_text"]):
            print(f"✅ MATCH {vac_id}: {rec['title']}")
            rows.append(rec)
        else:
            print(f"skip {vac_id}: {rec['title']}")

    # Pause between requests.
    time.sleep(SLEEP_SEC)

# Passing the column list fixes the column order and keeps the schema intact
# even when no vacancy matched.
df = pd.DataFrame(
    rows,
    columns=[
        "vacancy_id",
        "title",
        "location_guess",
        "url",
        "sections_text",
        "description_text",
        "date_taken_utc",
    ],
)

display(df)
print("\nDone.")
print(f"- Scanned IDs: {scanned}")
print(f"- Valid pages found: {valid_pages}")
print(f"- Matches (Data Engineer): {len(df)}")

# =========================
# SAVE CSV TO VOLUME
# =========================
if df.empty:
    print("No matches found — nothing to save.")
else:
    try:
        dbutils.fs.mkdirs(VOLUME_DIR)
    except Exception as exc:
        # Still best effort (the directory may already exist), but report the
        # problem: if the folder is really unusable, to_csv below will fail.
        print(f"⚠️ Could not create {VOLUME_DIR}: {exc}")

    df.to_csv(OUT_CSV_VOLUME, index=False)
    print("✅ Saved CSV to Volume:", OUT_CSV_VOLUME)
    display(dbutils.fs.ls(VOLUME_DIR))