In [0]:
import urllib.robotparser

robots_url = "https://careers.ing.com/robots.txt"
target_url = "https://careers.ing.com/en/job/"

# Handing the URL to the constructor sets it the same way set_url() would.
rp = urllib.robotparser.RobotFileParser(robots_url)
rp.read()  # download and parse robots.txt

# Ask whether the wildcard ("*") user agent may fetch the job-detail path.
is_allowed = rp.can_fetch("*", target_url)
print("Allowed?", is_allowed)

In [0]:
# Databricks Notebook — ING Job Scraper (Data Engineer)
# Output schema matches ABN AMRO:
# vacancy_id, title, location_guess, url, sections_text, description_text, date_taken_utc
# Save to: /Volumes/skills_intelligence/00_job_posting_landing_zone/financial_sector/data_engineer/ing_data_engineerYYYYMMDD.csv

%pip install -q requests beautifulsoup4 pandas lxml


In [0]:
%restart_python

In [0]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from urllib.parse import urljoin

# =========================
# CONFIG
# =========================
# First search-results (listing) page; the pagination loop starts here.
START_URL = "https://careers.ing.com/en/search-jobs/Netherlands/2618/2/2750405/52x25/5x75/50/2"

# User-Agent sent with every request made through the shared session.
# NOTE(review): the contact address looks like a placeholder — replace it with
# a real one before running against the live site.
UA = "MarkusJobScraper/1.0 (+contact: youremail@example.com)"
HEADERS = {
    "User-Agent": UA,
    "Accept-Language": "en-GB,en;q=0.9,nl;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# Pause, in seconds, between consecutive HTTP requests (listing and job pages).
SLEEP_SEC = 0.35
# Upper bound on the number of listing pages the pagination loop will visit.
MAX_LISTING_PAGES = 40

# =========================
# FILTER: DATA ENGINEER (EN + NL)
# =========================
# Every entry is a regular expression; is_relevant() searches each one
# case-insensitively in "<title> <description>" and accepts on the first hit.

# Core job-title phrases.
_TITLE_PATTERNS = [
    r"\bdata engineer\b",
    r"\bsenior data engineer\b",
    r"\blead data engineer\b",
    r"\bprincipal data engineer\b",
    r"\bdata engineering\b",
    r"\banalytics engineer\b",
    r"\bdata platform engineer\b",
]

# NL variants (hyphenated or spaced spelling).
_NL_PATTERNS = [
    r"\bdata[- ]engineer\b",
]

# Stack & responsibilities (ING-style postings).
_STACK_PATTERNS = [
    r"\betl\b|\belt\b",
    r"\bdata pipeline(s)?\b",
    r"\bdata platform\b",
    r"\blakehouse\b|\bdelta lake\b",
    r"\bspark\b|\bpyspark\b",
    r"\bdatabricks\b",
    r"\bkafka\b",
    r"\bairflow\b",
    r"\bdbt\b",
    r"\bazure\b|\bsynapse\b|\bfabric\b",
    r"\bsnowflake\b|\bbigquery\b",
]

# Flat list, in the order: titles, NL variants, stack keywords.
KEYWORD_PATTERNS = [*_TITLE_PATTERNS, *_NL_PATTERNS, *_STACK_PATTERNS]

# =========================
# OUTPUT (Unity Catalog Volume)
# =========================
# Target directory for the CSV produced by this run.
VOLUME_DIR = "/Volumes/skills_intelligence/00_job_posting_landing_zone/financial_sector/data_engineer"

# Current UTC date as YYYYMMDD; it becomes the suffix of the file name.
_run_started_utc = datetime.now(timezone.utc)
FILE_DATE = f"{_run_started_utc:%Y%m%d}"

OUT_CSV_VOLUME = f"{VOLUME_DIR}/ing_data_engineer{FILE_DATE}.csv"

print("CSV target:", OUT_CSV_VOLUME)

# =========================
# SESSION
# =========================
# One module-level session used by get_html(); HEADERS are installed as the
# session's default headers so callers do not pass them on every request.
session = requests.Session()
session.headers.update(HEADERS)

def get_html(url: str, timeout=30) -> str:
    """Fetch ``url`` through the shared ``session`` and return the body as text.

    Args:
        url: URL to request.
        timeout: Forwarded to ``session.get`` as its ``timeout`` (default 30).

    Returns:
        The response body as exposed by ``response.text``.

    Raises:
        Whatever ``session.get`` raises, plus the error raised by
        ``raise_for_status()`` when the response status signals a failure.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def norm(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim the ends.

    A falsy ``s`` (``None`` or ``""``) is treated as the empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def is_relevant(title: str, description: str) -> bool:
    """Tell whether a vacancy matches any of ``KEYWORD_PATTERNS``.

    The title and description are joined with a single space and each pattern
    is searched case-insensitively; the first hit decides.

    Returns:
        ``True`` if at least one pattern matches, otherwise ``False``.
    """
    haystack = f"{title} {description}"
    for pattern in KEYWORD_PATTERNS:
        if re.search(pattern, haystack, flags=re.I):
            return True
    return False

def extract_ing_job_id(job_url: str) -> str:
    """Extract the numeric vacancy id from an ING job URL.

    Two URL layouts are handled:

    1. ``.../en/job/<digits>...`` — the digits right after ``/en/job/`` are
       returned.
    2. ``.../en/job/<slug>/.../<digits>`` — when slug segments come before the
       id, the last all-digit path segment after ``/en/job/`` is returned.
       NOTE(review): assumes the vacancy id is the final numeric segment in
       such URLs — confirm against live job links.

    Args:
        job_url: Absolute or relative job URL. A falsy value yields ``""``.

    Returns:
        The id as a string, or ``""`` when no id can be found.
    """
    if not job_url:
        return ""

    # Layout 1: id directly follows the /en/job/ prefix.
    direct = re.search(r"/en/job/(\d+)", job_url)
    if direct:
        return direct.group(1)

    # Layout 2: ignore query string / fragment, then look at path segments
    # that come after the /en/job/ marker.
    path = re.split(r"[?#]", job_url, maxsplit=1)[0]
    _, marker, tail = path.partition("/en/job/")
    if not marker:
        return ""

    numeric_segments = [seg for seg in tail.split("/") if re.fullmatch(r"\d+", seg)]
    return numeric_segments[-1] if numeric_segments else ""

def extract_job_links_and_next(listing_url: str):
    """Download one listing page and pull out job links plus the next-page URL.

    Args:
        listing_url: URL of the listing page; also the base used to resolve
            relative hrefs.

    Returns:
        A ``(links, next_url)`` tuple. ``links`` is a set of absolute URLs for
        every anchor whose href contains ``/en/job/``. ``next_url`` is the
        absolute URL of the first anchor whose normalised text equals "next"
        (case-insensitive), or ``None`` when there is no such anchor.
    """
    page = BeautifulSoup(get_html(listing_url), "lxml")
    anchors = page.find_all("a", href=True)

    job_links = {
        urljoin(listing_url, anchor["href"])
        for anchor in anchors
        if "/en/job/" in anchor["href"]
    }

    # First anchor (in document order) labelled exactly "next".
    next_href = next(
        (
            anchor["href"]
            for anchor in anchors
            if norm(anchor.get_text(" ")).lower() == "next"
        ),
        None,
    )
    next_page = urljoin(listing_url, next_href) if next_href is not None else None

    return job_links, next_page

def parse_sections(main: BeautifulSoup) -> str:
    """Build a "Header: content" text block for each h2/h3 section in ``main``.

    For every non-empty ``h2``/``h3`` inside ``main`` the text of the following
    ``p``/``ul``/``ol`` elements is gathered until the next ``h2``/``h3`` or
    the end of ``main``. Sections whose content is shorter than 80 characters
    are dropped.

    Two guards keep the content clean:

    * An element nested inside one that was already collected (e.g. a ``p``
      inside a ``ul``) is skipped, because the outer element's text already
      contains it.
    * Collection stops at the first element outside ``main``, so text that
      follows the container is not attached to its last section.

    Args:
        main: The parsed container to scan (a tag or the whole document).

    Returns:
        The kept sections joined by blank lines, or ``""`` if none qualify.
    """
    # Identity set of every tag that lives inside `main`; find_all_next() walks
    # the whole remaining document, so this is how we notice leaving `main`.
    inside_main = {id(tag) for tag in main.find_all(True)}

    sections = []
    for hdr in main.find_all(["h2", "h3"]):
        header = norm(hdr.get_text(" "))
        if not header:
            continue

        parts = []
        collected_ids = set()
        for el in hdr.find_all_next():
            if id(el) not in inside_main:
                break
            if el.name in ["h2", "h3"]:
                break
            if el.name in ["p", "ul", "ol"]:
                # Skip descendants of something we already took the text of.
                if any(id(parent) in collected_ids for parent in el.parents):
                    continue
                collected_ids.add(id(el))
                parts.append(norm(el.get_text(" ")))

        content = norm(" ".join(parts))
        if len(content) >= 80:
            sections.append(f"{header}: {content}")

    return "\n\n".join(sections)

def parse_location_guess(full_text: str) -> str:
    """Guess a location by looking for known place keywords in ``full_text``.

    The keywords are tried in a fixed priority order against the lower-cased
    text (plain substring match); the first one present wins and is returned
    title-cased. Returns ``""`` when none of them occurs.
    """
    candidates = (
        "amsterdam", "utrecht", "rotterdam",
        "the hague", "den haag", "eindhoven",
        "netherlands", "nederland", "hybrid",
    )
    haystack = full_text.lower()
    hit = next((name for name in candidates if name in haystack), None)
    return hit.title() if hit is not None else ""

def fetch_vacancy(job_url: str) -> dict | None:
    """Download one job page and turn it into an output record.

    Args:
        job_url: URL of the job detail page.

    Returns:
        A dict with the keys ``vacancy_id``, ``title``, ``location_guess``,
        ``url``, ``sections_text``, ``description_text`` and
        ``date_taken_utc``; or ``None`` when the page has no usable ``<h1>``
        title. ``date_taken_utc`` is the UTC time captured just before the
        request is made.
    """
    fetched_at = datetime.now(timezone.utc)

    page = BeautifulSoup(get_html(job_url), "lxml")

    heading = page.find("h1")
    if not heading:
        return None
    job_title = norm(heading.get_text(" "))
    if not job_title:
        return None

    # Prefer the <main> element; fall back to the whole document.
    body = page.find("main") or page
    full_text = norm(body.get_text(" "))

    return dict(
        vacancy_id=extract_ing_job_id(job_url),
        title=job_title,
        location_guess=parse_location_guess(full_text),
        url=job_url,
        sections_text=parse_sections(body),
        description_text=full_text,
        date_taken_utc=fetched_at,
    )

# =========================
# 1) COLLECT JOB URLs
# =========================
# Walk the listing pages from START_URL, following "next" links until there is
# none, the link points back at the current page, or MAX_LISTING_PAGES is hit.
all_job_urls = set()
url = START_URL

for page_no in range(1, MAX_LISTING_PAGES + 1):
    links, next_url = extract_job_links_and_next(url)
    all_job_urls.update(links)

    print(f"[Listing page {page_no}] collected {len(links)} | total {len(all_job_urls)}")

    if next_url and next_url != url:
        # Move on to the next page, pausing between requests.
        url = next_url
        time.sleep(SLEEP_SEC)
    else:
        break

print(f"\nTotal ING job URLs collected: {len(all_job_urls)}")

# =========================
# 2) SCRAPE + FILTER
# =========================
# Column order of the output; also used to give an empty result the same
# schema as a non-empty one.
OUTPUT_COLUMNS = [
    "vacancy_id",
    "title",
    "location_guess",
    "url",
    "sections_text",
    "description_text",
    "date_taken_utc",
]

rows = []
# Sorted so that the [i/total] index in the log maps to a stable URL order.
for i, job_url in enumerate(sorted(all_job_urls), start=1):
    try:
        rec = fetch_vacancy(job_url)
        if rec and is_relevant(rec["title"], rec["description_text"]):
            print(f"✅ MATCH [{i}/{len(all_job_urls)}] {rec['title']}")
            rows.append(rec)
        else:
            print(f"skip  [{i}/{len(all_job_urls)}]")
    except Exception as e:
        # Best effort: one failing page is logged and must not abort the run.
        print(f"⚠️ error [{i}] {e}")

    # Pause between requests, also after a failure.
    time.sleep(SLEEP_SEC)

# Passing columns= fixes the column order and keeps the schema when rows == [].
df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)

display(df)
print("Matches (Data Engineer):", len(df))

# =========================
# 3) SAVE CSV
# =========================
if df.empty:
    print("No matches found — nothing to save.")
else:
    try:
        dbutils.fs.mkdirs(VOLUME_DIR)
    except Exception as exc:
        # Still best effort, but report the failure instead of hiding it;
        # if the directory is really unusable, to_csv below raises.
        print(f"⚠️ could not create {VOLUME_DIR}: {exc}")

    df.to_csv(OUT_CSV_VOLUME, index=False)
    print("✅ Saved CSV to Volume:", OUT_CSV_VOLUME)
    display(dbutils.fs.ls(VOLUME_DIR))
