# GR_AdviesScraper

Doel: adviezen van de Gezondheidsraad (GR) verzamelen die van 2019 tot en met 2024 zijn uitgebracht.

Toepassing: Bachelor-scriptie Informatiekunde, Universiteit van Amsterdam

Contact: mitchell.malaihollo@student.uva.nl

Metadataformattering volgens de [Woogle metadata specificatie](https://github.com/wooverheid/WoogleDocumentatie/tree/master/SPEC%20MetadataSchema)

In [1]:
import time
from urllib.parse import urljoin
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from openai import OpenAI
from dotenv import load_dotenv
import json
from selenium.common.exceptions import NoSuchElementException
from urllib.parse import urljoin, urlparse
import re

In [2]:
# Keyword sets used to classify a search result as an advice.
# First word of the URL slug (text before the first "-") that marks an advice.
HREF_ID_WOORDEN = {"briefadvies", "wetsadvies", "advies", "adviesrapport", "adviesproject"}
# First word of the lower-cased title that marks an advice.
ID_WOORDEN = {"briefadvies", "advies:", "advies", "adviesrapport", "adviesproject", "wetsadvies", "adviesvraag"}
# First title words that disqualify a result in the metadata check.
NIET_ID_WOORDEN = {"samenvatting", "aanbiedingsbrief"}
# Slug first-words that disqualify a result in the metadata check; currently
# none. ``{}`` would create an empty dict, so ``set()`` keeps this a set like
# its siblings.
NIET_HREF_ID_WOORDEN = set()

# User-Agent header passed to the browser; it identifies the scraper and
# gives a contact address.
# NOTE(review): the string advertises Firefox while the driver that uses it
# is Chrome - confirm this is intended.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0)",
        "Gecko/20100101 Firefox/124.0",
        "AdviescollegeScraper (+mailto:14605120@student.uva.nl;",
        "voor BSc-scriptie Informatiekunde UvA;",
        "doel: informatie inwinnen over verschillende adviescolleges)",
    )
)

# Document search URL with URL-encoded filters: publication_date between
# 2019-01-01T00:00:00+01:00 and 2024-12-31T23:59:59+01:00, and
# information_type equal to "Advies".
BASE_URL = "https://www.gezondheidsraad.nl/documenten?" + "&".join(
    (
        "filters%5B0%5D%5Bfield%5D=publication_date",
        "filters%5B0%5D%5Bvalues%5D%5B0%5D%5Bfrom%5D=2019-01-01T00%3A00%3A00%2B01%3A00",
        "filters%5B0%5D%5Bvalues%5D%5B0%5D%5Bname%5D=specificPeriodFrom",
        "filters%5B0%5D%5Bvalues%5D%5B1%5D%5Bto%5D=2024-12-31T23%3A59%3A59%2B01%3A00",
        "filters%5B0%5D%5Bvalues%5D%5B1%5D%5Bname%5D=specificPeriodTo",
        "filters%5B0%5D%5Btype%5D=all",
        "filters%5B1%5D%5Bfield%5D=information_type",
        "filters%5B1%5D%5Bvalues%5D%5B0%5D=Advies",
        "filters%5B1%5D%5Btype%5D=all",
        "filters%5B1%5D%5Bpersistent%5D=false",
    )
)

# Read the OpenAI API key from a local .env file instead of hard-coding it
# in the notebook.
env_path = r"C:\Users\31633\OneDrive\Bureaublad\OpenAI_API_Key.env"
load_dotenv(env_path)
api_key = os.environ.get("OPENAI_API_KEY")
# Report only whether a key was found; the key itself is never printed.
print(f"Key geladen: {api_key is not None}")

Key geladen: True


In [3]:
def beginSelenium():
    """Start a headless Chrome WebDriver configured for this scraper.

    Chrome is launched with the new headless mode, without sandbox, without
    /dev/shm usage and with the module-level ``USER_AGENT``. The chromedriver
    binary is obtained through ``ChromeDriverManager().install()``.

    Returns:
        The created ``webdriver.Chrome`` instance. This function does not
        close it; the caller has to call ``quit()``.
    """
    chrome_opts = Options()
    for vlag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        f"user-agent={USER_AGENT}",
    ):
        chrome_opts.add_argument(vlag)

    chromedriver = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver, options=chrome_opts)

def aantalPaginas(driver):
    """Return the page count read from the pagination of the results page.

    The screen-reader label (``span.sr-only``) of the button in the seventh
    item of the pagination list is read and every non-digit is removed.
    NOTE(review): presumably the seventh item is the "last page" button -
    confirm against the live page, the position is hard-coded.

    Args:
        driver: WebDriver that already has the search results page loaded.

    Returns:
        The digits of the label as an ``int``.

    Raises:
        NoSuchElementException: propagated from ``find_element`` when the
            pagination button is not present.
        ValueError: when the label contains no digits at all.
    """
    label_element = driver.find_element(
        By.CSS_SELECTOR,
        "#main-content > div.search-block > div.search-block__results > nav > ul > li:nth-child(7) > button > span.sr-only"
    )
    label = label_element.text.strip()
    cijfers = re.sub(r"\D", "", label)
    if not cijfers:
        # int("") would raise a ValueError with an unhelpful message; keep the
        # exception type but say what was actually read from the page.
        raise ValueError(
            f"Geen paginanummer gevonden in paginatie-label: {label!r}"
        )
    return int(cijfers)

def _is_advies(title_text, meta_teksten, href):
    """Decide whether one search result should be collected as an advice.

    Args:
        title_text: Stripped, lower-cased title of the result.
        meta_teksten: Stripped, lower-cased texts of the result's metadata
            spans, in page order; the first one is treated as the document
            type.
        href: Link target of the result.

    Returns:
        True when any of the three checks (metadata, URL slug, title) matches.
    """
    eerste_woord_titel = title_text.split()[0] if title_text else ""
    meta_woorden = " ".join(tekst for tekst in meta_teksten if tekst)
    doc_type = meta_teksten[0] if meta_teksten else ""
    alle_woorden = f"{title_text} {meta_woorden}".strip()

    # First word of the last path segment of the link, e.g. "advies" for
    # ".../advies-koolmonoxide".
    slug = urlparse(href).path.strip("/").split("/")[-1]
    eerste_woord = slug.split("-")[0].lower()

    metadata_match = (
        "advies" in doc_type
        and eerste_woord_titel not in NIET_ID_WOORDEN
        and eerste_woord not in NIET_HREF_ID_WOORDEN
        and "achtergronddocument" not in alle_woorden
        and "podcast" not in alle_woorden
    )
    return (
        metadata_match
        or eerste_woord in HREF_ID_WOORDEN
        or eerste_woord_titel in ID_WOORDEN
    )


# Collected advice links in order of discovery, without duplicates.
adviesURLS = []
_gezien = set()

driver = beginSelenium()
try:
    driver.get(BASE_URL)
    time.sleep(2)

    aantalPagina = aantalPaginas(driver)
    print("Maximaal aantal pagina's om te controleren: " + str(aantalPagina))
    print("-----------------------")

    for paginaNummer in range(1, aantalPagina + 1):
        # The first page is the bare search URL; later pages are selected
        # with the "current" query parameter.
        if paginaNummer == 1:
            BASE_paginaURL = BASE_URL
        else:
            BASE_paginaURL = f"{BASE_URL}&current=n_{paginaNummer}_n"

        print("Pagina: " + str(paginaNummer))
        print(BASE_paginaURL)

        driver.get(BASE_paginaURL)
        time.sleep(2)

        li_items = driver.find_elements(
            By.CSS_SELECTOR,
            "ul.sui-results-container.links-list li.links-list__item"
        )
        count = 0
        for li in li_items:
            href = li.find_element(By.CSS_SELECTOR, "a.link-plus").get_attribute("href")
            if not href:
                # Without a link target there is nothing to collect.
                continue

            title_text = li.find_element(
                By.CSS_SELECTOR, "h3.link-plus__title"
            ).text.strip().lower()

            # Read every span's text once; the texts are reused for the
            # document type and for the keyword search.
            meta_teksten = [
                span.text.strip().lower()
                for span in li.find_elements(
                    By.CSS_SELECTOR,
                    ".meta-data.link-plus__meta-data span"
                )
            ]

            if not _is_advies(title_text, meta_teksten, href):
                continue

            # The per-page count still reflects every match on the page; only
            # the collected list is de-duplicated.
            count += 1
            if href not in _gezien:
                _gezien.add(href)
                adviesURLS.append(href)

        time.sleep(5)
        print("aantal adviezen op de pagina: " + str(count))
        print("-------------------")
finally:
    # Always close the browser, also when a page fails to load or parse.
    driver.quit()

Maximaal aantal pagina's om te controleren: 27
-----------------------
Pagina: 1
https://www.gezondheidsraad.nl/documenten?filters%5B0%5D%5Bfield%5D=publication_date&filters%5B0%5D%5Bvalues%5D%5B0%5D%5Bfrom%5D=2019-01-01T00%3A00%3A00%2B01%3A00&filters%5B0%5D%5Bvalues%5D%5B0%5D%5Bname%5D=specificPeriodFrom&filters%5B0%5D%5Bvalues%5D%5B1%5D%5Bto%5D=2024-12-31T23%3A59%3A59%2B01%3A00&filters%5B0%5D%5Bvalues%5D%5B1%5D%5Bname%5D=specificPeriodTo&filters%5B0%5D%5Btype%5D=all&filters%5B1%5D%5Bfield%5D=information_type&filters%5B1%5D%5Bvalues%5D%5B0%5D=Advies&filters%5B1%5D%5Btype%5D=all&filters%5B1%5D%5Bpersistent%5D=false
aantal adviezen op de pagina: 7
-------------------
Pagina: 2
https://www.gezondheidsraad.nl/documenten?filters%5B0%5D%5Bfield%5D=publication_date&filters%5B0%5D%5Bvalues%5D%5B0%5D%5Bfrom%5D=2019-01-01T00%3A00%3A00%2B01%3A00&filters%5B0%5D%5Bvalues%5D%5B0%5D%5Bname%5D=specificPeriodFrom&filters%5B0%5D%5Bvalues%5D%5B1%5D%5Bto%5D=2024-12-31T23%3A59%3A59%2B01%3A00&filters%5B0%5

In [4]:
# List every collected link, one per line, followed by the total.
print("\nAlle unieke adviezen links:")
if adviesURLS:
    print(*adviesURLS, sep="\n")

print("\nTotaal aantal unieke adviezen-links:", len(adviesURLS))


Alle unieke adviezen links:
https://www.gezondheidsraad.nl/documenten/2024/12/19/advies-rijgeschiktheid-na-het-doormaken-van-een-psychose
https://www.gezondheidsraad.nl/documenten/2024/12/17/advies-inzet-van-vernieuwde-typen-griepvaccins-in-het-npg
https://www.gezondheidsraad.nl/documenten/2024/12/09/advies-zicht-op-gehoorverlies-bij-kinderen-en-jongeren
https://www.gezondheidsraad.nl/documenten/2024/12/03/advies-aanpassing-kwalificatie-voedingsnorm-voor-selenium
https://www.gezondheidsraad.nl/documenten/2024/12/03/advies-voedingsnormen-voor-vitamines-en-mineralen-voor-lacterende-vrouwen
https://www.gezondheidsraad.nl/documenten/2024/12/03/evaluatie-voedingsnormen-voor-vitamines-en-mineralen-voor-lacterende-vrouwen
https://www.gezondheidsraad.nl/documenten/2024/09/30/advies-vaccinatie-werknemers-kinkhoest
https://www.gezondheidsraad.nl/documenten/2024/09/10/advies-respirabel-kristallijn-silica
https://www.gezondheidsraad.nl/documenten/2024/09/03/advies-koolmonoxide
https://www.gezondh

In [5]:
# Ask the model for title, publication date and description of every advice
# page and store the answers, extended with fixed source/publisher/type
# fields, in GR_adviesScraper_data1.json.
res = []
# A single client serves the whole run; there is no need to build one per URL.
client = OpenAI()

for url in adviesURLS:
    prompt = f"""
    Lees dit document: {url}

    Vul de onderstaande JSON in volgens https://github.com/wooverheid/WoogleDocumentatie/tree/master/SPEC%20MetadataSchema:

    {{
        "dc_title": "",
        "foi_publishedDate": "",
        "dc_description": ""
    }}

    Regels:
    - Alleen écht gevonden info invullen.
    - Max 100 woorden in dc_description.
    - Ontbreekt iets? dan "" laten.
    - datum in jjjj-mm-dd
    - Output = alleen geldige JSON niks anders.
    """
    response = client.responses.create(
        model="gpt-5.1",
        input=prompt,
        tools=[{"type": "web_search"}],
        max_output_tokens=800,
    )

    json_str = response.output_text.strip()

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError:
        data = None

    if isinstance(data, dict):
        # Fields that do not come from the model.
        data["dc_source"] = url
        data["dc_publisher"] = "oorg10218"
        data["dc_type"] = "2e"

        res.append(data)

        print(json.dumps(data, ensure_ascii=False, indent=2))
    else:
        # Covers both invalid JSON and valid JSON that is not an object
        # (which cannot take the extra keys above).
        print("Kon JSON niet parsen voor URL:", url)
        print("Ruwe output was:\n", json_str)

    # Pause between requests, also after a failed one, so the request rate
    # stays the same regardless of the outcome.
    time.sleep(12)

with open("GR_adviesScraper_data1.json", "w", encoding="utf-8") as f:
    json.dump(res, f, ensure_ascii=False, indent=2)

print("\nAlle JSON-data opgeslagen in GR_adviesScraper_data1.json")

{
  "dc_title": "Advies Rijgeschiktheid na het doormaken van een psychose",
  "foi_publishedDate": "2024-12-19",
  "dc_description": "Adviesdocument van de Gezondheidsraad over rijgeschiktheid na het doormaken van een psychose.",
  "dc_source": "https://www.gezondheidsraad.nl/documenten/2024/12/19/advies-rijgeschiktheid-na-het-doormaken-van-een-psychose",
  "dc_publisher": "oorg10218",
  "dc_type": "2e"
}
{
  "dc_title": "Advies Inzet van vernieuwde typen griepvaccins in het NPG",
  "foi_publishedDate": "2024-12-17",
  "dc_description": "",
  "dc_source": "https://www.gezondheidsraad.nl/documenten/2024/12/17/advies-inzet-van-vernieuwde-typen-griepvaccins-in-het-npg",
  "dc_publisher": "oorg10218",
  "dc_type": "2e"
}
{
  "dc_title": "Advies Zicht op gehoorverlies bij kinderen en jongeren",
  "foi_publishedDate": "2024-12-09",
  "dc_description": "Adviesdocument van de Gezondheidsraad over zicht op gehoorverlies bij kinderen en jongeren. Het advies, oorspronkelijk gepubliceerd op 9 dece

In [6]:
# Read the stored metadata back and report how many records it holds.
with open("GR_adviesScraper_data1.json", encoding="utf-8") as bron:
    data = json.loads(bron.read())

print(f"Aantal items in JSON bestand: {len(data)}")

Aantal items in JSON bestand: 188


In [7]:
# Input for the enrichment step: the records stored in
# GR_adviesScraper_data1.json.
with open("GR_adviesScraper_data1.json", encoding="utf-8") as invoer:
    data = json.loads(invoer.read())

# Receives one enriched record per input record.
res2 = []

def retrieveLinksPagina2(driver, urltje):
    """Collect the titled links offered on one advice page.

    Three places are read, in this order: the download list, the link list
    inside ``#bijlagen`` and, when present, the call-to-action button. A
    later entry with the same title replaces an earlier one.

    Args:
        driver: WebDriver used to open the page.
        urltje: URL of the advice page.

    Returns:
        Dict mapping the link title to its ``href``.
    """
    driver.get(urltje)

    # Each source: (list items, selector of the title, selector of the link).
    bronnen = (
        (
            driver.find_elements(By.CSS_SELECTOR, "ul.download-list li.download-list__item"),
            "span.title",
            "a.download-list__link",
        ),
        (
            driver.find_elements(By.CSS_SELECTOR, "#bijlagen ul.links-list li.links-list__item"),
            "span.link-plus__title",
            "a.link-plus",
        ),
    )

    gevonden = {}
    for elementen, titel_selector, link_selector in bronnen:
        for element in elementen:
            titel = element.find_element(By.CSS_SELECTOR, titel_selector).text.strip()
            doel = element.find_element(By.CSS_SELECTOR, link_selector).get_attribute("href")
            gevonden[titel] = doel

    # The button is optional; a page without one is not an error.
    try:
        cta = driver.find_element(By.CSS_SELECTOR, "a.button--cta")
        gevonden[cta.text.strip()] = cta.get_attribute("href")
    except NoSuchElementException:
        pass

    return gevonden

# Visit every advice page, attach the links found there as "foi_files" and
# store the enriched records in GR_adviesScraper_data2.json.
driver = beginSelenium()
try:
    for item in data:
        dc_source2 = item.get("dc_source")

        urltjes = retrieveLinksPagina2(driver, dc_source2)

        # Links without a target are left out.
        foi_files = [
            {"foi_title": key, "foi_source": value}
            for key, value in urltjes.items()
            if value
        ]

        res2.append({
            "dc_title": item.get("dc_title"),
            "foi_publishedDate": item.get("foi_publishedDate"),
            "dc_description": item.get("dc_description"),
            "dc_source": dc_source2,
            "dc_publisher": item.get("dc_publisher"),
            "dc_type": item.get("dc_type"),
            "foi_files": foi_files
        })

        time.sleep(5)
finally:
    # Close the browser even when one of the pages raises.
    driver.quit()

with open("GR_adviesScraper_data2.json", "w", encoding="utf-8") as f:
    json.dump(res2, f, ensure_ascii=False, indent=2)

# The message names the file that was actually written above.
print("\nAlle JSON-data opgeslagen in GR_adviesScraper_data2.json")


Alle JSON-data opgeslagen in ROB_adviesScraper_data2.json


In [8]:
# Read the enriched records back and report how many there are.
with open("GR_adviesScraper_data2.json", encoding="utf-8") as bron:
    data = json.loads(bron.read())

print(f"Aantal items in JSON bestand: {len(data)}")

Aantal items in JSON bestand: 188
