# MER_AdviesScraper

Doel: MER adviezen uitgebracht vanaf 2019 tot en met 2024 verzamelen.

Toepassing: Bachelor-scriptie Informatiekunde, Universiteit van Amsterdam

Contact: mitchell.malaihollo@student.uva.nl

Metadata Formatering volgens de [Woogle metadata specificatie](https://github.com/wooverheid/WoogleDocumentatie/tree/master/SPEC%20MetadataSchema) 

In [2]:
from openai import OpenAI
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from dotenv import load_dotenv
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
BASE_URL = "https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n={start}"

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) "
    "Gecko/20100101 Firefox/124.0 "
    "AdviescollegeScraper (+mailto:14605120@student.uva.nl; "
    "voor BSc-scriptie Informatiekunde UvA; "
    "doel: informatie inwinnen over verschillende adviescolleges)"
)

env_path = r"C:\Users\31633\OneDrive\Bureaublad\OpenAI_API_Key.env"  
load_dotenv(env_path)
api_key = os.getenv("OPENAI_API_KEY")
print("Key geladen:", api_key is not None)

Key geladen: True


In [4]:
def beginSelenium():
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )
    return driver

def retrieveLinksPagina(driver, pagina_start):
    """Haalt alle adviezen op van één pagina (n = pagina_start)."""
    url = BASE_URL.format(start=pagina_start)
    print(f"n={pagina_start}: {url}")

    driver.get(url)
    time.sleep(2)

    anchors = driver.find_elements(By.TAG_NAME, "a")
    links_op_pagina = set()

    for a in anchors:
        href = a.get_attribute("href")
        if href:
            volledige_url = urljoin(url, href)
            if "https://www.commissiemer.nl/adviezen/" in volledige_url:
                links_op_pagina.add(volledige_url)
        time.sleep(5)

    print(f" -> gevonden {len(links_op_pagina)} adviezen op deze pagina.")
    return links_op_pagina

if __name__ == "__main__":
    adviezen_links = set()

    driver = beginSelenium()

    try:
        for x in range(1, 551, 10):
            links_op_pagina = retrieveLinksPagina(driver, x)
            adviezen_links.update(links_op_pagina)
            time.sleep(5)

    finally:
        driver.quit()

n=1: https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n=1
 -> gevonden 10 adviezen op deze pagina.
n=11: https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n=11
 -> gevonden 10 adviezen op deze pagina.
n=21: https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n=21
 -> gevonden 10 adviezen op deze pagina.
n=31: https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n=31
 -> gevonden 10 adviezen op deze pagina.
n=41: https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n=41
 -> gevonden 10 adviezen op deze pagina.
n=51: https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n=51
 -> gevonden 10 adviezen op deze pagina.
n=61: https://www.commissiemer.nl/adviezen?it=fp&van=01-01-2019&tot=31-12-2024&status=afgerond&n=61
 -> gevonden 10 adviezen op deze pagina.
n=71: https://w

In [None]:
print("\nAlle unieke adviezen links:")
for link in sorted(adviezen_links):
    print(link)

print(f"\nTotaal aantal unieke adviezen links: {len(adviezen_links)}")

In [4]:
res = []
for url in alle_jaren_urls:
    client = OpenAI()

    prompt = f"""
    Lees dit document: {url}

    Vul de onderstaande JSON in volgens https://github.com/wooverheid/WoogleDocumentatie/tree/master/SPEC%20MetadataSchema:

    {{
        "dc_title": "",
        "foi_publishedDate": "",
        "dc_description": ""
    }}

    Regels:
    - Alleen écht gevonden info invullen.
    - Max 100 woorden in dc_description.
    - Ontbreekt iets? dan "" laten.
    - datum in jjjj-mm-dd
    - Output = alleen geldige JSON niks anders.
    """
    response = client.responses.create(
        model="gpt-5.1",
        input=prompt,
        tools=[{"type": "web_search"}],
        max_output_tokens=800,
    )
    
    json_str = response.output_text.strip()

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError:
        print("Kon JSON niet parsen voor URL:", url)
        print("Ruwe output was:\n", json_str)
        continue

    data["dc_source"] = url
    data["dc_publisher"] = "oorg10196"
    data["dc_type"] = "2e"
        
    res.append(data)
    
    print(json.dumps(data, ensure_ascii=False, indent=2))
    
    time.sleep(12)

with open("MER_adviesScraper_data1.json", "w", encoding="utf-8") as f:
    json.dump(res, f, ensure_ascii=False, indent=2)
    
print("\nAlle JSON-data opgeslagen in MER_adviesScraper_data1.json")

NameError: name 'alle_jaren_urls' is not defined

In [5]:
with open("MER_adviesScraper_data1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Aantal items in JSON bestand: " + str(len(data)))

Aantal items in JSON bestand: 546


In [6]:
res2 = []
from datetime import datetime

with open("MER_adviesScraper_data1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def beginSelenium():
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )
    return driver

def retrieveLinksPagina2(driver, urltje):
    resje = []
    driver.get(urltje)

    downloads = driver.find_elements(By.CSS_SELECTOR, "article#article a.download")

    for download in downloads:
        title = download.text.strip()
        url = download.get_attribute("href")
        if url:  # meteen al filteren
            resje.append({
                "foi_title": title,
                "foi_source": url
            })

    return resje

res2 = []
driver = beginSelenium()

for item in data:
    datum = item.get("foi_publishedDate")
    dc_source2 = item.get("dc_source")
    print(dc_source2)

    # altijd initialiseren zodat je geen oude waarde hergebruikt
    foi_files = []

    if datum:
        jaar = datetime.strptime(datum, "%Y-%m-%d").year
        if jaar in [2019, 2020, 2021, 2022, 2023, 2024]:
            foi_files = retrieveLinksPagina2(driver, dc_source2)

    print("aantal files: " + str(len(foi_files)))

    res2.append({
        "dc_title": item.get("dc_title"),
        "foi_publishedDate": datum,
        "dc_description": item.get("dc_description"),
        "dc_source": dc_source2,
        "dc_publisher": item.get("dc_publisher"),
        "dc_type": item.get("dc_type"),
        "foi_files": foi_files
    })
    print("-------")

    time.sleep(5)

driver.quit()

with open("MER_adviesScraper_data2.json", "w", encoding="utf-8") as f:
    json.dump(res2, f, ensure_ascii=False, indent=2)

print("\nAlle JSON-data opgeslagen in MER_adviesScraper_data2.json")

https://www.commissiemer.nl/adviezen/3696
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3736
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3386
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3538
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3635
aantal files: 4
-------
https://www.commissiemer.nl/adviezen/3651
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3370
aantal files: 4
-------
https://www.commissiemer.nl/adviezen/3731
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3541
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3738
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/3745
aantal files: 2
-------
https://www.commissiemer.nl/adviezen/2854
aantal files: 5
-------
https://www.commissiemer.nl/adviezen/3629
aantal files: 4
-------
https://www.commissiemer.nl/adviezen/3374
aantal files: 4
-------
https://www.commissiemer.nl/adviezen/3554
aantal files: 4
-------
https://ww

In [7]:
with open("MER_adviesScraper_data2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Aantal items in JSON bestand: " + str(len(data)))

Aantal items in JSON bestand: 546
