In [23]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import json

## Step 1: get all topics

In [5]:
# Scrape the topic overview page and build `topics`, a mapping of
# topic slug -> {"source": absolute URL of the topic page}.
TOPIC_PREFIX = '/onderwerpen/'

url = "https://www.rijksoverheid.nl/onderwerpen"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an error page into an
# empty/partial topic list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

topics = {}

for link in soup.find_all('a', href=True):
    href = link['href']
    if not href.startswith(TOPIC_PREFIX):
        continue

    # Ignore any fragment/query string and surrounding slashes so that
    # '/onderwerpen/foo/' or '/onderwerpen/foo?x=1' still yields the slug 'foo'
    # instead of an empty or polluted key.
    relative_path = href[len(TOPIC_PREFIX):].split('#', 1)[0].split('?', 1)[0].strip('/')
    if not relative_path:
        continue

    # The slug is the last path segment.
    # NOTE(review): links under '/onderwerpen/themas/...' are also collected
    # here (keyed by their last segment); confirm whether theme pages are
    # meant to be treated as topics.
    topic_name = relative_path.split('/')[-1]
    full_link = f"https://www.rijksoverheid.nl{href}"
    topics[topic_name] = {
        "source": full_link
    }

In [6]:
# Show the scraped mapping (topic slug -> {"source": url}) to eyeball the result.
topics

{'coronavirus-vaccinatie': {'source': 'https://www.rijksoverheid.nl/onderwerpen/coronavirus-vaccinatie'},
 'schoolvakanties': {'source': 'https://www.rijksoverheid.nl/onderwerpen/schoolvakanties'},
 'koopkracht': {'source': 'https://www.rijksoverheid.nl/onderwerpen/koopkracht'},
 'minimumloon': {'source': 'https://www.rijksoverheid.nl/onderwerpen/minimumloon'},
 'arbeidsovereenkomst-en-cao': {'source': 'https://www.rijksoverheid.nl/onderwerpen/arbeidsovereenkomst-en-cao'},
 'belastingen-uitkeringen-en-toeslagen': {'source': 'https://www.rijksoverheid.nl/onderwerpen/themas/belastingen-uitkeringen-en-toeslagen'},
 'aanpak-belastingontwijking-en-belastingontduiking': {'source': 'https://www.rijksoverheid.nl/onderwerpen/aanpak-belastingontwijking-en-belastingontduiking'},
 'algemene-nabestaandenwet-anw': {'source': 'https://www.rijksoverheid.nl/onderwerpen/algemene-nabestaandenwet-anw'},
 'algemene-ouderdomswet-aow': {'source': 'https://www.rijksoverheid.nl/onderwerpen/algemene-ouderdomswe

## Step 2: get all questions for each topic

In [9]:
# For every topic slug, ask the open-data FAQ endpoint for its questions and
# store the decoded JSON under topics[slug]["questions"].
#
# When the request fails (non-200 status, network error, or a body that is not
# valid JSON) the sentinel string below is stored instead, so one bad topic no
# longer aborts the whole multi-minute loop.
# NOTE(review): `rows=200` presumably caps the number of questions returned per
# topic; confirm against the API documentation whether paging is needed.
QUESTIONS_UNAVAILABLE = "No data available or error occurred"

for topic_name in tqdm(topics.keys(), desc="Processing topics"):
    json_url = f"https://opendata.rijksoverheid.nl/v1/infotypes/faq/subjects/{topic_name}?rows=200&output=json"

    try:
        # Timeout so a stalled connection cannot hang the notebook indefinitely.
        json_response = requests.get(json_url, timeout=30)
        if json_response.status_code == 200:
            questions = json_response.json()
        else:
            questions = QUESTIONS_UNAVAILABLE
    except (requests.RequestException, ValueError):
        # RequestException: connection/timeout problems.
        # ValueError: body could not be decoded as JSON.
        questions = QUESTIONS_UNAVAILABLE

    topics[topic_name]["questions"] = questions

    time.sleep(1)  # Be polite to the API: at most one request per second

Processing topics: 100%|██████████| 355/355 [08:08<00:00,  1.38s/it]


In [10]:
# Show `topics` again; each entry now also has a "questions" value set by the loop above.
topics

{'coronavirus-vaccinatie': {'source': 'https://www.rijksoverheid.nl/onderwerpen/coronavirus-vaccinatie',
  'questions': []},
 'schoolvakanties': {'source': 'https://www.rijksoverheid.nl/onderwerpen/schoolvakanties',
  'questions': [{'id': '62bdc113-aee1-4469-bc36-4c7c3fc39a4d',
    'type': 'vraag en antwoord',
    'canonical': 'https://www.rijksoverheid.nl/onderwerpen/leerplicht/vraag-en-antwoord/leerplicht-schoolvakanties',
    'dataurl': 'https://opendata.rijksoverheid.nl/v1/infotypes/faq/62bdc113-aee1-4469-bc36-4c7c3fc39a4d',
    'question': 'Mag ik mijn kind meenemen op vakantie buiten de schoolvakantie?',
    'lastmodified': '2023-05-31T13:52:55.841Z'}]},
 'koopkracht': {'source': 'https://www.rijksoverheid.nl/onderwerpen/koopkracht',
  'questions': [{'id': '7b771e48-4407-497e-b5da-8d9cf7f050b8',
    'type': 'vraag en antwoord',
    'canonical': 'https://www.rijksoverheid.nl/onderwerpen/belasting-betalen/vraag-en-antwoord/accijns-betalen',
    'dataurl': 'https://opendata.rijksove

## Step 3: Get the answer for each question

In [19]:
# Flatten the list of questions across all topics
all_questions = [(topic_name, q) for topic_name, topic in topics.items() for q in topic.get('questions', []) if type(q) == dict and 'dataurl' in q]

# Single progress bar for all questions
for topic_name, question in tqdm(all_questions, desc="Fetching answers for all questions"):
    json_url = f"{question['dataurl']}?output=json"
    json_response = requests.get(json_url)
    
    if json_response.status_code == 200:
        question["answer"] = json_response.json()
    else:
        
        print(f"Failed to fetch answer for topic '{topic_name}', question '{question['question']}'")

    time.sleep(0.5)  # Wait for 0.5 second before the next request

Fetching answers for all questions: 100%|██████████| 2597/2597 [29:08<00:00,  1.49it/s]  


## Step 4: Save all data

In [27]:
# Persist only the question records (the second element of each
# (topic_name, question) pair) as a flat JSON array.
qa_records = [question for _topic_name, question in all_questions]
with open("./../data/qa-data.json", mode='w') as qa_file:
    json.dump(qa_records, fp=qa_file)

In [30]:
# Persist the complete `topics` mapping as a single JSON object.
with open("./../data/topic-data.json", mode='w') as topic_file:
    json.dump(obj=topics, fp=topic_file)