The Belgian Federal Parliament (De Kamer) provides an overview of all written questions of members of parliament to the ministers. There is no API. The questions and answers are available through pdfs or html pages. 

# Setting up

In [1]:
# show all outputs of cell, not merely of last line (i.e. default of Jupyter Notebook)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [74]:
import os
import requests
from bs4 import BeautifulSoup
# import fitz  # PyMuPDF
import pandas as pd

import re

from collections import Counter

import pickle

In [3]:
# Set base_url of "Bulletins vragen en antwoorden - Zittingsperiode : 55"
base_url = "https://www.dekamer.be/kvvcr/showpage.cfm?section=/qrva&language=nl&cfm=qrvaList.cfm"

# Extract various Bulletins

First we create a function to obtain the urls of all so-called Bulletins in which the Federal Parliament provides the individual questions. 

In [4]:
def scrape_list_bulletins(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        bulletin_urls = []
    
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat'):
                bulletin_url = f"https://www.dekamer.be/kvvcr/{link['href']}"

                # print(bulletin_url)

                bulletin_urls.append(bulletin_url)
            
                # question_response = requests.get(question_url)
                # question_soup = BeautifulSoup(question_response.text, 'html.parser')
                
                # question_info = extract_question_info(question_soup)
                # bulletin_urls.append(question_info)

        # Remove duplicates, since urls are 2 times shown
        bulletin_urls = sorted(set(bulletin_urls))
        
        return bulletin_urls



    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")


In [5]:
# Define url of page with overview of urls of bulletins
url_main_page = "https://www.dekamer.be/kvvcr/showpage.cfm?section=/qrva&language=nl&cfm=qrvaList.cfm"

# Obtain urls
bulletin_urls_main_page = scrape_list_bulletins(url_main_page)

# Inspect results
bulletin_urls_main_page

['https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B001',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B002',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B003',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B004',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B005',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B006',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B007',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B008',
 'https://www.dekamer.be/kvvcr/showpage.cfm?&lan

# Extract relevant information for each question

Each Bulletin contains various questions. The html page of the Bulletin provides for each question some information, which can extract:
* Auteur (i.e. author, the member of parliament posing the question)
* Departement (i.e. the minister to whom the question is directed)
* Titel (i.e. subject of the question)
* Datum ingediend (i.e. the date when the question was asked)
* Antwoord gepubliceerd (i.e. the hyperlink to the answer to the question)

* To obtain these elements, we create various functions. 

Then we obtain for each of those bulletins all questions and the relevant information for these questions. For this we use a helper function to extract the author, relevant minister and subject of the question. The html structure is not so clear (i.e. not all relevant elements are contained within a single container, so using specific anchors does not suffice). Later on the date the question was asked, as well as wether an answer was provided before publishing the question or not can be obtained. 

In [6]:
def split_author(input_string):
    """
    Function to split strings as obtained from html page into name of member, his/her party and id of question

    e.g. 
    'Anneleen\n      Van Bossuyt,\n      N-VA (07354)'
    ('Anneleen Van Bossuyt', ' N-VA ', '07354')
    """
    # Remove unnecessary characters (newlines and bracket at end (of id number))
    cleaned_string = input_string.replace("\n", "").rstrip(")")
    
    # Replace any sequence of spaces with a single space
    cleaned_string = re.sub(r'\s+', ' ', cleaned_string)

    # split on comma (between name and party) and left bracket (between party and id number)
    name, party, id_number = re.split(r',|\(', cleaned_string)

    return name, party, id_number

In [7]:
# WORKABLE CODE ##

def scrape_bulletin(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        question_containers_0 = soup.find_all('div', class_='linklist_0')
        question_containers_1 = soup.find_all('div', class_='linklist_1')

        entries = []
        
        for question_container in question_containers_0 + question_containers_1:


            tr_elements = question_container.find_all('tr')
            # print("----", tr_elements)

            # Initialize variables for additional information
            author, party_author, id_question, department, title = "N/A", "N/A", "N/A", "N/A", "N/A"

            for tr_element in tr_elements:

                # print(tr_element)
                # print("**********")
                
                td_elements = tr_element.find_all('td', class_='txt')

                
                
                if len(td_elements) == 2:
                    label, value = td_elements[0].text.strip(), td_elements[1].text.strip()

                    # print(label)
                    # print("****")
                    # print(value)
                    # print("***************")

                    if "Auteur" in label:
                        # Split string on about using dedicatd function
                        author, party_author, id_question = split_author(value)
                    elif "Departement" in label:
                        department = value
                    elif "Titel" in label:
                        title = value
                    # elif "Datum indiening" in label:
                    #     date_questions = value
                    # elif "Antwoord gepubliceerd" in label:
                    #     # Extract the URL if available
                    #     answer_published = td_elements[1].find('a')['href'] if td_elements[1].find('a') else "N/A"




            # # Print or store the extracted information
            # print(f"Auteur: {author}")
            # print(f"Departement: {department}")
            # print(f"Titel: {title}")
            # print(f"Datum vraag: {date_questions}")
            # print("----")

            if not author == party_author == id_question == department == title == 'N/A':
                entries.append([id_question, author, party_author, department, title])

        return entries
                

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")



In [8]:
# # limit amounf of pages for testing
# bulletin_urls_main_page = bulletin_urls_main_page[:3]

In [9]:
# Initialize list with details on all questions
questions_all = []
# iterate over all urls of bulletin
for index, url in enumerate(bulletin_urls_main_page):
    # Obtain information of all questions in relevant bulletin
    questions_per_bulletin = scrape_bulletin(url)
    questions_all.extend(questions_per_bulletin) # extend to overall list (no appending to avoid nesting)
    print(f"Aantal pagina's aan vragen verwerkt: {index + 1}.") # Show progress

Aantal pagina's aan vragen verwerkt: 1.
Aantal pagina's aan vragen verwerkt: 2.
Aantal pagina's aan vragen verwerkt: 3.
Aantal pagina's aan vragen verwerkt: 4.
Aantal pagina's aan vragen verwerkt: 5.
Aantal pagina's aan vragen verwerkt: 6.
Aantal pagina's aan vragen verwerkt: 7.
Aantal pagina's aan vragen verwerkt: 8.
Aantal pagina's aan vragen verwerkt: 9.
Aantal pagina's aan vragen verwerkt: 10.
Aantal pagina's aan vragen verwerkt: 11.
Aantal pagina's aan vragen verwerkt: 12.
Aantal pagina's aan vragen verwerkt: 13.
Aantal pagina's aan vragen verwerkt: 14.
Aantal pagina's aan vragen verwerkt: 15.
Aantal pagina's aan vragen verwerkt: 16.
Aantal pagina's aan vragen verwerkt: 17.
Aantal pagina's aan vragen verwerkt: 18.
Aantal pagina's aan vragen verwerkt: 19.
Aantal pagina's aan vragen verwerkt: 20.
Aantal pagina's aan vragen verwerkt: 21.
Aantal pagina's aan vragen verwerkt: 22.
Aantal pagina's aan vragen verwerkt: 23.
Aantal pagina's aan vragen verwerkt: 24.
Aantal pagina's aan vrage

In [100]:
# Turn list into dataframe
questions_df = pd.DataFrame(questions_all,
                            columns = ["ID vraag", "Parlementslid", "Partij parlementslid", "Minister (bevoegdheden)", "Onderwerp"]
                           )

In [101]:
# # Inspect results
# questions_all
questions_df

questions_df["Partij parlementslid"].value_counts()

Unnamed: 0,ID vraag,Parlementslid,Partij parlementslid,Minister (bevoegdheden),Onderwerp
0,07354,Anneleen Van Bossuyt,N-VA,"Minister van Mobiliteit, belast met Belgocontr...",Toegankelijkheid Sint-Pietersstation Gent voor...
1,01201,Leen Dierick,CD&V,"Minister van Werk, Economie en Consumenten, be...",Verzekering van werktuigen die deelnemen aan h...
2,01207,Kattrin Jadin,MR,"Minister van Werk, Economie en Consumenten, be...",Digitalisering van de Belgische economie - Stu...
3,01131,Jef Van den Bergh,CD&V,"Minister van Digitale Agenda, Telecommunicatie...",Mobiele dekkingsgraad.
4,06498,Wouter Raskin,N-VA,"Minister van Digitale Agenda, Telecommunicatie...",Limosa-aangiftes.
...,...,...,...,...,...
32590,07244,Samuel Cogolati,Ecolo-Groen,Vice-eersteminister en Minister van Economie e...,Toenemend onevenwicht op de handelsbalans met ...
32591,07022,Michael Freilich,N-VA,Vice-eersteminister en Minister van Economie e...,Contacten tussen mevrouw Hanard en uw kabinets...
32592,07075,Tom Van Grieken,VB,Vice-eersteminister en Minister van Economie e...,Kosten consultancy.
32593,06383,Philippe Goffin,MR,Vice-eersteminister en Minister van Economie e...,Risico op een voedselcrisis ten gevolge van de...


Partij parlementslid
 VB              7352
 N-VA            6365
 MR              4534
 CD&V            2810
 Ecolo-Groen     2608
 PS              2067
 PVDA-PTB        1847
 Open Vld        1479
 Voorui           887
 cd&v             642
 sp.a             622
 INDEP            440
 cdH              389
 LENGAG           277
 DéFI             276
Name: count, dtype: int64

Upon inspection of the results, it seems some party names are incorrect. Sometimes abbreviations are use ('VB' instead of 'Vlaams Belang') or a party changed names during the legislative periode (e.g. 'sp.a' v. 'Vooruit').

In [102]:
# Strip leading and trailing whitespace from column names
questions_df.columns = questions_df.columns.str.strip()

# Replace party names, accounting for trailing spaces
questions_df["Partij parlementslid"] = questions_df["Partij parlementslid"].str.strip().replace(['sp.a', 'Voorui'], 'Vooruit')
questions_df["Partij parlementslid"] = questions_df["Partij parlementslid"].str.strip().replace('cdH', 'LENGAG')

# Additional replacements, accounting for trailing spaces
questions_df["Partij parlementslid"] = questions_df["Partij parlementslid"].str.strip().replace({
    'LENGAG': 'Les Engagés', 'CD&V': 'cd&v', 'DéFI': 'Défi', 'VB': 'Vlaams Belang', 'INDEP': 'Onafhankelijk'})


questions_df["Partij parlementslid"].value_counts()

Partij parlementslid
Vlaams Belang    7352
N-VA             6365
MR               4534
cd&v             3452
Ecolo-Groen      2608
PS               2067
PVDA-PTB         1847
Vooruit          1509
Open Vld         1479
Les Engagés       666
Onafhankelijk     440
Défi              276
Name: count, dtype: int64

Also, the responsible minister is merely name by the competences he/she manages. It seems more user-friendly to replace this by the actual name of the minister.

**Once dates of question can be taken into account, the change in ministers can be taken into account. Currently we simply refer to the name of the current minister holding the post**

In [103]:
minister_competences_2_names_dict = {
    # Function allocated to multiple persons, unclear to distinguish based on allocated competences
    'Eerste Minister': 'Alexander De Croo',
    'Eerste Minister': 'Sophie Wilmès', # 27 oktober 2019 - 30 november 2019 (before exit Reynders) 
    'Eerste Minister': 'Charles Michel',

    # Michel II (9 december 2018 - 27 oktober 2019) (and not continued as such in Wilmès-I and / Wilmès-II)
        # See https://nl.wikipedia.org/wiki/Regering-Michel_II
    'Minister van Begroting en van Ambtenarenzaken, belast met de Nationale Loterij en Wetenschapsbeleid': 'Sophie Wilmès',
    'Minister van Werk, Economie en Consumenten, belast met Buitenlandse Handel, Armoedebestrijding, Gelijke Kansen en Personen met een beperking': 'Wouter Beke', # 2 juli 2019 - 2 oktober 2019 
    
    
    # Wilmès-I (27 oktober 2019 - 17 maart 2020)
        # See https://nl.wikipedia.org/wiki/Regering-Wilm%C3%A8s_I

        # Before exit Didier Reynders (27 oktober 2019 - 30 november 2019)
    'Vice-eersteminister en Minister van Buitenlandse en Europese Zaken, en van Defensie, belast met Beliris en de Federale Culturele Instellingen': 'Didier Reynders', 
    'Minister van Begroting en van Ambtenarenzaken, belast met de Nationale Loterij en Wetenschapsbeleid': 'David Clarinval', 
    'Vice-eersteminister en Minister van Justitie, belast met de Regie der Gebouwen': 'Koen Geens', 

        # After exit Didier Reynders: 
        # competences shifted to Wilmès, Clarinval and Geens (but remained same during Wilmès I and Wilmès II)
    
    # Wilmès-II (17 maart 2020 - 1 oktober 2020)
        # Zie https://nl.wikipedia.org/wiki/Regering-Wilm%C3%A8s_II


    # Wilmès I en Wilmès II (so no changes at 17 maart 2020 - so from 27 oktober 2019 or 30 november 2019 until 1 oktober 2020)
        # After exit Reynders (as of 30 november 2019 until 1 oktober 2020)
    'Eerste Minister, belast met Beliris en de Federale Culturele Instellingen': 'Sophie Wilmès', 
    'Vice-eersteminister en Minister van Begroting en van Ambtenarenzaken, belast met de Nationale Loterij en Wetenschapsbeleid': 'David Clarinval', 
    'Vice-eersteminister en Minister van Justitie, belast met de Regie der Gebouwen, en Minister van Europese Zaken': 'Koen Geens',

        # General (as of 27 oktober 2019 until 1 oktober 2020)
    'Vice-eersteminister en Minister van Financiën, belast met Bestrijding van de fiscale fraude, en Minister van Ontwikkelingszaken': 'Alexander De Croo',

    'Minister van Buitenlandse Zaken, en van Defensie': 'Philippe Goffin',
    'Minister van Digitale Agenda, Telecommunicatie en Post, belast met Administratieve Vereenvoudiging, Bestrijding van de sociale fraude, Privacy en Noordzee': 'Philippe De Backer',
    'Minister van Energie, Leefmilieu en Duurzame Ontwikkeling': 'Marie-Christine Marghem',
    "Minister van Middenstand, Zelfstandigen, Kmo's, Landbouw, en Maatschappelijke Integratie, belast met Grote Steden": 'Denis Ducarme',
    'Minister van Mobiliteit, belast met Belgocontrol en de Nationale Maatschappij der Belgische spoorwegen': 'François Bellot',
    'Minister van Pensioenen': 'Daniel Bacquelaine',
    'Minister van Sociale Zaken en Volksgezondheid, en van Asiel en Migratie': 'Maggie De Block',
    'Minister van Veiligheid en Binnenlandse Zaken': 'Pieter De Crem',
    'Minister van Werk, Economie en Consumenten, belast met Armoedebestrijding, Gelijke Kansen en Personen met een beperking': 'Nathalie Muylle', # 2 oktober 2019 - 27 oktober 2019 
     
    # Regering De Croo

    'Vice-eersteminister en Minister van Economie en Werk': 'Pierre-Yves Dermagne',
    'Vice-eersteminister en Minister van Buitenlandse Zaken, Europese Zaken en Buitenlandse Handel, en de Federale Culturele Instellingen': 'Sophie Wilmès', # 1 oktober - 14 juli 2022
    'Vice-eersteminister en Minister van Mobiliteit': 'Georges Gilkinet',
    'Vice-eersteminister en Minister van Financiën, belast met de Coördinatie van de fraudebestrijding': 'Vincent Van Peteghem',
    'Vice-eersteminister en Minister van Sociale Zaken en Volksgezondheid': 'Frank Vandenbroucke',
    'Vice-eersteminister en Minister van Ambtenarenzaken, Overheidsbedrijven, Telecommunicatie en Post': 'Petra De Sutter',
    'Vice-eersteminister en Minister van Justitie, belast met de Noordzee': 'Vincent Van Quickenborne', # 1 oktober 2020 - 22 oktober 2023
    'Vice-eersteminister en Minister van Justitie, belast met de Noordzee': 'Paul Van Tigchelt', # 22 oktober 2023 - now
    
    

    "Minister van Middenstand, Zelfstandigen, Kmo's en Landbouw, Institutionele Hervormingen en Democratische Vernieuwing": 'David Clarinval',
    "Minister van Middenstand, Zelfstandigen, Kmo's en Landbouw, Institutionele Hervormingen en Democratische Vernieuwing, belast met Buitenlandse Handel" :'David Clarinval', 
    'Minister van Pensioenen en Maatschappelijke Integratie, belast met Personen met een beperking, Armoedebestrijding en Beliris': 'Karine Lalieux',
    'Minister van Defensie': 'Ludivine Dedonder',
    'Minister van Klimaat, Leefmilieu, Duurzame Ontwikkeling en Green Deal': 'Zakia Khattabi',
    'Minister van Binnenlandse Zaken, Institutionele Hervormingen en Democratische Vernieuwing': 'Annelies Verlinden',
    'Minister van Ontwikkelingssamenwerking en Grootstedenbeleid': 'Meryame Kitir', # 1 oktober 2020 - 17 december 2022 
    'Minister van Ontwikkelingssamenwerking en Grootstedenbeleid': 'Caroline Gennez', # 17 december 2022 - now
    # 'Minister van Ontwikkelingssamenwerking, belast met Grote Steden' seems to be same as 'Minister van Ontwikkelingssamenwerking en Grootstedenbeleid'
    'Minister van Ontwikkelingssamenwerking, belast met Grote Steden': 'Meryame Kitir', # 1 oktober 2020 - 17 december 2022 
    'Minister van Ontwikkelingssamenwerking, belast met Grote Steden': 'Caroline Gennez', # 17 december 2022 - now
    'Minister van Energie': 'Tinne Van der Straeten',
    'Minister van Buitenlandse Zaken, Europese Zaken en Buitenlandse Handel, en de Federale Culturele Instellingen.': 'Hadja Lahbib', # 15 juli 2022 - now
    

    'Staatssecretaris voor Relance en Strategische Investeringen, belast met Wetenschapsbeleid, toegevoegd aan de Minister van Economie en Werk': 'Thomas Dermine',
    'Staatssecretaris voor Digitalisering, belast met Administratieve Vereenvoudiging, Privacy en de Regie der Gebouwen, toegevoegd aan de Eerste Minister': 'Mathieu Michel',
    'Staatssecretaris voor Digitalisering, belast met Administratieve Vereenvoudiging, Privacy en de Regie der Gebouwen, de Federale Culturele Instellingen, toegevoegd aan de Eerste Minister': 'Mathieu Michel',
    'Staatssecretaris voor Gendergelijkheid, Gelijke Kansen en Diversiteit, toegevoegd aan de Minister van Mobiliteit': 'Sarah Schlitz', # 1 oktober 2020 - 26 april 2023
    'Staatssecretaris voor Gendergelijkheid, Gelijke Kansen en Diversiteit, toegevoegd aan de Minister van Mobiliteit': 'Marie-Colline Leroy', # 2 mei 2023 - now
    'Staatssecretaris voor Asiel en Migratie, belast met de Nationale Loterij, toegevoegd aan de Minister van Binnenlandse Zaken, Institutionele Hervormingen en Democratische Vernieuwing': 'Sammy Mahdi', # 1 oktober 2020 - 28 juni 2022
    'Staatssecretaris voor Asiel en Migratie, toegevoegd aan de Minister van Binnenlandse Zaken, Institutionele Hervormingen en Democratische Vernieuwing': 'Nicole de Moor', # 28 juni 2022 - now # 'Nationale Loterij' now at Van Peteghem
    'Staatssecretaris voor Begroting en Consumentenbescherming, toegevoegd aan de Minister van Justitie, belast met de Noordzee': 'Eva De Bleeker', # 1 oktober 2020 - 18 november 2022 
    'Staatssecretaris voor Begroting en Consumentenbescherming, toegevoegd aan de Minister van Justitie, belast met de Noordzee': 'Alexia Bertrand', # 18 november 2022 - now
}

In [104]:
# Temporary modification of dict, until splitting by time is possible
minister_competences_2_names_dict["Eerste minister"] = 'Alexander De Croo / Sophie Wilmès / Charles Michel'
minister_competences_2_names_dict['Minister van Begroting en van Ambtenarenzaken, belast met de Nationale Loterij en Wetenschapsbeleid'] = 'Sophie Wilmès / David Clarinval'
minister_competences_2_names_dict['Vice-eersteminister en Minister van Justitie, belast met de Noordzee'] = 'Vincent Van Quickenborne / Paul Van Tigchelt'
minister_competences_2_names_dict['Minister van Ontwikkelingssamenwerking en Grootstedenbeleid'] = 'Meryame Kitir / Caroline Gennez'
minister_competences_2_names_dict['Minister van Ontwikkelingssamenwerking, belast met Grote Steden'] = 'Meryame Kitir / Caroline Gennez'
minister_competences_2_names_dict['Staatssecretaris voor Gendergelijkheid, Gelijke Kansen en Diversiteit, toegevoegd aan de Minister van Mobiliteit'] = 'Sarah Schlitz / Marie-Colline Leroy'
minister_competences_2_names_dict['Staatssecretaris voor Begroting en Consumentenbescherming, toegevoegd aan de Minister van Justitie, belast met de Noordzee'] = 'Eva De Bleeker / Alexia Bertrand'



In [105]:
# Assess results
minister_competences_2_names_dict.keys()

minister_competences_2_names_dict["Eerste minister"]
minister_competences_2_names_dict['Minister van Begroting en van Ambtenarenzaken, belast met de Nationale Loterij en Wetenschapsbeleid']
minister_competences_2_names_dict['Vice-eersteminister en Minister van Justitie, belast met de Noordzee'] 
minister_competences_2_names_dict['Minister van Ontwikkelingssamenwerking en Grootstedenbeleid']
minister_competences_2_names_dict['Minister van Ontwikkelingssamenwerking, belast met Grote Steden']
minister_competences_2_names_dict['Staatssecretaris voor Gendergelijkheid, Gelijke Kansen en Diversiteit, toegevoegd aan de Minister van Mobiliteit'] 
minister_competences_2_names_dict['Staatssecretaris voor Begroting en Consumentenbescherming, toegevoegd aan de Minister van Justitie, belast met de Noordzee']

dict_keys(['Eerste Minister', 'Minister van Begroting en van Ambtenarenzaken, belast met de Nationale Loterij en Wetenschapsbeleid', 'Minister van Werk, Economie en Consumenten, belast met Buitenlandse Handel, Armoedebestrijding, Gelijke Kansen en Personen met een beperking', 'Vice-eersteminister en Minister van Buitenlandse en Europese Zaken, en van Defensie, belast met Beliris en de Federale Culturele Instellingen', 'Vice-eersteminister en Minister van Justitie, belast met de Regie der Gebouwen', 'Eerste Minister, belast met Beliris en de Federale Culturele Instellingen', 'Vice-eersteminister en Minister van Begroting en van Ambtenarenzaken, belast met de Nationale Loterij en Wetenschapsbeleid', 'Vice-eersteminister en Minister van Justitie, belast met de Regie der Gebouwen, en Minister van Europese Zaken', 'Vice-eersteminister en Minister van Financiën, belast met Bestrijding van de fiscale fraude, en Minister van Ontwikkelingszaken', 'Minister van Buitenlandse Zaken, en van Defensi

'Alexander De Croo / Sophie Wilmès / Charles Michel'

'Sophie Wilmès / David Clarinval'

'Vincent Van Quickenborne / Paul Van Tigchelt'

'Meryame Kitir / Caroline Gennez'

'Meryame Kitir / Caroline Gennez'

'Sarah Schlitz / Marie-Colline Leroy'

'Eva De Bleeker / Alexia Bertrand'

In [106]:
# map names to positions using dict
questions_df["Minister"] = questions_df["Minister (bevoegdheden)"].map(minister_competences_2_names_dict)

In [107]:
# Inspect results (include check to ensure that names assigned for all posts / competences
questions_df.head()
questions_df[questions_df["Minister"].isna()]
questions_df["Minister"].unique()

Unnamed: 0,ID vraag,Parlementslid,Partij parlementslid,Minister (bevoegdheden),Onderwerp,Minister
0,7354,Anneleen Van Bossuyt,N-VA,"Minister van Mobiliteit, belast met Belgocontr...",Toegankelijkheid Sint-Pietersstation Gent voor...,François Bellot
1,1201,Leen Dierick,cd&v,"Minister van Werk, Economie en Consumenten, be...",Verzekering van werktuigen die deelnemen aan h...,Wouter Beke
2,1207,Kattrin Jadin,MR,"Minister van Werk, Economie en Consumenten, be...",Digitalisering van de Belgische economie - Stu...,Wouter Beke
3,1131,Jef Van den Bergh,cd&v,"Minister van Digitale Agenda, Telecommunicatie...",Mobiele dekkingsgraad.,Philippe De Backer
4,6498,Wouter Raskin,N-VA,"Minister van Digitale Agenda, Telecommunicatie...",Limosa-aangiftes.,Philippe De Backer


Unnamed: 0,ID vraag,Parlementslid,Partij parlementslid,Minister (bevoegdheden),Onderwerp,Minister


array(['François Bellot', 'Wouter Beke', 'Philippe De Backer',
       'Denis Ducarme', 'Marie-Christine Marghem', 'Maggie De Block',
       'Pieter De Crem', 'Didier Reynders', 'Alexander De Croo',
       'Koen Geens', 'Charles Michel', 'Sophie Wilmès / David Clarinval',
       'Daniel Bacquelaine', 'Nathalie Muylle', 'Philippe Goffin',
       'David Clarinval', 'Sophie Wilmès', 'Ludivine Dedonder',
       'Eva De Bleeker / Alexia Bertrand', 'Annelies Verlinden',
       'Vincent Van Quickenborne / Paul Van Tigchelt', 'Petra De Sutter',
       'Vincent Van Peteghem', 'Georges Gilkinet', 'Pierre-Yves Dermagne',
       'Zakia Khattabi', 'Frank Vandenbroucke', 'Sammy Mahdi',
       'Sarah Schlitz / Marie-Colline Leroy', 'Mathieu Michel',
       'Tinne Van der Straeten', 'Karine Lalieux', 'Thomas Dermine',
       'Meryame Kitir / Caroline Gennez', 'Hadja Lahbib',
       'Nicole de Moor'], dtype=object)

In [108]:
## Save details_questions_term_df for later use
# 1. Save as pkl
with open('../data/federal_details_questions_df.pkl', 'wb') as file:
    pickle.dump(questions_df, file)

# 2. Save as csv
questions_df.to_csv(path_or_buf = '../data/federal_details_questions_df.csv',
                               sep = ";",
                               encoding = "utf-16", # to ensure trema's are well handled (e.g. Koen Daniëls)
                               index = False)

# Dump

In [None]:
# Check for duplicate 'keys' (i.e. when exact same position held by multiple people
# Find duplicate keys
# duplicate_keys = [key for key, count in Counter(minister_competences_2_names_dict).items() if count > 1]
# duplicate_keys
# Convert dictionary keys into a list
keys_list = list(minister_competences_2_names_dict.keys())

# Find duplicate keys using a set comprehension
duplicate_keys = {key for key in keys_list if keys_list.count(key) > 1}
duplicate_keys

In [None]:
temp = questions_df.loc[0]["Parlementslid"]
temp

In [None]:
def split_author(input_string):
    """
    Function to split strings as obtained from html page into name of member, his/her party and id of question

    e.g. 
    'Anneleen\n      Van Bossuyt,\n      N-VA (07354)'
    ('Anneleen Van Bossuyt', ' N-VA ', '07354')
    """
    # Remove unnecessary characters (newlines and bracket at end (of id number))
    cleaned_string = input_string.replace("\n", "").replace("\)", "")
    
    # Replace any sequence of spaces with a single space
    cleaned_string = re.sub(r'\s+', ' ', input_string)

    # split on comma (between name and party) and left bracket (between party and id number)
    name, party, id_number = re.split(r',|\(', cleaned_string)

    return name, party, id_number

In [None]:
name, party, id_nr = split_author(temp)
name, party, id_nr 

In [None]:
questions = scrape_bulletin(url)

In [None]:
# Functiont to extract various elements stored in Beautiful Soup tags
def extract_question_info(question_soup):
    author = question_soup.find('td', text='Auteur').find_next('td').text.strip()
    department = question_soup.find('td', text='Departement').find_next('td').text.strip()
    title = question_soup.find('td', text='Titel').find_next('td').text.strip()
    date_requested = question_soup.find('td', text='Datum indiening').find_next('td').text.strip()
    answer_published = question_soup.find('td', text='Antwoord gepubliceerd').find_next('a')['href']
    
    return {
        'Auteur': author,
        'Departement': department,
        'Title': title,
        'Datum indiening': date_requested,
        'Antwoord gepubliceerd': answer_published
    }



In [None]:
url_b125 = "https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B125"

In [None]:
response = requests.get(url_b125)
soup = BeautifulSoup(response.text, 'html.parser')

# destination_folder = "pdfs"
# os.makedirs(destination_folder, exist_ok=True)

question_data = []

In [None]:
soup

In [None]:
question_data = []

for link in soup.find_all('a', href=True):
    if link['href'].startswith('showpage.cfm?section=qrva&language=nl&cfm=qrvaXml.cfm?legislat='):'
        question_url = f"https://www.dekamer.be/kvvcr/{link['href']}"
        question_response = requests.get(question_url)
        question_soup = BeautifulSoup(question_response.text, 'html.parser')
        
        question_info = extract_question_info(question_soup)
        question_data.append(question_info)

In [None]:
# # WORKABLE CODE ##

# def scrape_bulletin(url):
#     response = requests.get(url)
    
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')
#         question_containers_0 = soup.find_all('div', class_='linklist_0')
#         question_containers_1 = soup.find_all('div', class_='linklist_1')

#         entries = []
        
#         for question_container in question_containers_0 + question_containers_1:
#         # for question_container in question_containers_1:
#             # title_element = question_container.find('a', class_='question-hyperlink')
#             # author_element = question_container.find('div', class_='user-details').a
#             # views_element = question_container.find('div', class_='views')

#             tr_elements = question_container.find_all('tr')
#             print("----", tr_elements)

#             # Initialize variables for additional information
#             author, department, title, date_questions = "N/A", "N/A", "N/A", "N/A"

#             for tr_element in tr_elements:
#                 td_elements = tr_element.find_all('td', class_='txt')
                
#                 if len(td_elements) == 2:
#                     label, value = td_elements[0].text.strip(), td_elements[1].text.strip()

#                     if "Auteur" in label:
#                         author = value
#                     elif "Departement" in label:
#                         department = value
#                     elif "Titel" in label:
#                         title = value
#                     elif "Datum indiening" in label:
#                         date_questions = value
#                     elif "Antwoord gepubliceerd" in label:
#                         # Extract the URL if available
#                         answer_published = td_elements[1].find('a')['href'] if td_elements[1].find('a') else "N/A"




#             # # Print or store the extracted information
#             # print(f"Auteur: {author}")
#             # print(f"Departement: {department}")
#             # print(f"Titel: {title}")
#             # print(f"Datum vraag: {date_questions}")
#             # print("----")

#             if not author == department == title == date_questions == 'N/A':
#                 entries.append([author, department, title, date_questions])

#         return entries
                

#     else:
#         print(f"Failed to retrieve the page. Status code: {response.status_code}")



In [None]:
# WORKABLE CODE ##

def scrape_bulletin(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        question_containers_0 = soup.find_all('div', class_='linklist_0')
        question_containers_1 = soup.find_all('div', class_='linklist_1')

        entries = []
        
        for question_container in question_containers_0 + question_containers_1:
        # for question_container in question_containers_1:
            # title_element = question_container.find('a', class_='question-hyperlink')
            # author_element = question_container.find('div', class_='user-details').a
            # views_element = question_container.find('div', class_='views')

            tr_elements = question_container.find_all('tr')
            print("----", tr_elements)

            # Initialize variables for additional information
            author, department, title, date_questions = "N/A", "N/A", "N/A", "N/A"

            for tr_element in tr_elements:

                # print(tr_element)
                # print("**********")
                
                td_elements = tr_element.find_all('td', class_='txt')

                
                
                if len(td_elements) == 2:
                    label, value = td_elements[0].text.strip(), td_elements[1].text.strip()

                    # print(label)
                    # print("****")
                    # print(value)
                    # print("***************")

                    if "Auteur" in label:
                        author = value
                    elif "Departement" in label:
                        department = value
                    elif "Titel" in label:
                        title = value
                    elif "Datum indiening" in label:
                        date_questions = value
                    elif "Antwoord gepubliceerd" in label:
                        # Extract the URL if available
                        answer_published = td_elements[1].find('a')['href'] if td_elements[1].find('a') else "N/A"




            # # Print or store the extracted information
            # print(f"Auteur: {author}")
            # print(f"Departement: {department}")
            # print(f"Titel: {title}")
            # print(f"Datum vraag: {date_questions}")
            # print("----")

            if not author == department == title == date_questions == 'N/A':
                entries.append([author, department, title, date_questions])

        return entries
                

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")



In [None]:
entries = scrape_bulletin(url_b125)
# entries

In [None]:
entries

In [None]:
# def scrape_bulletin(url):
#     response = requests.get(url)
    
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')
#         table_containers = soup.find_all('table', width="100%")

#         entries = []
        
#         for table_container in table_containers:
#             print("***", table_container)
#             tr_elements = table_container.find_all('tr')
#             # print(tr_elements)

#     #         # Initialize variables for additional information
#     #         author, department, title, date_questions, answer_published = "N/A", "N/A", "N/A", "N/A", "N/A"

#     #         for tr_element in tr_elements:
#     #             td_elements = tr_element.find_all('td', class_='txt')

#     #             if len(td_elements) == 2:
#     #             # if len(tr_element) == 2:
#     #                 label, value = td_element[0].i.text.strip(), td_element[1].text.strip()

#     #                 if "Auteur" in label:
#     #                     author = value
#     #                 elif "Departement" in label:
#     #                     department = value
#     #                 elif "Titel" in label:
#     #                     title = value
#     #                 elif "Datum indiening" in label:
#     #                     date_match = re.search(r'\d{2}/\d{2}/\d{4}', value)
#     #                     date_questions = date_match.group(0) if date_match else "N/A"

#     #                 elif "Antwoord gepubliceerd" in label:
#     #                     answer_url_match = re.search(r'href="([^"]+)"', value)
#     #                     answer_published = answer_url_match.group(1) if answer_url_match else "N/A"

#     #         entries.append([author, department, title, date_questions, answer_published])


#     #         if not author == department == title == date_questions == 'N/A':
#     #             entries.append([author, department, title, date_questions, answer_published])

#     #     return entries
                
#     # else:
#     #     print(f"Failed to retrieve the page. Status code: {response.status_code}")



In [None]:
def scrape_bulletin(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table_containers = soup.find_all('table')

        print("*********", table_containers)

        # entries = []
        
        # for table_container in table_containers:
        #     print("***", table_container)
        #     tr_elements = table_container.find_all('tr')
            # print(tr_elements)

    #         # Initialize variables for additional information
    #         author, department, title, date_questions, answer_published = "N/A", "N/A", "N/A", "N/A", "N/A"

    #         for tr_element in tr_elements:
    #             td_elements = tr_element.find_all('td', class_='txt')

    #             if len(td_elements) == 2:
    #             # if len(tr_element) == 2:
    #                 label, value = td_element[0].i.text.strip(), td_element[1].text.strip()

    #                 if "Auteur" in label:
    #                     author = value
    #                 elif "Departement" in label:
    #                     department = value
    #                 elif "Titel" in label:
    #                     title = value
    #                 elif "Datum indiening" in label:
    #                     date_match = re.search(r'\d{2}/\d{2}/\d{4}', value)
    #                     date_questions = date_match.group(0) if date_match else "N/A"

    #                 elif "Antwoord gepubliceerd" in label:
    #                     answer_url_match = re.search(r'href="([^"]+)"', value)
    #                     answer_published = answer_url_match.group(1) if answer_url_match else "N/A"

    #         entries.append([author, department, title, date_questions, answer_published])


    #         if not author == department == title == date_questions == 'N/A':
    #             entries.append([author, department, title, date_questions, answer_published])

    #     return entries
                
    # else:
    #     print(f"Failed to retrieve the page. Status code: {response.status_code}")



In [None]:
entries

In [None]:
entries = scrape_bulletin(url_b125)
# entries

In [None]:
len(entries)

In [None]:
def scrape_bulletin(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        html_code = response.text
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all elements that start with "<td valign="middle">"
        td_elements = soup.find_all('td', {'valign': 'middle'})
        
        # Iterate through each <td> element
        for i in range(len(td_elements) - 1):
            start_index = html_code.find(str(td_elements[i]))
            end_index = html_code.find(str(td_elements[i + 1]))
        
            # Extract the HTML content between start_index and end_index
            extracted_html = html_code[start_index:end_index]
        
            # Print or further process the extracted HTML content
            print(extracted_html)

            print("*********")


In [None]:
def scrape_bulletin(url):
    response = requests.get(url)
    
    if response.status_code == 200:

        soup = BeautifulSoup(response.text, 'html.parser')


        # Find all the tables with class 'txt'
        tables = soup.find_all('table', class_='txt')
        
        # Loop through each table and extract the required information
        for index, table in enumerate(tables):
            # Extract data from the table
            row_data = table.find_all('tr')
            
            # Extract specific elements from each row
            for row in row_data:
                # Extract the text inside the <td> tags
                cells = row.find_all('td')
                # cells = row.find_all('td', class_='txt')

                # cell_texts = [cell.get_text(strip=True) for cell in cells]
                # cell_texts = [text.replace('\xa0', ' ') for text in cell_texts]  # Replace non-breaking space with regular space
                
                # Print the extracted data
                print("".join(cells))
            
            # Add asterisks to indicate the end of each section
            print('*' * 100)


In [None]:
scrape_bulletin(url_b125)

In [None]:


def extract_information_from_url(url):
    # Fetch HTML content from the provided URL
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to fetch the page. Please check the URL.")
        return

    html_code = response.text
    soup = BeautifulSoup(html_code, 'html.parser')

    # Extracting information from each entry
    tables = soup.find_all('table', class_='txt')

    for table in tables:
        rows = table.find_all('tr')

        # Extracting information from each row
        for row in rows:
            # Check if the row contains relevant information
            if row.find('i', text='Auteur'):
                author = row.find('td', class_='txt').get_text(strip=True)
                department = row.find_next('tr').find('td', class_='txt').get_text(strip=True)
                title = row.find_next('tr').find('td', class_='txt').get_text(strip=True)
                date_submission = row.find('i', text='Datum indiening').find_next('td', class_='txt').get_text(strip=True)
                answer_published_element = row.find('i', text='Antwoord gepubliceerd')
                answer_published = answer_published_element.find_next('a').get('href') if answer_published_element else "N/A"

                # Print or use the extracted information as needed
                print("Auteur:", author)
                print("Departement:", department)
                print("Titel:", title)
                print("Datum indiening:", date_submission)
                print("Antwoord gepubliceerd:", answer_published)
                print("\n")

# Example usage with a URL
url = "https://example.com/your_page"
extract_information_from_url(url)


In [None]:
# def parse_legislative_questions(url):
#     # Send a GET request to the URL
#     response = requests.get(url)
    
#     # Check if the request was successful (status code 200)
#     if response.status_code == 200:
#         # Parse the HTML content using BeautifulSoup
#         soup = BeautifulSoup(response.text, 'html.parser')
        
#         # Find the h4 element with the specific text as an anchor
#         # anchor_h4 = soup.find('h4', text='Schriftelijke vragen en antwoorden Nr B125 - Zittingsperiode : 55')
#         anchor_h4 = soup.find('h4', text='Schriftelijke vragen en antwoorden Nr B125 - Zittingsperiode : 55')
        
#         if anchor_h4:
#             # Find the parent element of the anchor_h4 to locate the container
#             container_div = anchor_h4.find_parent('div')
            
#             # Find all table rows (tr) within the container_div
#             legislative_items = container_div.find_all('tr')
            
#             # Iterate through each legislative item and extract information
#             for item in legislative_items:
#                 # Extract reference number (B125) from the link
#                 reference_number = item.find('a', {'href': lambda x: x and 'dossierID' in x}).text.strip()
                
#                 # Extract author, department, title, and date information
#                 author = item.find('td', {'class': 'txt', 'width': '150'}).find_next('td', {'class': 'txt'}).text.strip()
#                 department = item.find('i', text='Departement').find_next('td', {'class': 'txt'}).text.strip()
#                 title = item.find('i', text='Titel').find_next('td', {'class': 'txt'}).text.strip()
#                 date = item.find('i', text='Datum indiening').find_next('td', {'class': 'txt'}).text.strip()
                
#                 # Extract the link to the published answer
#                 answer_link = item.find('a', {'target': '_blank'}).get('href')
                
#                 # Print or store the extracted information
#                 print(f"Reference Number: {reference_number}")
#                 print(f"Author: {author}")
#                 print(f"Department: {department}")
#                 print(f"Title: {title}")
#                 print(f"Date of Submission: {date}")
#                 print(f"Answer Link: {answer_link}")
#                 print("------------------------------")
#         else:
#             print("Error: Anchor h4 not found.")

#     else:
#         print(f"Error: Unable to fetch the page. Status code: {response.status_code}")



In [None]:
# Example usage with a sample URL
parse_legislative_questions(url_b125)

In [None]:
# def main(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')

#     destination_folder = "pdfs"
#     os.makedirs(destination_folder, exist_ok=True)

#     question_data = []

#     for link in soup.find_all('a', href=True):
#         if link['href'].startswith('showpage.cfm?section=qrva&language=nl&cfm=qrvaXml.cfm?legislat='):
#             question_url = f"https://www.dekamer.be/kvvcr/{link['href']}"
#             question_response = requests.get(question_url)
#             question_soup = BeautifulSoup(question_response.text, 'html.parser')
            
#             question_info = extract_question_info(question_soup)
#             question_data.append(question_info)

#     # Create a DataFrame from the extracted data
#     df = pd.DataFrame(question_data)

#     # # Save the DataFrame to a CSV file
#     # df.to_csv('question_data.csv', index=False)


In [None]:
url_b125 = "https://www.dekamer.be/kvvcr/showpage.cfm?&language=nl&cfm=/site/wwwcfm/qrva/qrvatoc.cfm?legislat=55&bulletin=B125"

In [None]:
def download_pdf(pdf_url, destination_folder):
    response = requests.get(pdf_url)
    filename = os.path.join(destination_folder, os.path.basename(pdf_url))
    with open(filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    return filename

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    doc.close()
    return text

def main():
    url = "https://www.dekamer.be/kvvcr/showpage.cfm?section=/qrva&language=nl&cfm=qrvaList.cfm"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    destination_folder = "pdfs"
    os.makedirs(destination_folder, exist_ok=True)

    pdf_data = []

    for link in soup.find_all('a', href=True):
        if link['href'].endswith('.pdf'):
            pdf_url = link['href']
            pdf_path = download_pdf(pdf_url, destination_folder)
            text_content = extract_text_from_pdf(pdf_path)

            pdf_data.append({
                'PDF_URL': pdf_url,
                'TEXT_CONTENT': text_content
            })

    # Create a DataFrame from the extracted data
    df = pd.DataFrame(pdf_data)

    # Save the DataFrame to a CSV file
    df.to_csv('pdf_data.csv', index=False)
