# Setting up

In [1]:
# show all outputs of cell, not merely of last line (i.e. default of Jupyter Notebook)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# import os
import requests
from bs4 import BeautifulSoup

import pandas as pd

import re
import pickle

from PyPDF2 import PdfReader
import wget

In [3]:
# Define url to pdf on composition of parliament
url_fractions = "https://www.dekamer.be/doc/FLWB/PDF/55/0003/55K0003029.pdf"

# location to store pdf
download_location_pdf = "../data/members_fractions.pdf"
download_location_txt_original = "../data/members_fractions(original).txt"
download_location_txt_modified = "../data/members_fractions(modified).txt"

In [4]:
def download_pdf(url, destination):
    # Download the PDF file from the URL
    filename = wget.download(url, out=destination)
    return filename

In [5]:
path_pdf_members = download_pdf(url_fractions, download_location_pdf)
path_pdf_members

'../data/members_fractions (1).pdf'

In [6]:
def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        for page_number in range(num_pages):
            page = reader.pages[page_number]
            text += page.extract_text()
    return text

In [7]:
# Example usage
pdf_text = read_pdf(path_pdf_members)


# Write the text to a text file
with open(download_location_txt_original, "w") as output_file:
    output_file.write(pdf_text)


10960

We then store the string in a text file and manually remove the big chunks of text that are irrelevant.  Subsequently we read in the modified file and do some cleaning. 

In [8]:
# Write the text to a text file
with open(download_location_txt_modified, "r") as file:
    members_txt = file.read()

In [9]:
# Inspect results
members_txt

"N-VA\n24 membres - 24 leden\nPrésident M. - De heer Peter De Roover Voorzitter\nAnseeuw  (Björn-R.F.G.) N West-Vlaanderen\nBuysrogge  (Peter-A.M.) N Oost-Vlaanderen\nClaes (Mieke) N Limburg\nDepoorter  (Kathleen-M.L.) N Oost-Vlaanderen\nDe Roover  (Peter-M.A.) N Antwerpen\nDe Wit (Sophie-M.R.) N Antwerpen\nD'Haese (Christoph-E.) N Oost-Vlaanderen\nFrancken  (Theo-J.E.) N Vlaams-Brabant\nFreilich (Michael) N Antwerpen\nGijbels (Frieda) N Limburg\nGoethals  (Sigrid-H.) N Vlaams-Brabant\nHoutmeyers  (Katrien) N Vlaams-Brabant\nIngels (Yngvild-I.Y.) N West-Vlaanderen\nLoones (Sander-C.) N West-Vlaanderen\nMetsu (Koen) N Antwerpen\nRaskin (Wouter) N Limburg\nRoggeman  (Tomas-J.S.N.) N Oost-Vlaanderen\nSafai (Darya) N Vlaams-Brabant\nVan Bossuyt  (Anneleen-C.) N Oost-Vlaanderen\nVan Camp  (Yoleen-P.M.) N Antwerpen\nVan der Donckt  (Wim) (*) N Antwerpen\nVan Peel (Valerie-G.K.) N Antwerpen\nVan Vaerenbergh  (Kristien) N Vlaams-Brabant2DOC 55 0003/029\nWollants (Bert) N Antwerpen\nEcolo-Groen

Then we clean up this data:
* There appears to be a footnote that is inserted after the text of each page. So we remove this.
* When extracting and rearranging the first and last name of the members of parliament, splitting at dashes is convenient. However, many of the names of the provinces contain dashes, so we remove them.  

In [10]:
# # remove footnote
# members_txt = members_txt.replace('DOC 55 0003/029', "")

# # # Inspect results
# # print(members_txt)

In [11]:
# # Remove provinces and language indicators
# elements_to_delete = [
#     # Provinces
#     "West-Vlaanderen", "Oost-Vlaanderen", "Antwerpen", "Limburg", "Vlaams-Brabant",
#     "Liège", "Namur", "Luxembourg", "Hainaut", "Brabant Wallon",
#     "Bruxelles-Capitale", "Brussel-Hoofdstad",
#     # Language indicators
#     ' N ', ' F '
# ]

# # Iterate through each string in the list
# for i in range(len(lines)):
#     # Iterate through each part to remove
#     for part in elements_to_delete:
#         # Remove the part if it occurs in the string
#         lines[i] = lines[i].replace(part, "")

# print(lines)

# # for element in elements_to_delete:
# #     members_txt.replace(element, "")
# # print(members_txt)

The lines relating tot he parties, can be relevant to structure the members in dictionaries with the parties as keys, but other elements are not relevant. Hence, they can be removed. 

In [12]:

elements_to_skip = ["NOMS", "Membres de la Chambre n'appartenant à aucun groupe politique", 
               "Leden van de Kamer die tot geen enkele politieke fractie behoren",
               "Président"]

# Initialize an empty list to store lines
lines = []

# # Open the file in read mode
# with open(download_location_txt_modified, "r") as file:
#     # Read the file line by line
#     for line in file:
#         # Check if the line starts with any elements to skip
#         if not any(line.strip().startswith(prefix) for prefix in elements_to_skip):
#             # If the line doesn't start with any elements to skip, add it to the list
#             lines.append(line.strip())

# Open the file in read mode
with open(download_location_txt_modified, "r") as file:
    # Read the file line by line
    for line in file:
        # Check if the line starts with a digit or with elements in the pre-specified list
        if not (line.strip().startswith(('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')) or
                any(line.strip().startswith(prefix) for prefix in elements_to_skip)):
            # If the line doesn't start with a digit or with elements in the pre-specified list, add it to the list
            lines.append(line.strip())

# Inspect results
lines

['N-VA',
 'Anseeuw  (Björn-R.F.G.) N West-Vlaanderen',
 'Buysrogge  (Peter-A.M.) N Oost-Vlaanderen',
 'Claes (Mieke) N Limburg',
 'Depoorter  (Kathleen-M.L.) N Oost-Vlaanderen',
 'De Roover  (Peter-M.A.) N Antwerpen',
 'De Wit (Sophie-M.R.) N Antwerpen',
 "D'Haese (Christoph-E.) N Oost-Vlaanderen",
 'Francken  (Theo-J.E.) N Vlaams-Brabant',
 'Freilich (Michael) N Antwerpen',
 'Gijbels (Frieda) N Limburg',
 'Goethals  (Sigrid-H.) N Vlaams-Brabant',
 'Houtmeyers  (Katrien) N Vlaams-Brabant',
 'Ingels (Yngvild-I.Y.) N West-Vlaanderen',
 'Loones (Sander-C.) N West-Vlaanderen',
 'Metsu (Koen) N Antwerpen',
 'Raskin (Wouter) N Limburg',
 'Roggeman  (Tomas-J.S.N.) N Oost-Vlaanderen',
 'Safai (Darya) N Vlaams-Brabant',
 'Van Bossuyt  (Anneleen-C.) N Oost-Vlaanderen',
 'Van Camp  (Yoleen-P.M.) N Antwerpen',
 'Van der Donckt  (Wim) (*) N Antwerpen',
 'Van Peel (Valerie-G.K.) N Antwerpen',
 'Van Vaerenbergh  (Kristien) N Vlaams-Brabant2DOC 55 0003/029',
 'Wollants (Bert) N Antwerpen',
 'Ecolo-Gro

In [13]:
# There seem to be 2 mentionings of the key 'INDEPENDANTS - ONAFHANKELIJKE', which will cause errors, so we remove the latter one.  
del lines[-2]
lines[-5:]

['Rohonyi (Sophie-C.Z.)F',
 'F Bruxelles-Capitale',
 'INDEPENDANTS - ONAFHANKELIJKE',
 'Dedecker (Jean-Marie-L.) N West-Vlaanderen',
 'Kir (Emir-E.) F Bruxelles-CapitaleDOC 55 0003/0297']

In [14]:
# parties = ['N-VA', 'Ecolo-Groen', 'PS', 'VB', 'MR', 'cd&v', 'PVDA-PTB', 
#            'Open Vld', 'Vooruit', 'Les Engagés', 'DéFI', 
#            'INDEPENDANTS - ONAFHANKELIJKE']

In [15]:
# elements_to_delete = [
#     # Provinces
#     "West-Vlaanderen", "Oost-Vlaanderen", "Antwerpen", "Limburg", "Vlaams-Brabant",
#     "Liège", "Namur", "Luxembourg", "Hainaut", "Brabant Wallon",
#     "Bruxelles-Capitale", "Brussel-Hoofdstad",
#     # Language roll
#     "N ", "F ",

#     # other
#     "Voorzitster", "Présidente"
    
                      
#                      ]

Then we remove the provinces, since many of them contain dashes that would interfere with the name splitting later on. 

In [16]:
elements_to_delete = [
    # Provinces
    "West-Vlaanderen", "Oost-Vlaanderen", "Antwerpen", "Limburg", "Vlaams-Brabant",
    "Liège", "Namur", "Luxembourg", "Hainaut", "Brabant Wallon",
    "Bruxelles-Capitale", "Brussel-Hoofdstad",
    # Language indicators
    ' N ', ' F '
]

# Iterate through each string in the list
for i in range(len(lines)):
    # Iterate through each part to remove
    for part in elements_to_delete:
        # Remove the part if it occurs in the string
        lines[i] = lines[i].replace(part, "")

print(lines)

['N-VA', 'Anseeuw  (Björn-R.F.G.)', 'Buysrogge  (Peter-A.M.)', 'Claes (Mieke)', 'Depoorter  (Kathleen-M.L.)', 'De Roover  (Peter-M.A.)', 'De Wit (Sophie-M.R.)', "D'Haese (Christoph-E.)", 'Francken  (Theo-J.E.)', 'Freilich (Michael)', 'Gijbels (Frieda)', 'Goethals  (Sigrid-H.)', 'Houtmeyers  (Katrien)', 'Ingels (Yngvild-I.Y.)', 'Loones (Sander-C.)', 'Metsu (Koen)', 'Raskin (Wouter)', 'Roggeman  (Tomas-J.S.N.)', 'Safai (Darya)', 'Van Bossuyt  (Anneleen-C.)', 'Van Camp  (Yoleen-P.M.)', 'Van der Donckt  (Wim) (*)', 'Van Peel (Valerie-G.K.)', 'Van Vaerenbergh  (Kristien)2DOC 55 0003/029', 'Wollants (Bert)', 'Ecolo-Groen', 'Buyst (Kim)', 'Calvo (Kristof)', 'Chanson  (Julie-C.J.G.)', 'Cogolati (Samuel-M.M.-L.)', 'Cornet (Cécile-M.) (*)', 'Creemers  (Barbara)', 'Defossé (Guillaume-R.V.G.) (*)', 'de Laveleye  (Séverine-F.)', 'De Vriendt  (Wouter-J.A.)', 'Hennuy (Laurence-C.)', 'Hugon (Claire-A.M.M.) (*)', 'Mariage (Louis-C.S.)', 'Moutquin  (Simon-F)', 'Pisman (Kathleen-M.C.)', 'Platteau (Eva-A.

In [17]:
# for line in lines:
#     lines

Then we create dictionaries using the parties as keys. 

In [18]:
parties = ['N-VA', 'Ecolo-Groen', 'PS', 'VB', 'MR', 'cd&v', 'PVDA-PTB', 
           'Open Vld', 'Vooruit', 'Les Engagés', 'DéFI', 
           'INDEPENDANTS - ONAFHANKELIJKE']

members_dict = {}
current_key = None

for item in lines:
    if item in parties:
        current_key = item
        members_dict[current_key] = []
    else:
        members_dict[current_key].append(item)

print(members_dict)
members_dict.keys()

{'N-VA': ['Anseeuw  (Björn-R.F.G.)', 'Buysrogge  (Peter-A.M.)', 'Claes (Mieke)', 'Depoorter  (Kathleen-M.L.)', 'De Roover  (Peter-M.A.)', 'De Wit (Sophie-M.R.)', "D'Haese (Christoph-E.)", 'Francken  (Theo-J.E.)', 'Freilich (Michael)', 'Gijbels (Frieda)', 'Goethals  (Sigrid-H.)', 'Houtmeyers  (Katrien)', 'Ingels (Yngvild-I.Y.)', 'Loones (Sander-C.)', 'Metsu (Koen)', 'Raskin (Wouter)', 'Roggeman  (Tomas-J.S.N.)', 'Safai (Darya)', 'Van Bossuyt  (Anneleen-C.)', 'Van Camp  (Yoleen-P.M.)', 'Van der Donckt  (Wim) (*)', 'Van Peel (Valerie-G.K.)', 'Van Vaerenbergh  (Kristien)2DOC 55 0003/029', 'Wollants (Bert)'], 'Ecolo-Groen': ['Buyst (Kim)', 'Calvo (Kristof)', 'Chanson  (Julie-C.J.G.)', 'Cogolati (Samuel-M.M.-L.)', 'Cornet (Cécile-M.) (*)', 'Creemers  (Barbara)', 'Defossé (Guillaume-R.V.G.) (*)', 'de Laveleye  (Séverine-F.)', 'De Vriendt  (Wouter-J.A.)', 'Hennuy (Laurence-C.)', 'Hugon (Claire-A.M.M.) (*)', 'Mariage (Louis-C.S.)', 'Moutquin  (Simon-F)', 'Pisman (Kathleen-M.C.)', 'Platteau (Eva

dict_keys(['N-VA', 'Ecolo-Groen', 'PS', 'VB', 'MR', 'cd&v', 'PVDA-PTB', 'Open Vld', 'Vooruit', 'Les Engagés', 'DéFI', 'INDEPENDANTS - ONAFHANKELIJKE'])

Subsequently, we extract the first and last names of the members, and arrange them accordingly. 

In [19]:
# Extract names
for key in members_dict.keys():
    
    rearranged_data = []
    
    for item in members_dict[key]:
        # print(item)
        # Check if there is a dash in the text to indicate the initials of the second and or third names of the members 
        if '-' in item:
            # Split the text at a dash followed by either one letter and a dot or two letters and a dot
            parts = re.split(r'-[A-Z]\.|-[A-Z]{2}\.', item)
        else:
            # Split the text at ')'
            parts = item.split(')')
        
        # # Split at '(' and '-'
        # # parts = item.split('-')[0].split('(')
        # # Split the text at a dash followed by either one letter and a dot or two letters and a dot
        # parts = re.split(r'-[A-Z]\.|-[A-Z]{2}\.', item)
        # print(parts)
        # Split the splitted part including the names at the bracket
        parts = parts[0].split("(")
        # print(parts)
        # Rearrange and strip each part, then join with a space
        rearranged_item = ' '.join(part.strip() for part in reversed(parts))
        rearranged_data.append(rearranged_item)

    members_dict[key] = rearranged_data

Then we assess the results and perform some manual corrections. 

In [20]:
# Assess results
members_dict

# Calculate the accumulated length using a list comprehension and sum()
sum(len(value) for value in members_dict.values())

{'N-VA': ['Björn Anseeuw',
  'Peter Buysrogge',
  'Mieke Claes',
  'Kathleen Depoorter',
  'Peter De Roover',
  'Sophie De Wit',
  "Christoph D'Haese",
  'Theo Francken',
  'Michael Freilich',
  'Frieda Gijbels',
  'Sigrid Goethals',
  'Katrien Houtmeyers',
  'Yngvild Ingels',
  'Sander Loones',
  'Koen Metsu',
  'Wouter Raskin',
  'Tomas Roggeman',
  'Darya Safai',
  'Anneleen Van Bossuyt',
  'Yoleen Van Camp',
  'Wim Van der Donckt',
  'Valerie Van Peel',
  'Kristien Van Vaerenbergh',
  'Bert Wollants'],
 'Ecolo-Groen': ['Kim Buyst',
  'Kristof Calvo',
  'Julie Chanson',
  'Samuel Cogolati',
  'Cécile Cornet',
  'Barbara Creemers',
  'Guillaume Defossé',
  'Séverine de Laveleye',
  'Wouter De Vriendt',
  'Laurence Hennuy',
  'Claire Hugon',
  'Louis Mariage',
  'Simon-F) Moutquin',
  'Kathleen Pisman',
  'Eva Platteau',
  'Sarah Schlitz',
  'Olivier Vajda',
  'Dieter Vanbesien',
  'Gilles Vanden Burre',
  'Stefaan Van Hecke',
  'Albert Vicaire'],
 'PS': ['Khalil Aouasti',
  'Hugues B

151

In [21]:
# Modify results manually
index = members_dict['Ecolo-Groen'].index('Simon-F) Moutquin')
members_dict['Ecolo-Groen'][index] = 'Simon Moutquin'

index = members_dict['cd&v'].index('Leen-Ch.R.) Dierick')
members_dict['cd&v'][index] = 'Leen Dierick'

members_dict['DéFI'].remove('F')

In [22]:
# Assess results
members_dict

# Calculate the accumulated length using a list comprehension and sum()
sum(len(value) for value in members_dict.values())

{'N-VA': ['Björn Anseeuw',
  'Peter Buysrogge',
  'Mieke Claes',
  'Kathleen Depoorter',
  'Peter De Roover',
  'Sophie De Wit',
  "Christoph D'Haese",
  'Theo Francken',
  'Michael Freilich',
  'Frieda Gijbels',
  'Sigrid Goethals',
  'Katrien Houtmeyers',
  'Yngvild Ingels',
  'Sander Loones',
  'Koen Metsu',
  'Wouter Raskin',
  'Tomas Roggeman',
  'Darya Safai',
  'Anneleen Van Bossuyt',
  'Yoleen Van Camp',
  'Wim Van der Donckt',
  'Valerie Van Peel',
  'Kristien Van Vaerenbergh',
  'Bert Wollants'],
 'Ecolo-Groen': ['Kim Buyst',
  'Kristof Calvo',
  'Julie Chanson',
  'Samuel Cogolati',
  'Cécile Cornet',
  'Barbara Creemers',
  'Guillaume Defossé',
  'Séverine de Laveleye',
  'Wouter De Vriendt',
  'Laurence Hennuy',
  'Claire Hugon',
  'Louis Mariage',
  'Simon Moutquin',
  'Kathleen Pisman',
  'Eva Platteau',
  'Sarah Schlitz',
  'Olivier Vajda',
  'Dieter Vanbesien',
  'Gilles Vanden Burre',
  'Stefaan Van Hecke',
  'Albert Vicaire'],
 'PS': ['Khalil Aouasti',
  'Hugues Baye

150

In [26]:
# Modify the name of some of parties to ease further handling
members_dict['Vlaams Belang'] = members_dict.pop('VB')
members_dict['Onafhankelijk'] = members_dict.pop('INDEPENDANTS - ONAFHANKELIJKE')

In [27]:
# Assess results
members_dict

{'N-VA': ['Björn Anseeuw',
  'Peter Buysrogge',
  'Mieke Claes',
  'Kathleen Depoorter',
  'Peter De Roover',
  'Sophie De Wit',
  "Christoph D'Haese",
  'Theo Francken',
  'Michael Freilich',
  'Frieda Gijbels',
  'Sigrid Goethals',
  'Katrien Houtmeyers',
  'Yngvild Ingels',
  'Sander Loones',
  'Koen Metsu',
  'Wouter Raskin',
  'Tomas Roggeman',
  'Darya Safai',
  'Anneleen Van Bossuyt',
  'Yoleen Van Camp',
  'Wim Van der Donckt',
  'Valerie Van Peel',
  'Kristien Van Vaerenbergh',
  'Bert Wollants'],
 'Ecolo-Groen': ['Kim Buyst',
  'Kristof Calvo',
  'Julie Chanson',
  'Samuel Cogolati',
  'Cécile Cornet',
  'Barbara Creemers',
  'Guillaume Defossé',
  'Séverine de Laveleye',
  'Wouter De Vriendt',
  'Laurence Hennuy',
  'Claire Hugon',
  'Louis Mariage',
  'Simon Moutquin',
  'Kathleen Pisman',
  'Eva Platteau',
  'Sarah Schlitz',
  'Olivier Vajda',
  'Dieter Vanbesien',
  'Gilles Vanden Burre',
  'Stefaan Van Hecke',
  'Albert Vicaire'],
 'PS': ['Khalil Aouasti',
  'Hugues Baye

In [28]:
## Save members_dict for later use
# 1. Save as pkl
with open('../data/members_dict.pkl', 'wb') as file:
    pickle.dump(members_dict, file)
