The website of the Belgian federal parliament also documents the oral questions that members of parliament ask (both in [commissions](https://www.dekamer.be/kvvcr/showpage.cfm?section=/cricra&language=nl&cfm=dcricra.cfm?type=comm&cricra=cri&count=all) and in the plenary). There, an overview of oral questions and motions is available in pdf, pda and html format. In some cases, only a pdf version is available. However, closer inspection of those cases shows that the pdf merely states that the questions were postponed. So in all cases where questions were actually discussed, there is an html version. 

# Setting up

In [1]:
# Show the output of every expression statement in a cell, not merely that of
# the last line (which is the default behaviour of Jupyter Notebook)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# import os
import requests
from bs4 import BeautifulSoup

import pandas as pd

import re
import pickle

In [3]:
# URL of the page with the overview of all meetings (the commission reports
# linked in the introduction above)
url_oral_questions_all_meetings = "https://www.dekamer.be/kvvcr/showpage.cfm?section=/cricra&language=nl&cfm=dcricra.cfm?type=comm&cricra=cri&count=all"

# Members of parliament

First we obtain a list of all members of parliament and their parties, as available on the [website](https://www.dekamer.be/kvvcr/showpage.cfm?section=/depute&language=nl&cfm=/site/wwwcfm/depute/cvlist54.cfm) of parliament.

In [4]:
# url_members = "https://www.dekamer.be/kvvcr/showpage.cfm?section=/depute&language=nl&cfm=/site/wwwcfm/depute/cvlist54.cfm"

# response = requests.get(url_members)

# if response.status_code == 200:
#     # Parse the HTML string
#     soup = BeautifulSoup(response.text, 'html.parser')
    
#     # Find all <tr> tags
#     rows = soup.find_all('tr')
    
#     # Define regular expressions to extract names and parties
#     name_pattern = re.compile(r'<b>(.*?)</b>')
#     party_pattern = re.compile(r'>(.*?)</A>')
    
#     # Initialize lists to store names and parties
#     names = []
#     parties = []
    
#     # Extract names and parties from each <tr> tag
#     for row in rows:
#         # Find the <b> tag within the <td> tag to get the name
#         name_tag = row.find('b')
#         if name_tag:
#             names.append(name_tag.get_text(strip=True))
        
#         # Find the text within the <td> tag to get the party
#         party_tag = row.find('td', class_='td1' if 'td1' in row['class'] else 'td0')
#         if party_tag:
#             party = party_tag.get_text(strip=True)
#             # Remove any whitespace around the party name
#             party = party.strip()
#             parties.append(party)
    
#     # Combine names and parties into a list of tuples
#     members_parties = list(zip(names, parties))
    
# else:
#     print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [5]:
# members_parties

# Extracting urls of different reports of every meeting

In [6]:
# def scrape_list_reports(url):
#     response = requests.get(url)
    
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')

#         # Find all <a> tags with the title 'Kopieervriendelijke HTML versie'
#         links = soup.find_all('a', title='Kopieervriendelijke HTML versie')
        
#         # Extract URLs from the links and append to main stem to get workable url
#         urls = ["https://www.dekamer.be" + link['href'] for link in links]
        
#         return urls

#     else:
#         print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [7]:
# # Obtain relevant urls
# urls_all_meetings = scrape_list_reports(url_oral_questions_all_meetings)

In [8]:
# # Assess results
# urls_all_meetings[:10]
# urls_all_meetings[-10:]
# len(urls_all_meetings)

In [9]:
# # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # limit amount of pages for testing
# urls_all_meetings = urls_all_meetings[:3]
# urls_all_meetings
# # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Obtain date and commission

In [10]:
# def extract_info_from_html(html_content):
#     soup = BeautifulSoup(html_content, 'html.parser')
#     table_rows = soup.find_all('tr')

#     for row in table_rows:
#         cells = row.find_all('td')
#         if cells:
#             date = cells[2].get_text(strip=True)
#             # link = cells[3].find('a', title="Kopieervriendelijke HTML versie")["href"]
#             html_link = cells[3].find('a', title="Kopieervriendelijke HTML versie")
#             if html_link:
#                 link = html_link['href']
#             else:
#                 link = None
#             committee = cells[5].get_text(strip=True)
#             yield date, link, committee

In [11]:
def extract_info_from_html(html_content):
    """Yield the date, report link and committee of every meeting with an HTML report.

    Parameters
    ----------
    html_content : str or bytes
        Markup of the overview page; passed to BeautifulSoup unchanged.

    Yields
    ------
    tuple
        ``(date, link, committee)``: the stripped text of the third cell, the
        ``href`` of the anchor titled 'Kopieervriendelijke HTML versie' found in
        the fourth cell, and the stripped text of the sixth cell of a table row.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    for row in soup.find_all('tr'):
        cells = row.find_all('td')

        # Cells 2, 3 and 5 are read below, so a row needs at least six cells.
        # Shorter rows cannot be meeting rows; skipping them avoids an
        # IndexError that would abort the whole extraction.
        if len(cells) < 6:
            continue

        html_link = cells[3].find('a', title="Kopieervriendelijke HTML versie")

        # For some meetings there is no html page available, but there is a pdf.
        # However, this pdf simply indicates that the questions were postponed.
        # So then this row can be skipped.
        if html_link is None:
            continue

        date = cells[2].get_text(strip=True)
        link = html_link['href']
        committee = cells[5].get_text(strip=True)
        yield date, link, committee


In [12]:
def extract_info_from_hyperlink(url):
    """Download the overview page at ``url`` and extract its meeting information.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    generator or None
        The generator produced by ``extract_info_from_html`` for the downloaded
        content, or ``None`` when the page could not be retrieved (network
        error, timeout or a status code other than 200).
    """
    try:
        # Without a timeout, requests may wait indefinitely on a stalled server
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        # Treat network failures the same way as a bad status code
        print(f"Failed to fetch HTML content from the provided URL: {error}")
        return None

    if response.status_code != 200:
        print("Failed to fetch HTML content from the provided URL.")
        return None

    return extract_info_from_html(response.content)

In [13]:
# Download the overview page; the result is either a generator of
# (date, relative link, committee) tuples or None when the download failed
info_generator = extract_info_from_hyperlink(url_oral_questions_all_meetings)

if info_generator:
    # One [date, absolute url, committee] entry per meeting; the links on the
    # overview page are relative, so the site root is prepended
    meeting_information_list = [
        [date, "https://www.dekamer.be" + html_url, committee]
        for date, html_url, committee in info_generator
    ]
else:
    meeting_information_list = []
    print("Exiting...")



In [14]:
# meeting_information_list

In [15]:
# for i in info_generator:
#     print(i)

In [16]:

# meeting_information_list = []
# if info_generator:
    
#     meeting_information_list.append([info_generator])



In [17]:
# Turn list into dataframe
# One row per meeting: date, url of the html report and committee
meeting_information_df = pd.DataFrame(
    data=meeting_information_list,
    columns=["Datum", "Url", "Commissie"],
)



In [18]:
# Inspect results: first and last rows
meeting_information_df.head()
meeting_information_df.tail()

# Number of meetings per committee
meeting_information_df["Commissie"].value_counts()

# Inspect date
# NOTE: 'Datum' still holds strings here (dtype object), so min/max compare
# the text alphabetically and do not give the earliest/latest meeting
meeting_information_df["Datum"].min()
meeting_information_df["Datum"].max()
meeting_information_df.dtypes


Unnamed: 0,Datum,Url,Commissie
0,21 februari 2024,https://www.dekamer.be/doc/CCRI/html/55/ic1276...,"Commissie voor Sociale Zaken, Werk en Pensioenen"
1,21 februari 2024,https://www.dekamer.be/doc/CCRI/html/55/ic1275...,Commissie voor Buitenlandse Betrekkingen
2,20 februari 2024,https://www.dekamer.be/doc/CCRI/html/55/ic1274...,"Commissie voor Binnenlandse Zaken, Veiligheid,..."
3,20 februari 2024,https://www.dekamer.be/doc/CCRI/html/55/ic1273...,"Commissie voor Energie, Leefmilieu en Klimaat"
4,20 februari 2024,https://www.dekamer.be/doc/CCRI/html/55/ic1272...,Commissie voor Buitenlandse Betrekkingen


Unnamed: 0,Datum,Url,Commissie
1240,30 augustus 2019,https://www.dekamer.be/doc/CCRI/html/55/ic005x...,Commissie voor Buitenlandse Betrekkingen
1241,18 juli 2019,https://www.dekamer.be/doc/CCRI/html/55/ic004x...,Commissie voor Landsverdediging
1242,17 juli 2019,https://www.dekamer.be/doc/CCRI/html/55/ic003x...,Commissie voor Justitie
1243,16 juli 2019,https://www.dekamer.be/doc/CCRI/html/55/ic002x...,"Commissie voor Binnenlandse Zaken, Veiligheid,..."
1244,15 juli 2019,https://www.dekamer.be/doc/CCRI/html/55/ic001x...,"Commissie voor Energie, Leefmilieu en Klimaat"


Commissie
Commissie voor Binnenlandse Zaken, Veiligheid, Migratie en Bestuurszaken                                                                                                                                           186
Commissie voor Gezondheid en Gelijke kansen                                                                                                                                                                        171
Commissie voor Sociale Zaken, Werk en Pensioenen                                                                                                                                                                   130
Commissie voor Justitie                                                                                                                                                                                            129
Commissie voor Mobiliteit, Overheidsbedrijven en Federale Instellingen                                                            

'1 december 2020'

'9 oktober 2019'

Datum        object
Url          object
Commissie    object
dtype: object

In [19]:
# The dates are plain strings with Dutch month names. pd.to_datetime cannot
# parse those directly (it yields errors), so the month names are first
# translated to English and only then parsed.

# Dutch month name -> English month name
dutch_to_english_month = dict(zip(
    ['januari', 'februari', 'maart', 'april', 'mei', 'juni',
     'juli', 'augustus', 'september', 'oktober', 'november', 'december'],
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
))

# Substitute the month names inside each date string
datum_english = meeting_information_df['Datum'].replace(dutch_to_english_month, regex=True)

# Parse to datetime and keep only the date part (drop the time elements)
meeting_information_df['Datum'] = pd.to_datetime(datum_english, format='%d %B %Y').dt.date

In [20]:
# Inspect the converted dates: first and last rows
meeting_information_df.head()
meeting_information_df.tail()

# 'Datum' now holds date objects, so min/max give the earliest and latest
# meeting; the column dtype is still reported as object
meeting_information_df["Datum"].min()
meeting_information_df["Datum"].max()
meeting_information_df.dtypes

Unnamed: 0,Datum,Url,Commissie
0,2024-02-21,https://www.dekamer.be/doc/CCRI/html/55/ic1276...,"Commissie voor Sociale Zaken, Werk en Pensioenen"
1,2024-02-21,https://www.dekamer.be/doc/CCRI/html/55/ic1275...,Commissie voor Buitenlandse Betrekkingen
2,2024-02-20,https://www.dekamer.be/doc/CCRI/html/55/ic1274...,"Commissie voor Binnenlandse Zaken, Veiligheid,..."
3,2024-02-20,https://www.dekamer.be/doc/CCRI/html/55/ic1273...,"Commissie voor Energie, Leefmilieu en Klimaat"
4,2024-02-20,https://www.dekamer.be/doc/CCRI/html/55/ic1272...,Commissie voor Buitenlandse Betrekkingen


Unnamed: 0,Datum,Url,Commissie
1240,2019-08-30,https://www.dekamer.be/doc/CCRI/html/55/ic005x...,Commissie voor Buitenlandse Betrekkingen
1241,2019-07-18,https://www.dekamer.be/doc/CCRI/html/55/ic004x...,Commissie voor Landsverdediging
1242,2019-07-17,https://www.dekamer.be/doc/CCRI/html/55/ic003x...,Commissie voor Justitie
1243,2019-07-16,https://www.dekamer.be/doc/CCRI/html/55/ic002x...,"Commissie voor Binnenlandse Zaken, Veiligheid,..."
1244,2019-07-15,https://www.dekamer.be/doc/CCRI/html/55/ic001x...,"Commissie voor Energie, Leefmilieu en Klimaat"


datetime.date(2019, 7, 15)

datetime.date(2024, 2, 21)

Datum        object
Url          object
Commissie    object
dtype: object

In [21]:
# # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # limit amount of pages for testing
# meeting_information_df = meeting_information_df[:3]
# meeting_information_df
# # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Check if this matches with indicators on website

In [22]:
# ids = []
# # Iterate over each URL and extract the digits, using regex
# for url in urls_all_meetings:
#     match = re.search(r'ic(\d+)x', url)
#     if match:
#         ids.append(match.group(1))

# # Turn strings to digits
# ids = [int(x) for x in ids]

# # Sort ids from low to high
# ids = sorted(ids)

# # Assess results
# ids[:10]
# ids[-10:]

In [23]:
# ids_ints = [int(x) for x in ids]
# ids_ints[:10]

In [24]:
# urls_all_meetings[0][-10:-6]

In [25]:
# def check_consecutive_digits(lst):
#     for i in range(len(lst) - 1):
#         if lst[i] - 1 != lst[i+1]:
#             print("Irregularity starts at digit:", lst[i+1])
#             return False
#     return True

# # Example usage
# check_consecutive_digits(ids)



# Extracting names of members posing questions

First we create some helper functions to extract the relevant sections out of the reports. 
* All relevant sections containing the name of the author, the relevant minister the question is aimed at and the subject of the question are formatted as headers, i.e. `h2`. Since all reports are bilingual, those headings are always duplicated in the other language (i.e. Dutch and French). Since both entries only represent a single question, we filter on a single language, in this case Dutch. For this we define `extract_h2_sections_from_url()`.
* We also create a function to clean those strings, `clean_h2_sections()`.
* Finally, we create a function to obtain the relevant information from those sections using regex, i.e. the name of the author, the relevant minister the question is aimed at and the subject of the question: `extract_names_from_h2_sections()`.

In [26]:
def extract_h2_sections_from_url(url):
    """Return the text of all Dutch-language ``h2`` headings of a report page.

    Parameters
    ----------
    url : str
        Address of the report to download.

    Returns
    -------
    list of str
        Text of every ``h2`` tag that contains a ``span`` with ``lang='NL'``.
        An empty list is returned when the page could not be retrieved
        (network error, timeout or a status code other than 200).
    """
    try:
        # Without a timeout a single stalled request would block the whole
        # scraping run; network errors are reported and skipped instead
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"Failed to fetch the webpage: {error}")
        return []

    # Check if request was successful
    if response.status_code != 200:
        print("Failed to fetch the webpage.")
        return []

    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Keep only the h2 tags marked as Dutch; the French language h2's are
    # content-wise duplicates
    return [
        tag.get_text(strip=True)
        for tag in soup.find_all('h2')
        if tag.find('span', lang='NL')
    ]

In [27]:
def clean_h2_sections(h2_sections):
    """Normalise heading strings and drop the 'Samengevoegde vragen' headers.

    Parameters
    ----------
    h2_sections : list of str
        Raw heading texts.

    Returns
    -------
    list of str
        Cleaned headings, in the original order: line breaks replaced by a
        space, leading dashes/spaces removed, headings that start with digits
        followed by 'Samengevoegde' dropped, and a leading
        '[digits]Vraag van ' prefix removed.
    """
    h2_sections_cleaned = []

    for string in h2_sections:
        # Replace every kind of line break (CRLF, CR or LF) with a space
        string = re.sub(r'\r\n|\r|\n', ' ', string)

        # Remove the leading dashes and spaces
        string = string.lstrip('- ')

        # Skip headers that only announce a group of merged questions
        if re.match(r'^\d+\s*Samengevoegde', string):
            continue

        # Remove the leading '[digits]Vraag van ' prefix; any number of digits
        # is accepted so that numbers of more than two digits are handled too
        string = re.sub(r'^\d+\s*Vraag van ', '', string)

        h2_sections_cleaned.append(string)

    return h2_sections_cleaned

In [28]:
def extract_names_from_h2_sections(h2_sections):
    """Extract author, minister and subject from cleaned question headings.

    A heading is expected to look like
    ``<author> aan <minister> (<competences>) over "<subject>" ...``.
    Headings that do not follow this shape are ignored.

    Parameters
    ----------
    h2_sections : list of str
        Cleaned heading texts.

    Returns
    -------
    list of list of str
        One ``[author, minister, subject]`` list per matching heading.
    """
    # Names are captured lazily up to the ' aan ' / ' (' separators instead of
    # as exactly two \w+ words, so hyphenated names ("Pierre-Yves Dermagne"),
    # names with an apostrophe ("Roberto D'Amico") and names of more than two
    # words ("Alexander De Croo") are no longer dropped.
    pattern = re.compile(r'(.+?) aan (.+?) \(.*\) over "(.*)"')

    relevant_information = []

    for h2_section in h2_sections:
        match = pattern.match(h2_section)
        if match:
            # append list of 'author', 'minister' and 'subject' to overall list
            relevant_information.append([match.group(1), match.group(2), match.group(3)])

    return relevant_information

Then we iterate over each of the meetings (for which we obtained the urls earlier) and obtain the relevant information.

In [29]:
# relevant_information_oral_all = []

# # Iterate over each meeting
# for url_meeting in meeting_information_df["Url"]:
#     # Extract h2 sections from the webpage of the specific meeting
#     h2_sections = extract_h2_sections_from_url(url_meeting)

#     # Clean relevant sections
#     h2_sections = clean_h2_sections(h2_sections)

#     # Obtain list of author, relevant minister and subject of question
#     relevant_information_oral = extract_names_from_h2_sections(h2_sections)

#     # Add relevant information to overall list
#     relevant_information_oral_all.extend(relevant_information_oral)    

In [None]:
# Collects one [author, minister, subject, date, committee, url] record per question
relevant_information_oral_all = []

# Total number of meetings, used in the progress message
n_meetings = len(meeting_information_df)

# Iterate over each meeting; enumerate provides a progress counter that does
# not depend on the index labels of the dataframe
for page_number, (_, row) in enumerate(meeting_information_df.iterrows(), start=1):
    # Extract h2 sections from the webpage of the specific meeting
    url_meeting = row['Url']
    h2_sections = extract_h2_sections_from_url(url_meeting)

    # Clean relevant sections
    h2_sections = clean_h2_sections(h2_sections)

    # Obtain list of author, relevant minister and subject of question
    relevant_information_oral = extract_names_from_h2_sections(h2_sections)

    # Add the date, the committee and the url of the report to every record.
    # The url is required: the dataframe built from these records has a
    # 'Url publicatie' column, which is later used to create the markdown link.
    for sublist in relevant_information_oral:
        sublist.extend([row['Datum'], row['Commissie'], url_meeting])

    # Add relevant information to overall list
    relevant_information_oral_all.extend(relevant_information_oral)

    print(f"Page {page_number} of {n_meetings} processed.")

print("All pages processed.")

Page 1 of 1245 processed.


In [None]:
# Inspect results: number of questions found, and the first and last ten records
len(relevant_information_oral_all)
relevant_information_oral_all[:10]
relevant_information_oral_all[-10:]

In [None]:
# Turn the list of question records into a dataframe.
# NOTE(review): six column names are given, so every record in
# relevant_information_oral_all must hold six values in this order - verify
# that the loop building the records also stores the url of the report.
oral_questions_df = pd.DataFrame(
    data=relevant_information_oral_all,
    columns=[
        "Parlementslid", "Minister", "Onderwerp",
        "Datum", "Commissie", "Url publicatie",
    ],
)

In [None]:
# Inspect results: display the dataframe of oral questions
oral_questions_df

Then we map the parties of the members to this dataframe, using the earlier created dict. 

In [None]:
# Load the pickled members dictionary (iterated below as party -> list of members).
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files you created yourself or otherwise trust.
with open('../data/members_dict.pkl', 'rb') as file:
    members_dict = pickle.load(file)

# members_dict

In [None]:
# Invert members_dict (party -> list of members) into a flat lookup
# member -> party; a member listed under several parties keeps the last one
member_to_party = {
    member: party
    for party, members_list in members_dict.items()
    for member in members_list
}

# member_to_party

In [None]:
# Look up the party of each member of parliament and store it in a new column
# 'Partij parlementslid'; names that are missing from member_to_party end up
# as NaN (pandas Series.map behaviour)
oral_questions_df['Partij parlementslid'] = oral_questions_df['Parlementslid'].map(member_to_party)

# Inspect results
oral_questions_df.head()

We do the same for the party of the ministers.

In [None]:
# Load the pickled object used below to map a minister to a party.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files you created yourself or otherwise trust.
with open('../data/minister_2_party.pkl', 'rb') as file:
    minister_2_party = pickle.load(file)

# minister_2_party

In [None]:
# Look up the party of each minister and store it in a new column
# 'Partij minister'
oral_questions_df['Partij minister'] = oral_questions_df['Minister'].map(minister_2_party)

# Inspect results
oral_questions_df.head()
oral_questions_df.tail()

## Combine column of subject and url to get clickable markdown link

In [None]:
# Build a markdown-style link from a row
def create_markdown_link(row):
    """Return ``[<Onderwerp>](<Url publicatie>)`` for the given row.

    ``row`` is any mapping-like object that can be indexed with the keys
    'Onderwerp' (link text) and 'Url publicatie' (link target).
    """
    link_text = row['Onderwerp']
    link_target = row['Url publicatie']
    return "[{}]({})".format(link_text, link_target)

In [None]:
# Apply the function row by row (axis=1) to create a new column 'Onderwerp (url)'
oral_questions_df["Onderwerp (url)"]= oral_questions_df.apply(create_markdown_link, axis=1)

# # Drop original columns
# questions_df = questions_df.drop(['Onderwerp'], axis=1)
# questions_df = questions_df.drop(['Url publicatie'], axis=1)

In [None]:
# Inspect results: first rows and the list of column names
# (include check to ensure that names assigned for all posts / competences)
oral_questions_df.head()
oral_questions_df.columns

# Save output

In [None]:
## Save oral_questions_df for later use
# 1. Save as pkl
with open('../data/oral_questions_df.pkl', 'wb') as file:
    pickle.dump(oral_questions_df, file)

# 2. Save as csv (semicolon separated, without the index column)
oral_questions_df.to_csv(path_or_buf = '../data/oral_questions_df.csv',
                               sep = ";",
                               encoding = "utf-16", # to ensure trema's are well handled (e.g. Koen Daniëls)
                               index = False)

# Dump

In [None]:
# def scrape_list_reports(url):
#     response = requests.get(url)
    
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')


        
#         # Define regular expressions for matching Dutch month names and commission names
#         dutch_months_regex = r'(januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)'
#         commission_names_regex = r'(Commissie|Gemeenschappelijke vergadering|Interparlementaire klimaatdialoog)'
        
#         # Find all table rows
#         rows = soup.find_all('tr')

#         relevant_information = []
        
#         # Iterate over each row to extract relevant information
#         for row in rows:
#             # Find the <a> tag with the title 'Kopieervriendelijke HTML versie'
#             link = row.find('a', title='Kopieervriendelijke HTML versie')
#             if link:
#                 # Extract URL
#                 url = link['href']
                
#                 # Extract date using regex
#                 date_match = re.search(dutch_months_regex, row.text)
#                 date = date_match.group() if date_match else None
                
#                 # Extract commission name using regex
#                 commission_match = re.search(commission_names_regex, row.text)
#                 commission_name = commission_match.group() if commission_match else None

#                 relevant_information.append([url, date, commission_name])

#     else:
#         print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [None]:
# # Extract h2 sections from the webpage
# h2_sections = extract_h2_sections_from_url(url_meeting)

# # Inspect results
# h2_sections[:10]

In [None]:
# # Actually clean sections
# h2_sections = clean_h2_sections(h2_sections)

# # Inspect results
# h2_sections[:20]

In [None]:
# # Extract names from h2 sections
# relevant_information_oral = extract_names_from_h2_sections(h2_sections)
# relevant_information_oral

In [None]:
# def extract_names_from_h2_sections(h2_sections):
#     # Regex pattern to extract the relevant names
#     # pattern = r'Question de ([^\s]+) à ([^\s]+) \(.*\)\s'
#     pattern = r'([^\s]+) à ([^\s]+) \(.*\)\s'
#     # pattern = r'Vraag van ([^\s]+) aan ([^\s]+) \(.*\)\s'
    
#     # Extract names from each h2 section
#     names = []
#     for h2_section in h2_sections:
#         match = re.search(pattern, str(h2_section))
#         if match:
#             names.append([match.group(1), match.group(2)])
    
#     return names

In [None]:
# # Extract h2 sections from the webpage
# h2_sections = extract_h2_sections_from_url(url_meeting)
# h2_sections

# # Extract names from h2 sections
# names = extract_names_from_h2_sections(h2_sections)
# names

# # Format the names as required
# formatted_names = [f'["{name[0]}", "{name[1]}"]' for name in names]
# formatted_names

In [None]:
# h2_sections

In [None]:
# def extract_info_oral_questions(h2_sections):
#     '''
#     Input: 
#     '- Sander Loones à Thomas Dermine (Relance, Investissements stratégiques\r\net Politique scientifique) sur "Le plan de relance et les demandes de\r\npaiement européennes" (55040913C)'
#     '- Sander Loones à Thomas Dermine (Relance, Investissements stratégiques\r\net Politique scientifique) sur "La demande de paiement des subventions\r\nRRF" (55040690C)'
#     '- Nabil Boukili aan Pierre-Yves Dermagne (VEM Economie en\r\nWerk) over "De doorvoer van militair materiaal met bestemming Israël"\r\n(55040716C)'
#     '04 Samengevoegde\r\nvragen van'
#     '- Roberto D\'Amico aan Pierre-Yves Dermagne (VEM Economie en\r\nWerk) over "Het rapport betreffende de vergelijking van de\r\nconsumptieprijzen" (55040505C)'
#     '- Sander Loones aan Thomas Dermine (Relance, Strategische\r\nInvesteringen en Wetenschapsbeleid) over "Het oordeel van de Europese\r\nCommissie inzake de relancegelden" (55040910C)'
#     '02 Vraag van Sander\r\nLoones aan Pierre-Yves Dermagne (VEM Economie en Werk) over "De\r\nbekendmaking van en de consensus over voorbereidende wetgevende teksten"\r\n(55040340C)',

    
#     Output: 
#     ["Sander Loones", "Thomas Dermine", ""Le plan de relance et les demandes de\r\npaiement européennes""]
#     ['Sander Loones', 'Thomas Dermine', '"La demande de paiement des subventions\r\nRRF"' ]
#     ['Nabil Boukili', 'Pierre-Yves Dermagne', '"De doorvoer van militair materiaal met bestemming Israël"\r\n']-
#     ['Roberto D\'Amico', 'Pierre-Yves Dermagne',  '"Het rapport betreffende de vergelijking van de\r\nconsumptieprijzen"'']
#     [' Sander Loones', 'Thomas Dermine', '"Het oordeel van de Europese\r\nCommissie inzake de relancegelden"']
#     ['Sander\r\nLoones', 'Pierre-Yves Dermagne', '"De\r\nbekendmaking van en de consensus over voorbereidende wetgevende teksten"']  
#     '''
    
#     # Regex pattern for the first type of input ('Vraag van')
#     pattern1 = r'''
#         '\d+\s+Question\s+de\s+          # Match the beginning of the string with a number followed by 'Question de'
#         (.+?)\s+à\s+(.+?)\s+sur\s+  # Capture group 1: match anything until 'aan', group 2: match anything until 'sur'
#         (.*?)$                         # Capture group 3: match anything until the end of the string
#     '''
    
#     # Regex pattern for the second type of input ('Samengevoegde vraag van -')
#     pattern2 = r'''
#         -\s*                            # Match a dash followed by any number of whitespace characters
#         (.+?)\s+à\s+(.+?)\s+sur\s+    # Capture group 1: match anything until 'à', group 2: match anything until 'sur'
#         (.*?)$                         # Capture group 3: match anything until the end of the string
#     '''

    
#     # Extract data using regex
#     extracted_data = []
#     for h2_section in h2_sections:
#         match1 = re.search(pattern1, h2_section, re.VERBOSE)
#         match2 = re.search(pattern2, h2_section, re.VERBOSE)
#         if match1:
#             extracted_data.append([match1.group(1).strip(), match1.group(2).strip(), match1.group(3).strip()])
#         elif match2:
#             extracted_data.append([match2.group(1).strip(), match2.group(2).strip(), match2.group(3).strip()])

    
#     return extracted_data

In [None]:
# # Extract names from h2 sections
# names = extract_info_oral_questions(h2_sections)
# names

Of the following list of strings, I want to delete some entries. The list contains duplicates (i.e. the same content, once in Dutch and once in French). I only need to keep one language version. To do so, I first group the different entries into sublists, each sublist starting with a digit. Then we can remove the second item of two consecutive sublists with the same digit.

In [None]:
# def group_subquestions

In [None]:
# def clean_second_language_versions():
    


#     # Iterate through the nested list
#     for sublist in nested_list:
#         previous_digit = None
#         for i in range(len(sublist)):
#             # Check if the item starts with a digit
#             if sublist[i][0].isdigit():
#                 current_digit = sublist[i][0]
#                 if previous_digit and current_digit == previous_digit:
#                     # Remove the item if it starts with the same digit as the previous one
#                     del sublist[i]
#                     # Decrement the index to account for the removed item
#                     i -= 1
#                 else:
#                     previous_digit = current_digit
    
#     # Flatten the nested list to get a flat list
#     flat_list = [item for sublist in nested_list for item in sublist]

In [None]:
# nested_list = []
# current_sublist = []

# for h2_section in h2_sections:
#     if h2_section[0].isdigit():
#         # Start a new sublist
#         if current_sublist:
#             nested_list.append(current_sublist)
#         current_sublist = [h2_section]
#     else:
#         # Append to the current sublist
#         current_sublist.append(h2_section)

# # Append the last sublist
# if current_sublist:
#     nested_list.append(current_sublist)

# nested_list

## Using 'oraspr' tags

In [None]:
# # Function to fetch HTML content from a URL
# def fetch_html_content(url):
#     response = requests.get(url)
#     if response.status_code == 200:
#         return response.text
#     else:
#         print("Failed to fetch HTML content from the URL:", url)
#         return None

In [None]:
# # Fetch HTML content from the URL
# html_content = fetch_html_content(url_meeting)

# names_all = []

# if html_content:
#     # Parse HTML content
#     soup = BeautifulSoup(html_content, 'html.parser')

#     # Extract names from elements with class 'oraspr'
#     names = [element.text.strip() for element in soup.find_all(class_='oraspr')]


#     # Remove carriage returns and newline characters (replace by space)
#     # Remove elements from the list if they match the regex pattern (i.e. are an indicator int)
#         # Explanation of the regular expression:
#         # ^         - Asserts the start of the string
#         # \d+\.\d+  - Matches one or more digits followed by a dot and another digit
#         # re.match() function checks if the pattern matches at the beginning of the string
#     cleaned_names = [name.replace('\r\n', ' ').strip() for name in names if not re.match(r'^\d+\.\d+', name)]
    
#     # Remove empty strings
#     cleaned_names = [name for name in cleaned_names if name != '']
    
#     names_all.extend(cleaned_names)

# print(names_all)