In [2]:
import json
from typing import Optional

import requests
from bs4 import BeautifulSoup

In [6]:
def request_data(collection_name:str, offset:int=0, timeout:float=30.0) -> Optional[requests.Response]:
    """
    Request one page of a document collection from the Bundestag website.

    Args:
        collection_name (str): The name of the collection to request data from. (e.g. 866354-866354)
        offset (int): The offset for pagination. Default is 0.
        timeout (float): Seconds to wait for the server before giving up. Default is 30.

    Returns:
        Optional[requests.Response]: The response object if the request succeeded
        (``response.ok`` is true). None if the server answered with an error status
        or the request itself failed (connection error, timeout, ...); a message is
        printed in that case.
    """

    # Construct the URL for the request (the page size is fixed at 10 entries)
    url = f"https://www.bundestag.de/ajax/filterlist/de/services/opendata/{collection_name}?limit=10&noFilterSet=true&offset={offset}"

    # Set the headers for the request so that it looks like the AJAX call issued by the website itself
    # NOTE(review): "br" is advertised in Accept-Encoding; the response can presumably only be
    # decoded if a Brotli package is installed alongside requests/urllib3 - confirm.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
        "Referer": "https://www.bundestag.de/services/opendata",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Mobile Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # Make the request to the Bundestag website. Without a timeout a stalled
    # connection would block forever, so one is always passed.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Transport-level failures are reported the same way as HTTP errors,
        # so callers only ever have to check for None.
        print(f"Failed to fetch data: {error}")
        return None

    if response.ok:
        return response
    else:
        print(f"Failed to fetch data: {response.status_code}")
        return None

def parse_response(response:requests.Response) -> list[dict[str, str]]:
    """
    Extract one record per table row (<tr>) from the HTML returned by the Bundestag website.

    Args:
        response (requests.Response): The response object whose text is parsed as HTML.

    Returns:
        list: One dictionary per <tr> with the keys 'title', 'link' and 'description'.
        Placeholder strings are used when the title or the document link cannot be
        found in a row.
    """
    # CSS selectors, relative to a single table row
    title_selector = 'td[data-th="Titel"] div.bt-documents-description p strong'
    link_selector = 'td[data-th="Titel"] ul.bt-linkliste li a.bt-link-dokument'

    parsed_html = BeautifulSoup(response.text, 'html.parser')
    entries = []

    for table_row in parsed_html.find_all('tr'):
        heading = table_row.select_one(title_selector)
        anchor = table_row.select_one(link_selector)

        # Title of the plenary minute
        if heading:
            title = heading.text.strip()
        else:
            title = "No title found"

        # Link to the XML document plus the anchor text (additional info such as file size)
        if anchor:
            link = anchor['href']
            description = anchor.text.strip()
        else:
            link = "No link found"
            description = "No description"

        entries.append({'title': title, 'link': link, 'description': description})

    return entries

def write_json(data:list[dict[str, str]], filename:str):
    """
    Serialise the given documents as JSON and store them in a file.

    The file is written UTF-8 encoded, with non-ASCII characters kept as-is
    and an indentation of four spaces. A confirmation is printed afterwards.

    Args:
        data (list): List of documents to save.
        filename (str): Name of the file to save the data to.

    Returns:
        None
    """
    target = str(filename)
    with open(target, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

    print(f"Documents successfully saved to {filename}")


def scrape_collection(file_name:str, collection_name:str):
    """
    Page through one collection on the Bundestag website and store every parsed entry as JSON.

    Pages are requested ten entries at a time until a request fails or a page
    yields fewer than ten entries. Whatever has been collected up to that point
    is written to the output file.

    Args:
        file_name (str): The name of the file to save the data to.
        collection_name (str): The name of the collection to scrape data from. (e.g. 866354-866354)

    Returns:
        None
    """
    page_size = 10   # entries per page; also the step of the pagination offset
    collected = []   # entries gathered across all pages
    position = 0     # offset of the page requested next

    while True:
        page = request_data(collection_name, position)
        if page is None:
            # The request failed, stop paging
            break

        entries = parse_response(page)
        collected += entries

        if len(entries) < page_size:
            # A short page means there is nothing left to fetch
            break

        position += page_size

    # Save the data to a json file
    write_json(collected, file_name)


In [8]:
def main():
    """Scrape the plenary minutes of every configured collection from the Bundestag website and save each collection to its own JSON file."""
    # All plenary minutes are stored in collections. The minutes from the previous legislative
    # periods can be found as a zip on the Bundestag website (https://www.bundestag.de/services/opendata)
    collections = (
        (1058442, "Plenarprotokolle_21_wahlperiode"),
        (866354, "Plenarprotokolle_20_wahlperiode"),
        (543410, "Plenarprotokolle_19_wahlperiode"),
    )

    for collection_id, label in collections:
        # The HTTP request to the Bundestag website expects the collection id twice, joined by a dash
        request_name = f"{collection_id}-{collection_id}"
        output_file = f"{label}.json"

        print(f"Scraping collection {output_file}...")
        scrape_collection(output_file, request_name)
        print(f"Finished scraping collection {output_file}.")

# Entry point: run the full scrape when this code executes as the top-level module.
if __name__ == "__main__":
    main()

Scraping collection Plenarprotokolle_21_wahlperiode.json...
Documents successfully saved to Plenarprotokolle_21_wahlperiode.json
Finished scraping collection Plenarprotokolle_21_wahlperiode.json.
Scraping collection Plenarprotokolle_20_wahlperiode.json...
Documents successfully saved to Plenarprotokolle_20_wahlperiode.json
Finished scraping collection Plenarprotokolle_20_wahlperiode.json.
Scraping collection Plenarprotokolle_19_wahlperiode.json...
Documents successfully saved to Plenarprotokolle_19_wahlperiode.json
Finished scraping collection Plenarprotokolle_19_wahlperiode.json.
