In [1]:
import json
from bs4 import BeautifulSoup
import duckdb as db
import re
import numpy as np
import pandas as pd
from datetime import datetime
import locale
import requests
import os
from tqdm import tqdm

In [2]:
# Open the DuckDB database file 'plenary_minutes.duckdb' in read-write mode.
# `con` is the module-level connection that the functions below execute
# their SQL statements on.
con = db.connect(database='plenary_minutes.duckdb', read_only=False)

# Switch LC_TIME to German so that datetime.strptime's %B directive matches
# German month names (e.g. "Mai") when dates are parsed from document titles.
# NOTE(review): this call fails with locale.Error when the 'de_DE.UTF-8'
# locale is not installed on the host — confirm it is available.
locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')

'de_DE.UTF-8'

In [40]:
def create_tables(reset_db):
    """
    Make sure the ``plenary_minutes`` table exists in the DuckDB database.

    Args:
        reset_db (bool): When truthy, any existing ``plenary_minutes`` table is
            dropped first, so the table is recreated empty.
    """
    drop_statement = "DROP TABLE IF EXISTS plenary_minutes"
    create_statement = """
        CREATE TABLE IF NOT EXISTS plenary_minutes (
            title VARCHAR PRIMARY KEY,
            link VARCHAR,
            sitting INT,
            period INT,
            date DATE,
            description VARCHAR
        )
    """

    # The drop is only queued on request; the create always runs and is a
    # no-op when the table is already there.
    statements = [drop_statement, create_statement] if reset_db else [create_statement]
    for statement in statements:
        con.execute(statement)

    print("Tables created successfully.")

In [41]:
def fill_database(period):
    """
    Load the scraped plenary-minutes metadata of one period into ``plenary_minutes``.

    Reads ``../data/bundestag_open_data_infos/Plenarprotokolle_{period}_wahlperiode.json``
    (expected to be a JSON array of objects with ``title``, ``link`` and
    ``description`` keys) and inserts one row per document through the
    module-level DuckDB connection ``con``.

    Documents whose link is ``"No link found"`` are skipped silently. Any
    document that cannot be processed (missing key, no sitting number or date
    in the title, failing insert, ...) is reported on stdout and skipped, so a
    single bad record never aborts the import.

    The date is parsed with ``%B``, so German month names are only recognised
    while ``LC_TIME`` is set to a German locale.

    Args:
        period (int): The period of the plenary minutes to be saved.

    Returns:
        None

    Raises:
        OSError: If the JSON file for ``period`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = f"../data/bundestag_open_data_infos/Plenarprotokolle_{period}_wahlperiode.json"
    with open(path, 'r', encoding='utf-8') as file:
        documents = json.load(file)

    # Placeholders instead of string interpolation: titles and descriptions are
    # scraped free text and may contain apostrophes or other SQL-significant
    # characters that would break (or inject into) a hand-built statement.
    insert_sql = """
        INSERT INTO plenary_minutes (title, link, sitting, period, date, description)
        VALUES (?, ?, ?, ?, ?, ?)
    """

    for doc in documents:
        try:
            title = doc['title']
            link = doc['link']
            description = doc['description']
            if link == "No link found":
                continue  # Nothing to download later, so the row is useless

            # Extract sitting number (e.g. "... der 12. Sitzung ...")
            sitting_match = re.search(r'der (\d+)\. Sitzung', title)
            if sitting_match is None:
                raise ValueError("No sitting number found in title")
            sitting_number = int(sitting_match.group(1))

            # Fix a missing space between month and year (e.g. "Mai2022" ->
            # "Mai 2022"). The normalised title is also what gets stored.
            title = re.sub(r'([a-zäöüßA-ZÄÖÜ]+)(\d{4})', r'\1 \2', title)

            # Extract the date in German format (e.g. "dem 15. Mai 2022")
            date_match = re.search(r'dem ([\d\.]+\s\w+\s\d{4})', title)
            if date_match is None:
                raise ValueError("No date found in title")
            # Pass a real date object; DuckDB maps it to the DATE column.
            date = datetime.strptime(date_match.group(1), "%d. %B %Y").date()

            con.execute(
                insert_sql,
                [title, link, sitting_number, period, date, description],
            )
        except Exception as e:
            # Deliberate best-effort import: report the record and carry on.
            print(f"Error processing document: {doc}")
            print(f"Error message: {e}")

def build_database(reset_db=False, periods=(19, 20, 21)):
    """
    Create the ``plenary_minutes`` table and load the scraped metadata of each period.

    The stored links are what the XML download step later reads.

    Args:
        reset_db (bool): Forwarded to ``create_tables``; if True the existing
            table is dropped and recreated before loading.
        periods (iterable of int): Periods to load. Defaults to 19, 20 and 21,
            the set that used to be hard-coded here.

    Returns:
        None
    """
    # Create tables
    create_tables(reset_db)

    # Fill database with data for each period
    for period in periods:
        fill_database(period)





In [42]:
def download_xml(link, timeout=30):
    """
    Download the XML file from the given link.

    Args:
        link (str): The URL of the XML file to download.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        str | None: The content of the XML file as a string, or None if the
        request failed (network error, timeout or non-200 status code).
    """
    try:
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport errors like a bad status code so that one broken
        # link does not abort a whole batch of downloads.
        print(f"Failed to download {link}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download {link}. Status code: {response.status_code}")
        return None

    # When the server omits the charset, `response.text` can come out decoded
    # as Latin-1, turning UTF-8 umlauts into "Ã¤"-style mojibake. Decoding the
    # raw bytes as UTF-8 avoids that, and, unlike re-encoding the text
    # afterwards, cannot fail on content that legitimately contains "Ã"/"Â"
    # or characters outside Latin-1.
    try:
        return response.content.decode("utf-8")
    except UnicodeDecodeError:
        # Not UTF-8 after all: fall back to the encoding requests determined.
        return response.text

def save_xml_to_file(xml_content, folder, filename):
    """
    Save the XML content to a UTF-8 encoded file, overwriting any existing file.

    Args:
        xml_content (str): The XML content to save.
        folder (str): The folder where the file will be saved. It is created
            (including parents) if missing; an empty string means the current
            working directory.
        filename (str): The name of the file to save the XML content to.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if folder:
        os.makedirs(folder, exist_ok=True)

    # Explicit encoding: the platform default is not UTF-8 everywhere, which
    # would corrupt (or fail on) umlauts and other non-ASCII characters.
    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as file:
        file.write(xml_content)

def get_xml_files(periods=(19, 20, 21)):
    """
    Download the XML file of every row in ``plenary_minutes`` and store it on disk.

    For each period the rows are read through the module-level connection
    ``con``; each link is fetched with ``download_xml`` and written with
    ``save_xml_to_file`` to
    ``../data/plenary_minutes/wahlperiode_{period}/{period}_{sitting:03}_{date}.xml``.
    Failed downloads are reported on stdout and skipped.

    Args:
        periods (iterable of int): Periods to download. Defaults to 19, 20 and
            21, the set that used to be hard-coded here.

    Returns:
        None
    """
    for period in periods:
        # Only the columns needed for the download; the period is bound as a
        # parameter rather than formatted into the SQL text.
        df = con.execute(
            "SELECT link, sitting, date FROM plenary_minutes WHERE period = ?",
            [period],
        ).fetchdf()

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
            filename = f"{period}_{row['sitting']:03}_{row['date'].date()}.xml"
            xml_content = download_xml(row['link'])
            if xml_content:
                save_xml_to_file(xml_content, f"../data/plenary_minutes/wahlperiode_{period}", filename)
            else:
                print(f"Failed to download {filename} for period {period}.")



In [43]:
# Entry point of the notebook: make sure the table exists and load the scraped
# metadata, then download the XML file behind every stored link.
# reset_db=False keeps an existing plenary_minutes table instead of dropping it.
if __name__ == "__main__":
    build_database(reset_db=False)
    get_xml_files()

Tables created successfully.


Processing rows: 100%|██████████| 239/239 [00:24<00:00,  9.88it/s]
Processing rows: 100%|██████████| 214/214 [00:22<00:00,  9.53it/s]
Processing rows: 100%|██████████| 6/6 [00:00<00:00, 11.89it/s]


In [None]:
 # Ad-hoc check: fetch every stored row whose sitting number is 1.
 con.execute("SELECT * FROM plenary_minutes where sitting = 1").fetchdf()

In [3]:
# Close the DuckDB connection that was opened at the top of the notebook.
con.close()