In [46]:
import xml.etree.ElementTree as ET
import duckdb
import os
from duckdb import ConstraintException
from pathlib import Path
from tqdm import tqdm
from datetime import datetime
import re

In [47]:
# Connect to sql database
con = duckdb.connect(database='german-parliament', read_only=False)

In [48]:
import xml.etree.ElementTree as ET
from pathlib import Path

def parse_xml(file_path: str | Path):
    """
    Parses an XML file, handling potential encoding issues with UTF-8 BOM.

    Args:
        file_path (str | Path): The path to the XML file to be parsed.

    Returns:
        ElementTree.Element: The root element of the parsed XML tree, or None if parsing fails.
    """
    try:
        # Read raw bytes from the file
        data = Path(file_path).read_bytes()
        # Check for UTF-8 BOM (Byte Order Mark), strips it if present
        if data.startswith(b"\xef\xbb\xbf"):
            data = data[3:]
        # Check for wrong encoded BOM, strips it if present
        if data.startswith(b"\xc3\xaf\xc2\xbb\xc2\xbf"):
            data = data[6:]
        # Decode the bytes to a string and parse the XML
        xml_text = data.decode("utf-8")
        return ET.fromstring(xml_text)

    except (ET.ParseError, UnicodeDecodeError) as exc:
        print(f"Error parsing XML file {file_path}: {exc}")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    return None

In [36]:
path = "../data/plenary_minutes/wahlperiode_19/19_001_2017-10-24.xml"
root = parse_xml(path)
# Get metadata
kopfdaten = root.find("vorspann").find("kopfdaten")
# Get period and session no
plenarprotokoll_nummer = kopfdaten.find("plenarprotokoll-nummer")
period = plenarprotokoll_nummer.find("wahlperiode").text
session = plenarprotokoll_nummer.find("sitzungsnr").text
# Get the date
raw_date = kopfdaten.find("veranstaltungsdaten").find('datum').attrib.get("date")
date = datetime.strptime(raw_date, "%d.%m.%Y")
# print(attributes)
print(ET.tostring(verantstaltungsdaten,encoding="unicode"))
print(f"Period: {period}, Session: {session}, Date: {date}")

<veranstaltungsdaten><ort>Berlin</ort>, <datum date="24.10.2017">Dienstag, den 24. Oktober 2017</datum></veranstaltungsdaten>
    
Period: 19, Session: 1, Date: 2017-10-24 00:00:00


In [52]:
tagesordnungspunkte = root.find("sitzungsverlauf").findall("tagesordnungspunkt")
for top in tagesordnungspunkte:
    top_id_text = top.attrib.get("id")
    if top_id_text is None:
        print("No ID found for this Tagesordnungspunkt.")
        continue
    else:
        print(f"Top ID: {top_id_text}")
        match = re.search(r"\d+$", top_id_text)      # last run of digits at the end
        top_id = int(match.group()) if match else None
        print(f"Top ID: {top_id}")

    print(ET.tostring(top, encoding="unicode"))

No ID found for this Tagesordnungspunkt.
No ID found for this Tagesordnungspunkt.
No ID found for this Tagesordnungspunkt.
No ID found for this Tagesordnungspunkt.
No ID found for this Tagesordnungspunkt.
No ID found for this Tagesordnungspunkt.
