# Parse Bundestag Metadata

This notebook implements a custom way to parse the Bundestag metadata. The Open Discourse approach misses some important aspects, so a more tailored approach was needed. The parsing here is related to that approach but adds some important features to capture all data reliably.

## Imports

In [None]:
import os
import xml.etree.ElementTree as ET
from dataclasses import asdict, dataclass
from datetime import datetime

import pandas as pd

## Helper Functions

In [None]:
def _txt(parent: ET.Element | None, tag: str) -> str:
    """Safely fetch the text content of a child tag or return an empty string.

    Args:
        parent (ET.Element | None): The parent XML element.
        tag (str): The tag name to search for.

    Returns:
        str: The text content of the child tag or an empty string.
    """

    if parent is None:
        return ""
    node = parent.find(tag)
    return (node.text or "").strip() if node is not None else ""


def _parse_date(value: str) -> datetime | None:
    """Parse dates of the form 'DD.MM.YYYY' into a datetime.

    Args:
        value (str): The date string to parse.

    Returns:
        datetime | None: The parsed date or None if parsing failed.
    """

    value = (value or "").strip()
    if not value:
        return None
    for fmt in ("%d.%m.%Y",):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None

def _choose_name_variant(namen_el: ET.Element | None) -> ET.Element | None:
    """Pick the most relevant <NAME> variant.

    Args:
        namen_el (ET.Element | None): The <NAMEN> element to search within.

    Returns:
        ET.Element | None: The chosen <NAME> element or None if none found.
    """

    if namen_el is None:
        return None
    variants = namen_el.findall("NAME")
    if not variants:
        return None

    # 1) Prefer empty HISTORIE_BIS
    current = [v for v in variants if not _txt(v, "HISTORIE_BIS")]
    if current:
        # If multiple, choose latest HISTORIE_VON
        def key_fn(v: ET.Element):
            return _parse_date(_txt(v, "HISTORIE_VON")) or datetime.min

        return sorted(current, key=key_fn)[-1]

    # 2) Otherwise choose latest HISTORIE_VON overall
    def key_fn_any(v: ET.Element):
        return _parse_date(_txt(v, "HISTORIE_VON")) or datetime.min

    return sorted(variants, key=key_fn_any)[-1]Í

## Helper Classes

In [None]:
@dataclass
class Member:
    """Flat record describing one member, one instance per <MDB> element.

    String fields hold stripped tag text and are empty strings when the
    corresponding tag is missing. Date fields are None when the source text
    is empty or not a valid 'DD.MM.YYYY' date. The field order determines
    the column order of the resulting DataFrame / CSV.
    """

    # Identifier read from the <ID> child of the <MDB> element.
    id: str
    # Name parts taken from the selected <NAME> variant.
    last_name: str
    first_name: str
    academic_title: str
    # Biographical details read from <BIOGRAFISCHE_ANGABEN>.
    birth_date: datetime | None
    birth_place: str
    birth_country: str
    death_date: datetime | None
    gender: str
    marital_status: str
    # NOTE(review): the name suggests a nobility title — confirm which XML
    # tag the parsing cell actually reads for this field.
    aristocracy: str
    profession: str
    # Short party label (PARTEI_KURZ).
    party_short: str


@dataclass
class Term:
    """Flat record for one <WAHLPERIODE> entry of a member.

    String fields hold stripped tag text and are empty strings when the
    corresponding tag is missing. Date fields are None when the source text
    is empty or not a valid 'DD.MM.YYYY' date. The field order determines
    the column order of the resulting DataFrame / CSV.
    """

    # ID of the <MDB> element this entry belongs to (matches Member.id).
    member_id: str
    # Numeric value of the WP tag; None when the text is not purely digits.
    wp: int | None
    # Values of MDBWP_VON / MDBWP_BIS — presumably start and end of the
    # membership within this period; TODO confirm against the data docs.
    mdbwp_von: datetime | None
    mdbwp_bis: datetime | None
    # WKR_* tags — presumably constituency number, name and state; verify.
    wkr_nummer: str
    wkr_name: str
    wkr_land: str
    # Raw text of the LISTE and MANDATSART tags.
    liste: str
    mandatsart: str

## Parse

In [None]:
# Location of the master-data XML file to parse, relative to the current
# working directory.
XML_PATH = os.path.join("data", "raw", "MDB_STAMMDATEN.XML")

In [None]:
# Load the XML document; every <MDB> child of the root describes one member.
tree = ET.parse(XML_PATH)
root = tree.getroot()

# create list for members and electoral terms
members: list[Member] = []
terms: list[Term] = []

# iterate over MDB elements to parse member and term data
for mdb in root.findall("MDB"):
    member_id = _txt(mdb, "ID")

    # A member can carry several <NAME> variants; use the most relevant one.
    namen = _choose_name_variant(mdb.find("NAMEN"))
    biog = mdb.find("BIOGRAFISCHE_ANGABEN")

    member = Member(
        id=member_id,
        last_name=_txt(namen, "NACHNAME"),
        first_name=_txt(namen, "VORNAME"),
        academic_title=_txt(namen, "AKAD_TITEL"),
        birth_date=_parse_date(_txt(biog, "GEBURTSDATUM")),
        birth_place=_txt(biog, "GEBURTSORT"),
        birth_country=_txt(biog, "GEBURTSLAND"),
        death_date=_parse_date(_txt(biog, "STERBEDATUM")),
        gender=_txt(biog, "GESCHLECHT"),
        marital_status=_txt(biog, "FAMILIENSTAND"),
        # The nobility title is part of the name (ADEL on the <NAME>
        # variant). Reading RELIGION from the biography block here would
        # store the denomination under the `aristocracy` column.
        aristocracy=_txt(namen, "ADEL"),
        profession=_txt(biog, "BERUF"),
        party_short=_txt(biog, "PARTEI_KURZ"),
    )
    members.append(member)

    wps = mdb.find("WAHLPERIODEN")
    if wps is None:
        # Member without any electoral-period entries: nothing more to add.
        continue

    for wp in wps.findall("WAHLPERIODE"):
        # Read the period number once; isdecimal() only accepts characters
        # that int() can convert (isdigit() also accepts e.g. superscripts).
        wp_text = _txt(wp, "WP")
        terms.append(
            Term(
                member_id=member_id,
                wp=int(wp_text) if wp_text.isdecimal() else None,
                mdbwp_von=_parse_date(_txt(wp, "MDBWP_VON")),
                mdbwp_bis=_parse_date(_txt(wp, "MDBWP_BIS")),
                wkr_nummer=_txt(wp, "WKR_NUMMER"),
                wkr_name=_txt(wp, "WKR_NAME"),
                wkr_land=_txt(wp, "WKR_LAND"),
                liste=_txt(wp, "LISTE"),
                mandatsart=_txt(wp, "MANDATSART"),
            )
        )

# convert to DataFrames
members_df = pd.DataFrame([asdict(m) for m in members])
terms_df = pd.DataFrame([asdict(t) for t in terms])

# convert datetime to date for nicer display
for df in (members_df, terms_df):
    for col in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.date

In [None]:
# save dataframes
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("data/processed", exist_ok=True)

members_df.to_csv("data/processed/members.csv", index=False)

terms_df.to_csv("data/processed/terms.csv", index=False)