In [4]:
# load requirements
import pandas as pd
import xml.etree.ElementTree as ET

In [5]:
# Read the raw MDB_STAMMDATEN XML export from disk.
# `tree` is the parsed document; `root` is its top-level element, which the later cells search.
tree = ET.parse(source='../data/raw_data/MDB_STAMMDATEN.XML')
root = tree.getroot()

In [8]:
# Set comprehension collecting every election-period number (WP) that occurs in the file;
# the largest one identifies the latest election period.
# A WAHLPERIODE whose WP is missing, empty or non-numeric is skipped instead of crashing int().
all_phases = {
    int(wp_text)
    for wp_text in (
        wp.findtext('WP')
        for mdb in root.findall('MDB')
        for wp in mdb.findall('.//WAHLPERIODE')
    )
    if wp_text and wp_text.strip().isdecimal()
}

# max() on an empty set raises an uninformative ValueError; fail with a message that names the cause
if not all_phases:
    raise ValueError("no usable WAHLPERIODE/WP entries found; cannot determine the latest election period")
latest_wp = max(all_phases)

In [19]:
# Build the MP table: one row per member (MDB element) for each of their
# WAHLPERIODE entries whose WP equals `latest_wp`.

# output columns, fixed up front so the frame keeps its shape even when no member matches
MP_COLUMNS = [
    "id", "full_name", "birthdate", "gender", "party",
    "profession", "district_number", "district_name", "mandate",
]


def _child_text(parent, tag):
    """Return the text of child `tag` under `parent`, or None when `parent` itself is missing."""
    return parent.findtext(tag) if parent is not None else None


# list of per-row dicts; converted to a DataFrame once at the end
data = []

# now loop over all members and store relevant information
for mdb in root.findall("MDB"):

    # extract general information about the mp
    mdb_id = mdb.findtext("ID")

    # NOTE(review): find() returns the first NAME in document order; if a member can carry
    # several NAME entries, confirm the first one is the name wanted here.
    name_element = mdb.find(".//NAME")
    name_parts = [
        _child_text(name_element, tag)
        for tag in ("AKAD_TITEL", "VORNAME", "NACHNAME")
    ]
    # join only the parts that are present: an absent element yields None, which an f-string
    # would render as the literal text "None" inside the name
    full_name = " ".join(part.strip() for part in name_parts if part and part.strip())

    # a member without BIOGRAFISCHE_ANGABEN gets None for these fields instead of raising
    bio_element = mdb.find("BIOGRAFISCHE_ANGABEN")
    birthdate = _child_text(bio_element, "GEBURTSDATUM")
    gender = _child_text(bio_element, "GESCHLECHT")
    party = _child_text(bio_element, "PARTEI_KURZ")
    profession = _child_text(bio_element, "BERUF")

    # check for participation in latest election period and append data if fits
    for wp in mdb.findall(".//WAHLPERIODE"):
        wp_text = wp.findtext("WP")
        # skip entries whose WP is missing, empty or non-numeric rather than crashing int()
        if not (wp_text and wp_text.strip().isdecimal()):
            continue
        if int(wp_text) != latest_wp:
            continue

        # extract information about the specific election
        data.append({
            "id": mdb_id,
            "full_name": full_name,
            "birthdate": birthdate,
            "gender": gender,
            "party": party,
            "profession": profession,
            "district_number": wp.findtext("WKR_NUMMER"),
            "district_name": wp.findtext("WKR_NAME"),
            "mandate": wp.findtext("MANDATSART"),
        })

# convert this to a df and show the first rows
# (explicit columns keep sort_values from raising KeyError when `data` is empty)
mps_df = pd.DataFrame(data, columns=MP_COLUMNS).sort_values(by=["party", "full_name"])
mps_df.head()

Unnamed: 0,id,full_name,birthdate,gender,party,profession,district_number,district_name,mandate
512,11005504,Achim Köhler,22.03.1964,männlich,AfD,,277.0,Rhein-Neckar,Landesliste
416,11005408,Adam Balten,09.11.1983,männlich,AfD,Mechatronik-Ingenieur,112.0,Wesel I,Landesliste
412,11005404,Alexander Arpaschi,24.05.1970,männlich,AfD,,273.0,Rastatt,Landesliste
629,11005622,Alexis L. Giersch,25.09.1963,männlich,AfD,,,,Landesliste
169,11004674,Andreas Bleck,17.04.1988,männlich,AfD,Büroleiter,196.0,Neuwied,Landesliste


In [21]:
# Write the MP table to the clean-data folder as CSV.
# NOTE(review): the DataFrame index is written as an unnamed first column (pandas default,
# spelled out here); confirm downstream readers expect that column before dropping it.
mps_df.to_csv(path_or_buf="../data/clean_data/bundestag_mps.csv", index=True)