In [1]:
# load requirements
import pandas as pd
from datetime import datetime
import xml.etree.ElementTree as ET
from atproto import Client

In [2]:
# load the XML document from disk and keep both the tree and its root element
xml_path = '../data/raw_data/MDB_STAMMDATEN.XML'
tree = ET.parse(xml_path)
root = tree.getroot()

In [9]:
# collect every election-period number (<WP>) that occurs in the file, then
# pick the latest and the second-latest one
wp_texts = (
    (wp.findtext('WP') or '').strip()
    for mdb in root.findall('MDB')
    for wp in mdb.findall('.//WAHLPERIODE')
)
# set comprehension: duplicates collapse; missing or empty <WP> entries are skipped
all_phases = {int(text) for text in wp_texts if text}

# sort once and fail with a clear message (instead of an IndexError) when the
# file holds fewer than two distinct periods
ordered_phases = sorted(all_phases)
if len(ordered_phases) < 2:
    raise ValueError(
        f"expected at least two election periods, found {ordered_phases}"
    )
second_latest, latest_wp = ordered_phases[-2:]

In [15]:
# create the df by looping over all members

# create empty lists to store the data and map each relevant election period
# to the list its rows belong in
data_latest = []
data_second_latest = []
rows_by_period = {latest_wp: data_latest, second_latest: data_second_latest}

# now loop over all members and store relevant information
for mdb in root.findall("MDB"):

    # extract general information about the mp
    mdb_id = mdb.findtext("ID")
    name_element = mdb.find(".//NAME")
    # findtext returns None for a missing element; fall back to "" so the
    # name can never contain the literal text "None"
    name_parts = [
        (name_element.findtext(tag) or "").strip()
        for tag in ("AKAD_TITEL", "VORNAME", "NACHNAME")
    ]
    full_name = " ".join(part for part in name_parts if part)
    bio_element = mdb.find("BIOGRAFISCHE_ANGABEN")
    birthdate = bio_element.findtext("GEBURTSDATUM")
    gender = bio_element.findtext("GESCHLECHT")
    party = bio_element.findtext("PARTEI_KURZ")
    profession = bio_element.findtext("BERUF")

    # check for participation in latest or second latest election period and append data if fits
    for wp in mdb.findall(".//WAHLPERIODE"):
        wp_text = (wp.findtext("WP") or "").strip()
        if not wp_text:
            # no period number recorded for this entry; nothing to match against
            continue

        # compare against None explicitly: an empty target list is falsy too
        target_rows = rows_by_period.get(int(wp_text))
        if target_rows is None:
            continue

        # extract information about the specific election and store the row
        target_rows.append({
            "id": mdb_id,
            "full_name": full_name,
            "birthdate": birthdate,
            "gender": gender,
            "party": party,
            "profession": profession,
            "district_number": wp.findtext("WKR_NUMMER"),
            "district_name": wp.findtext("WKR_NAME"),
            "mandate": wp.findtext("MANDATSART")
        })


# convert these to a df and show first rows of one of them
mps_df_2021 = pd.DataFrame(data_second_latest).sort_values(by=["party", "full_name"])
mps_df_2025 = pd.DataFrame(data_latest).sort_values(by=["party", "full_name"])
mps_df_2025.head()

Unnamed: 0,id,full_name,birthdate,gender,party,profession,district_number,district_name,mandate
512,11005504,Achim Köhler,22.03.1964,männlich,AfD,,277.0,Rhein-Neckar,Landesliste
416,11005408,Adam Balten,09.11.1983,männlich,AfD,Mechatronik-Ingenieur,112.0,Wesel I,Landesliste
412,11005404,Alexander Arpaschi,24.05.1970,männlich,AfD,,273.0,Rastatt,Landesliste
629,11005622,Alexis L. Giersch,25.09.1963,männlich,AfD,,,,Landesliste
169,11004674,Andreas Bleck,17.04.1988,männlich,AfD,Büroleiter,196.0,Neuwied,Landesliste


In [54]:
# cut-off date: members of the second-latest election period are kept only if
# their mandate in that period ended on or after this day
RESEARCH_PERIOD_CUTOFF = datetime(2024, 6, 1)
DATE_FORMAT = "%d.%m.%Y"


def _parse_date(text):
    """Parse a ``DD.MM.YYYY`` string into a datetime; return None for missing or blank text."""
    text = (text or "").strip()
    return datetime.strptime(text, DATE_FORMAT) if text else None


# create empty lists to store the data
data_latest = []
data_second_latest = []

# now loop over all members and store relevant information
for mdb in root.findall("MDB"):

    # extract general information about the mp
    mdb_id = mdb.findtext("ID")
    name_element = mdb.find(".//NAME")
    first_name = (name_element.findtext("VORNAME") or "").strip()
    surname = (name_element.findtext("NACHNAME") or "").strip()
    acad_title = (name_element.findtext("AKAD_TITEL") or "").strip()
    # join only the non-empty parts so a missing part leaves no stray spaces
    # NOTE(review): only title, first name and surname are used; if NAME also
    # carries a name prefix (e.g. "von") it is dropped here - confirm whether
    # it should be part of full_name
    full_name = " ".join(part for part in (acad_title, first_name, surname) if part)
    bio_element = mdb.find("BIOGRAFISCHE_ANGABEN")
    # None (instead of a crash) when no birth date is recorded
    birthdate = _parse_date(bio_element.findtext("GEBURTSDATUM"))
    gender = bio_element.findtext("GESCHLECHT")
    party = bio_element.findtext("PARTEI_KURZ")
    profession = bio_element.findtext("BERUF")

    # loop over election periods
    for wp in mdb.findall(".//WAHLPERIODE"):

        # extract the current election period; skip entries without a number
        wp_text = (wp.findtext("WP") or "").strip()
        if not wp_text:
            continue
        wp_num = int(wp_text)

        # decide which list (if any) this period belongs to
        if wp_num == latest_wp:
            # the current period is always included
            target_rows = data_latest
        elif wp_num == second_latest:
            wp_end = _parse_date(wp.findtext("MDBWP_BIS"))
            # a missing end date is treated as "mandate not ended" and kept
            # (assumption - verify against the data documentation)
            if wp_end is not None and wp_end < RESEARCH_PERIOD_CUTOFF:
                continue
            target_rows = data_second_latest
        else:
            continue

        # construct the data entry and append it to the correct list
        target_rows.append({
            "id": mdb_id,
            "full_name": full_name,
            "birthdate": birthdate,
            "gender": gender,
            "party": party,
            "profession": profession,
            "district_number": wp.findtext("WKR_NUMMER"),
            "district_name": wp.findtext("WKR_NAME"),
            "mandate": wp.findtext("MANDATSART")
        })

# convert these to a df and show first rows of one of them
mps_df_2021 = pd.DataFrame(data_second_latest).sort_values(by=["party", "full_name"])
mps_df_2025 = pd.DataFrame(data_latest).sort_values(by=["party", "full_name"])
mps_df_2021.head()


Unnamed: 0,id,full_name,birthdate,gender,party,profession,district_number,district_name,mandate
326,11004727,Albrecht Glaser,1942-01-08,männlich,AfD,"Stadtkämmerer a. D., Bürgermeister a. D.",170,Schwalm-Eder,Landesliste
289,11004674,Andreas Bleck,1988-04-17,männlich,AfD,Büroleiter,197,Neuwied,Landesliste
582,11005126,Barbara Lenk,1982-10-04,weiblich,AfD,Dipl.-Bibliothekarin (FH),155,Meißen,Direktwahl
423,11004905,Beatrix Storch,1971-05-27,weiblich,AfD,Anwältin,75,Berlin-Mitte,Landesliste
656,11005203,Bernd Schattner,1968-06-27,männlich,AfD,Geschäftsführer,211,Südpfalz,Landesliste


In [57]:
# create a client instance
client = Client()

# get the app password; strip surrounding whitespace so a trailing newline in
# the file does not become part of the credential
with open("app_password.txt", "r", encoding="utf-8") as f:
    app_password = f.read().strip()

# fail early with a clear message instead of sending an empty credential
if not app_password:
    raise ValueError("app_password.txt is empty")

handle = "mxwlnd.bsky.social"

# login with my credentials (last expression of the cell, so its result is displayed)
client.login(handle, app_password)

ProfileViewDetailed(did='did:plc:5sqqg66p7muc7ogbp6xx4sw6', handle='mxwlnd.bsky.social', associated=ProfileAssociated(chat=None, feedgens=0, labeler=False, lists=0, starter_packs=0, py_type='app.bsky.actor.defs#profileAssociated'), avatar='https://cdn.bsky.app/img/avatar/plain/did:plc:5sqqg66p7muc7ogbp6xx4sw6/bafkreigwrjedzb7jvmowkn6fbe2atbnlwecsa4ouk5wpz54eg6rqkvayrq@jpeg', banner=None, created_at='2025-05-19T19:28:35.738Z', description=None, display_name='', followers_count=2, follows_count=1, indexed_at='2025-05-19T19:28:35.738Z', joined_via_starter_pack=None, labels=[], pinned_post=None, posts_count=0, verification=None, viewer=ViewerState(blocked_by=False, blocking=None, blocking_by_list=None, followed_by=None, following=None, known_followers=None, muted=False, muted_by_list=None, py_type='app.bsky.actor.defs#viewerState'), py_type='app.bsky.actor.defs#profileViewDetailed')

In [58]:
# define a function that gives me potential bsky handle
def find_first_handle(name):
    """Return the handle of the first actor found when searching for ``name``.

    Parameters
    ----------
    name : str
        Search term, e.g. the full name of an MP.

    Returns
    -------
    str
        Handle of the first search hit, or "" when ``name`` is blank / not a
        string or the search returns no actors.
    """
    # a blank or non-string value (e.g. NaN coming from a DataFrame) cannot be
    # searched meaningfully, so skip the network round trip
    if not isinstance(name, str) or not name.strip():
        return ""

    # only the first hit is used, so ask the API for a single result
    response = client.app.bsky.actor.search_actors({'term': name.strip(), 'limit': 1})
    actors = response["actors"]
    return actors[0]["handle"] if actors else ""

# apply this to all names of MPs and store result in new column;
# an MP can appear in both frames, so look every distinct name up only once
# and map the results back onto the rows
unique_names = pd.concat([mps_df_2021["full_name"], mps_df_2025["full_name"]]).unique()
handle_by_name = {name: find_first_handle(name) for name in unique_names}
mps_df_2021["potential_handle"] = mps_df_2021["full_name"].map(handle_by_name)
mps_df_2025["potential_handle"] = mps_df_2025["full_name"].map(handle_by_name)

In [59]:
# write both frames to disk as CSV files
export_targets = (
    (mps_df_2021, "../data/raw_data/bundestag_mps_2021_potential_handle.csv"),
    (mps_df_2025, "../data/raw_data/bundestag_mps_2025_potential_handle.csv"),
)
for frame, csv_path in export_targets:
    frame.to_csv(csv_path)