In [1]:
# load requirements
import pandas as pd
from datetime import datetime
import xml.etree.ElementTree as ET
from atproto import Client

In [2]:
# read the MP master-data XML from disk and keep a handle on its root element
xml_path = '../data/raw_data/MDB_STAMMDATEN.XML'
tree = ET.parse(xml_path)
root = tree.getroot()

In [3]:
# set comprehension collecting every election-period number that occurs in the file.
# findtext() yields None for a missing <WP> tag and "" for an empty one; both are
# filtered out so that int() only ever sees real text.
wp_texts = (
    (wp.findtext('WP') or '').strip()
    for mdb in root.findall('MDB')
    for wp in mdb.findall('.//WAHLPERIODE')
)
all_phases = {int(wp_text) for wp_text in wp_texts if wp_text}

# the rest of the notebook compares two periods, so fail early with a clear message
if len(all_phases) < 2:
    raise ValueError(
        f"expected at least two election periods in the XML, found {sorted(all_phases)}")

# sort once and take the two highest period numbers
second_latest, latest_wp = sorted(all_phases)[-2:]

In [47]:
# research-period settings
cutoff = datetime(2024, 6, 1)  # begin of the research period
second_latest_end = datetime(2025, 3, 25)  # end date of the second-latest legislature

# column order shared by both result frames; passing it explicitly keeps
# sort_values() from raising a KeyError when one of the lists stays empty
mp_columns = [
    "id", "full_name", "birthdate", "gender", "faction", "faction_start",
    "faction_end", "profession", "district_number", "district_name", "mandate",
]


def parse_date(text):
    """Parse a ``DD.MM.YYYY`` string taken from the XML.

    Args:
        text: The tag text as returned by ``findtext`` (may be ``None`` or ``""``).

    Returns:
        A ``datetime``, or ``None`` when the tag is missing or empty so that the
        caller can choose a fallback.

    Raises:
        ValueError: If non-empty text does not match the ``DD.MM.YYYY`` format.
    """
    text = (text or "").strip()
    return datetime.strptime(text, "%d.%m.%Y") if text else None


# create empty lists to store the data
data_latest = []
data_second_latest = []

# now loop over all members and store relevant information
for mdb in root.findall("MDB"):

    # extract general information about the mp
    # (find() returns the first <NAME> element below the MP)
    mdb_id = mdb.findtext("ID")
    name_element = mdb.find(".//NAME")
    first_name = (name_element.findtext("VORNAME") or "").strip()
    surname = (name_element.findtext("NACHNAME") or "").strip()
    acad_title = (name_element.findtext("AKAD_TITEL") or "").strip()
    full_name = f"{acad_title} {first_name} {surname}".strip()
    bio_element = mdb.find("BIOGRAFISCHE_ANGABEN")
    birthdate = parse_date(bio_element.findtext("GEBURTSDATUM"))  # None if not recorded
    gender = bio_element.findtext("GESCHLECHT")
    profession = bio_element.findtext("BERUF")

    # loop over election periods
    for wp in mdb.findall(".//WAHLPERIODE"):

        # extract the current election period, skipping entries without a number
        wp_text = (wp.findtext("WP") or "").strip()
        if not wp_text:
            continue
        wp_num = int(wp_text)

        # check if the MP has served in the research period:
        # always for the latest period, for the second-latest one only if the
        # mandate lasted until the begin of the research period or longer
        if wp_num == latest_wp:
            is_latest = True
        elif wp_num == second_latest:
            # no end date recorded -> assume the mandate lasted until the end of the legislature
            wp_end = parse_date(wp.findtext("MDBWP_BIS")) or second_latest_end
            if wp_end < cutoff:
                continue
            is_latest = False
        else:
            continue

        # get general info about the electoral district and mandate
        district_number = wp.findtext("WKR_NUMMER")
        district_name = wp.findtext("WKR_NAME")
        mandate = wp.findtext("MANDATSART")

        # find all institution tags; a period without <INSTITUTIONEN> simply yields no rows
        institutions_element = wp.find("INSTITUTIONEN")
        if institutions_element is None:
            continue

        # loop over the institutions for the respective wp
        for institution in institutions_element.findall("INSTITUTION"):

            # only proceed if the info is about the faction
            if institution.findtext("INSART_LANG") != "Fraktion/Gruppe":
                continue

            # extract faction name, start and end date
            faction_name = institution.findtext("INS_LANG")
            starting_inst = parse_date(institution.findtext("MDBINS_VON"))
            # the end date is absent if the MP stayed until the end of the term
            # or if the membership is still ongoing
            end_inst = parse_date(institution.findtext("MDBINS_BIS"))

            if is_latest:
                # include everything for the current electoral cycle; keep a recorded
                # end date so that a faction the MP has left is not mistaken for a
                # current one (None means the membership is still ongoing)
                faction_end = end_inst
                target = data_latest
            else:
                # only include factions the MP has served in after the begin of
                # the research period
                faction_end = end_inst or second_latest_end
                if faction_end <= cutoff:
                    continue
                target = data_second_latest

            target.append({
                "id": mdb_id,
                "full_name": full_name,
                "birthdate": birthdate,
                "gender": gender,
                "faction": faction_name,
                "faction_start": starting_inst,
                "faction_end": faction_end,
                "profession": profession,
                "district_number": district_number,
                "district_name": district_name,
                "mandate": mandate,
            })

# convert these to a df and show first rows of one of them
# (the "2021"/"2025" suffixes refer to the second-latest / latest period found above)
mps_df_2021 = pd.DataFrame(data_second_latest, columns=mp_columns).sort_values(
    by=["faction", "full_name"])
mps_df_2025 = pd.DataFrame(data_latest, columns=mp_columns).sort_values(
    by=["faction", "full_name"])
mps_df_2021.head()


Unnamed: 0,id,full_name,birthdate,gender,faction,faction_start,faction_end,profession,district_number,district_name,mandate
329,11004727,Albrecht Glaser,1942-01-08,männlich,Fraktion Alternative für Deutschland,2021-10-26,2025-03-25,"Stadtkämmerer a. D., Bürgermeister a. D.",170,Schwalm-Eder,Landesliste
292,11004674,Andreas Bleck,1988-04-17,männlich,Fraktion Alternative für Deutschland,2021-10-26,2025-03-25,Büroleiter,197,Neuwied,Landesliste
586,11005126,Barbara Lenk,1982-10-04,weiblich,Fraktion Alternative für Deutschland,2021-10-26,2025-03-25,Dipl.-Bibliothekarin (FH),155,Meißen,Direktwahl
427,11004905,Beatrix Storch,1971-05-27,weiblich,Fraktion Alternative für Deutschland,2021-10-26,2025-03-25,Anwältin,75,Berlin-Mitte,Landesliste
660,11005203,Bernd Schattner,1968-06-27,männlich,Fraktion Alternative für Deutschland,2021-10-26,2025-03-25,Geschäftsführer,211,Südpfalz,Landesliste


In [57]:
# map the official faction / group names to short party labels

# names that are spelled identically in both legislative periods
common_factions = {
    "Fraktion der Christlich Demokratischen Union/Christlich - Sozialen Union": "CDU/CSU",
    "Fraktion Alternative für Deutschland": "AfD",
    "Fraktion der Sozialdemokratischen Partei Deutschlands": "SPD",
    "Fraktion BÜNDNIS 90/DIE GRÜNEN": "Grünen",
    "Fraktionslos": "Fraktionslos",
}

# period-specific additions on top of the shared names
faction_renaming_2021 = {
    **common_factions,
    "Fraktion der Freien Demokratischen Partei": "FDP",
    "Gruppe Die Linke": "Die LINKE",
    "Gruppe BSW - Bündnis Sahra Wagenknecht - Vernunft und Gerechtigkeit": "BSW",
}

faction_renaming_2025 = {
    **common_factions,
    "Fraktion Die Linke": "Die LINKE",
}

# overwrite the faction column of each frame with the shortened labels;
# names without a mapping are left as they are
for frame, renaming in (
    (mps_df_2021, faction_renaming_2021),
    (mps_df_2025, faction_renaming_2025),
):
    frame["faction"] = frame["faction"].replace(renaming)

In [63]:
# create a client instance
client = Client()

# get the app password from a local text file; surrounding whitespace is stripped
# because a trailing newline in the file would otherwise be sent as part of the password
with open("app_password.txt", "r", encoding="utf-8") as f:
    app_password = f.read().strip()

handle = "mxwlnd.bsky.social"

# login with my credentials (the value returned by login() is shown as the cell output)
client.login(handle, app_password)

ProfileViewDetailed(did='did:plc:5sqqg66p7muc7ogbp6xx4sw6', handle='mxwlnd.bsky.social', associated=ProfileAssociated(chat=None, feedgens=0, labeler=False, lists=0, starter_packs=0, py_type='app.bsky.actor.defs#profileAssociated'), avatar='https://cdn.bsky.app/img/avatar/plain/did:plc:5sqqg66p7muc7ogbp6xx4sw6/bafkreigwrjedzb7jvmowkn6fbe2atbnlwecsa4ouk5wpz54eg6rqkvayrq@jpeg', banner=None, created_at='2025-05-19T19:28:35.738Z', description=None, display_name='', followers_count=2, follows_count=1, indexed_at='2025-05-19T19:28:35.738Z', joined_via_starter_pack=None, labels=[], pinned_post=None, posts_count=0, verification=None, viewer=ViewerState(blocked_by=False, blocking=None, blocking_by_list=None, followed_by=None, following=None, known_followers=None, muted=False, muted_by_list=None, py_type='app.bsky.actor.defs#viewerState'), py_type='app.bsky.actor.defs#profileViewDetailed')

In [64]:
# define a function that gives me potential bsky handle
def find_first_handle(name):
    """Return the handle of the first actor the Bluesky search lists for ``name``.

    The result is only a candidate: the first search hit is taken as-is and is
    not checked against the person in any way.

    Args:
        name: Search string, e.g. the full name of an MP.

    Returns:
        The handle of the first search result, or ``""`` when the search returns
        no actors or ``name`` is empty / not a string (no request is sent then).
    """
    # nothing sensible to search for -> skip the network round trip
    if not isinstance(name, str) or not name.strip():
        return ""

    # 'q' is the current name of the query parameter; 'term' is its deprecated alias
    response = client.app.bsky.actor.search_actors({'q': name.strip()})
    actors = response.actors
    return actors[0].handle if actors else ""

# look up every distinct name only once: an MP can occur in both data frames and
# once per faction membership, and each lookup is a separate network request
unique_names = pd.concat(
    [mps_df_2021["full_name"], mps_df_2025["full_name"]]).unique()
handle_by_name = {name: find_first_handle(name) for name in unique_names}

# store the looked-up handle for all MPs in a new column
mps_df_2021["potential_handle"] = mps_df_2021["full_name"].map(handle_by_name)
mps_df_2025["potential_handle"] = mps_df_2025["full_name"].map(handle_by_name)

In [67]:
# write each data frame to its csv file (target path -> frame)
csv_exports = {
    "../data/raw_data/bundestag_mps_2021_potential_handle.csv": mps_df_2021,
    "../data/raw_data/bundestag_mps_2025_potential_handle.csv": mps_df_2025,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path)