In [99]:
# install the requirements (if not already done)
# !pip install atproto
# !pip install pandas
# !pip install emoji

In [100]:
# load requirements
from atproto import Client
import pandas as pd
from datetime import datetime
import re
import emoji

In [83]:
# read the cleaned MP tables of both legislatures (21 first, then 25)
bundestag_21, bundestag_25 = map(
    pd.read_csv,
    (
        "../data/clean_data/bundestag_21_clean.csv",
        "../data/clean_data/bundestag_25_clean.csv",
    ),
)

In [84]:
# keep only the MPs whose "clean_handle" (bsky handle) is not missing
bundestag_21 = bundestag_21.dropna(subset=["clean_handle"])
bundestag_25 = bundestag_25.dropna(subset=["clean_handle"])

In [21]:
# create a client instance
client = Client()

# get the app password from a local file (kept out of the notebook itself);
# strip surrounding whitespace because a trailing newline in the file would
# otherwise be sent as part of the password
with open("app_password.txt", "r", encoding="utf-8") as f:
    app_password = f.read().strip()

handle = "mxwlnd.bsky.social"

# login with my credentials (the value returned by login is shown as cell output)
client.login(handle, app_password)

ProfileViewDetailed(did='did:plc:5sqqg66p7muc7ogbp6xx4sw6', handle='mxwlnd.bsky.social', associated=ProfileAssociated(chat=None, feedgens=0, labeler=False, lists=0, starter_packs=0, py_type='app.bsky.actor.defs#profileAssociated'), avatar='https://cdn.bsky.app/img/avatar/plain/did:plc:5sqqg66p7muc7ogbp6xx4sw6/bafkreigwrjedzb7jvmowkn6fbe2atbnlwecsa4ouk5wpz54eg6rqkvayrq@jpeg', banner=None, created_at='2025-05-19T19:28:35.738Z', description=None, display_name='', followers_count=2, follows_count=1, indexed_at='2025-05-19T19:28:35.738Z', joined_via_starter_pack=None, labels=[], pinned_post=None, posts_count=0, verification=None, viewer=ViewerState(blocked_by=False, blocking=None, blocking_by_list=None, followed_by=None, following=None, known_followers=None, muted=False, muted_by_list=None, py_type='app.bsky.actor.defs#viewerState'), py_type='app.bsky.actor.defs#profileViewDetailed')

In [64]:
# define two helpers: one normalises a raw handle, the other returns a df with the posts of all MPs in a given df
def clean_handle(raw_handle):
    """Normalise a raw handle string.

    Removes invisible formatting characters (zero-width spaces/joiners,
    bidirectional marks and embeddings, word joiner and related format
    characters, byte-order mark), then trims surrounding whitespace and
    lower-cases the result.

    Parameters
    ----------
    raw_handle : str
        Handle as stored in the source data.

    Returns
    -------
    str
        The cleaned, lower-cased handle.
    """
    # Remove the invisible characters *before* stripping: str.strip() does not
    # treat them as whitespace, so a space sitting next to one of them would
    # otherwise survive at the edge of the handle.
    without_invisible = re.sub(
        r'[\u200b-\u200f\u202a-\u202e\u2060-\u206f\ufeff]', '', raw_handle
    )
    return without_invisible.strip().lower()

def retrieve_posts(df):
    """Collect the posts authored by every MP in ``df`` within their period.

    For each row the handle in ``clean_handle`` is resolved to a DID, the
    author feed is paged through completely, and every feed item that was
    written by that account and whose ``created_at`` lies inside the row's
    collection window is kept.

    The collection window per row is:

    * start: ``faction_start`` if it is on/after 2024-06-01, else 2024-06-01
      (a missing ``faction_start`` also falls back to 2024-06-01);
    * end: 2025-03-24 if ``faction_end`` equals 2025-03-25, 2025-06-25 if
      ``faction_end`` is missing, otherwise ``faction_end``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per MP with at least the columns ``clean_handle``,
        ``faction_start`` and ``faction_end``.

    Returns
    -------
    pandas.DataFrame
        One row per kept post: all columns of the MP's row plus ``text`` and
        ``date`` (timezone-naive), sorted by ``date`` ascending. If nothing was
        collected, an empty frame with those columns is returned.

    Notes
    -----
    Uses the notebook-level ``client`` (must be logged in) and
    ``clean_handle``. Handles that cannot be resolved and feeds that cannot be
    fetched are reported via ``print`` and skipped instead of aborting the run.
    """
    earliest_start = datetime(2024, 6, 1)
    open_ended_end = datetime(2025, 6, 25)

    all_data = []
    for _, row in df.iterrows():
        handle = clean_handle(row["clean_handle"])
        faction_start = pd.to_datetime(row["faction_start"])
        faction_end = pd.to_datetime(row["faction_end"])

        # NaT compares False, so a missing start also yields earliest_start.
        if faction_start >= earliest_start:
            start_period = faction_start
        else:
            start_period = earliest_start

        # NOTE(review): the end bounds are midnight timestamps, so posts made
        # later on the end day itself are excluded -- confirm this is intended.
        if faction_end == datetime(2025, 3, 25):
            end_period = datetime(2025, 3, 24)
        elif pd.isnull(row["faction_end"]):
            end_period = open_ended_end
        else:
            end_period = faction_end

        try:
            did = client.com.atproto.identity.resolve_handle({'handle': handle})['did']
        except Exception as e:
            print(f"Failed to resolve handle {handle}: {e}")
            continue

        # The MP's columns are identical for all of their posts; build once.
        mp_columns = row.to_dict()

        cursor = None
        while True:
            try:
                response = client.app.bsky.feed.get_author_feed({
                    'actor': did,
                    'cursor': cursor,
                    'limit': 100
                })
            except Exception as e:
                # Same policy as for handle resolution: report and move on,
                # keeping whatever was already collected for this MP.
                print(f"Failed to fetch feed for {handle}: {e}")
                break

            for item in response['feed']:
                post = item["post"]
                text = post["record"]["text"]
                date = pd.to_datetime(post["record"]["created_at"]).tz_localize(None)

                # filter out reposts: keep only items written by the account
                # itself. Compare DIDs rather than handle strings, since the
                # DID is the identifier the feed was requested with.
                if post["author"]["did"] == did and start_period <= date <= end_period:
                    all_data.append({**mp_columns, "text": text, "date": date})

            cursor = response['cursor']
            if not cursor:
                break

    if not all_data:
        # sort_values(by="date") would raise KeyError on a column-less frame.
        return pd.DataFrame(columns=[*df.columns, "text", "date"])

    return pd.DataFrame(all_data).sort_values(by="date", ascending=True)


In [65]:
# fetch the posts for the MPs of each legislature (21 first, then 25)
posts_bt_21, posts_bt_25 = [
    retrieve_posts(mp_table) for mp_table in (bundestag_21, bundestag_25)
]

In [107]:
# stack the posts of both legislatures into a single df
posts_bt_combined = pd.concat(objs=[posts_bt_21, posts_bt_25])

# apply basic text cleaning
def clean_text(text):
    """Apply basic normalisation to a post's text.

    Steps, in order: lower-case; turn ``#hashtag`` into ``hashtag``; remove
    URLs (``http...`` / ``www...``); remove ``@mentions`` including dotted
    handles such as ``@name.bsky.social``; remove emoji; collapse runs of
    whitespace to a single space and trim the ends.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    # re.sub returns a new string; the result has to be assigned back.
    text = re.sub(r"#(\w+)", r"\1", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    # Allow dot-separated labels so no ".bsky.social" residue is left behind;
    # a dot is only consumed when another label follows, so a sentence-ending
    # period after a mention is preserved.
    text = re.sub(r"@[\w-]+(?:\.[\w-]+)*", "", text)
    text = emoji.replace_emoji(text, replace="")
    # Collapse whitespace last so gaps left by the removals above are cleaned.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# run every post text through clean_text and store the result in place of the raw text
posts_bt_combined["text"] = posts_bt_combined["text"].map(clean_text)

In [110]:
# write the cleaned posts to disk as csv, leaving out the index column
posts_bt_combined.to_csv(
    path_or_buf="../data/clean_data/all_posts_cleaned.csv",
    index=False,
)