In [9]:
# install the requirements
#!pip install atproto
#!pip install pandas
#!pip install nltk==3.8.1

Collecting nltk==3.8.1
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [11]:
# load requirements
from atproto import Client
import pandas as pd
import nltk
#nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random
import re

In [2]:
# create a client instance
client = Client()

# get the app password from a local text file; strip surrounding whitespace
# so that a trailing newline (added by most editors) is not sent as part of
# the password
with open("app_password.txt", "r") as f:
    app_password = f.read().strip()

handle = "mxwlnd.bsky.social"

# login with my credentials (the returned profile is displayed as cell output)
client.login(handle, app_password)

ProfileViewDetailed(did='did:plc:5sqqg66p7muc7ogbp6xx4sw6', handle='mxwlnd.bsky.social', associated=ProfileAssociated(chat=None, feedgens=0, labeler=False, lists=0, starter_packs=0, py_type='app.bsky.actor.defs#profileAssociated'), avatar='https://cdn.bsky.app/img/avatar/plain/did:plc:5sqqg66p7muc7ogbp6xx4sw6/bafkreigwrjedzb7jvmowkn6fbe2atbnlwecsa4ouk5wpz54eg6rqkvayrq@jpeg', banner=None, created_at='2025-05-19T19:28:35.738Z', description=None, display_name='', followers_count=2, follows_count=1, indexed_at='2025-05-19T19:28:35.738Z', joined_via_starter_pack=None, labels=[], pinned_post=None, posts_count=0, verification=None, viewer=ViewerState(blocked_by=False, blocking=None, blocking_by_list=None, followed_by=None, following=None, known_followers=None, muted=False, muted_by_list=None, py_type='app.bsky.actor.defs#viewerState'), py_type='app.bsky.actor.defs#profileViewDetailed')

In [3]:
# get the did of the list creator
creator_handle = "politicshome.bsky.social"
creator_profile = client.app.bsky.actor.get_profile({"actor": creator_handle})
creator_did = creator_profile.did
rkey = '3laetww5nlb23'             # the unique ID of the list
list_uri = f'at://{creator_did}/app.bsky.graph.list/{rkey}'

# get all members on the list, requesting page after page until the API
# stops returning a cursor
handles = []
cursor = None
while True:
    list_items = client.app.bsky.graph.get_list({
        'list': list_uri,
        'cursor': cursor
    })
    # every list item already embeds the member's profile view, so the handle
    # is read from it directly instead of issuing one extra get_profile
    # request per member
    handles.extend(item.subject.handle for item in list_items.items)
    cursor = list_items.cursor
    if not cursor:
        break

In [5]:
# define a function that returns a df with all posts from given handle

def retrieve_posts_to_df(list_handles):
    """Collect the posts authored by each handle into one DataFrame.

    For every handle the author feed is fetched page by page (100 items per
    request) through the module-level ``client``. Feed items whose post author
    differs from the requested handle (i.e. reposts) are skipped.

    Parameters
    ----------
    list_handles : iterable of str
        Bluesky handles whose feeds should be downloaded.

    Returns
    -------
    pandas.DataFrame
        Columns ``handle``, ``date`` and ``text``, sorted by ``date`` in
        descending order. Empty (but with those columns) when no posts were
        collected, including when ``list_handles`` is empty.
    """
    data = []
    for handle in list_handles:
        did = client.com.atproto.identity.resolve_handle({'handle': handle})['did']
        cursor = None

        # follow the pagination cursor until the feed is exhausted
        while True:
            response = client.app.bsky.feed.get_author_feed({
                'actor': did,
                'cursor': cursor,
                'limit': 100
            })

            for item in response['feed']:
                post = item["post"]

                # filter out reposts
                if post["author"]["handle"] != handle:
                    continue

                data.append({
                    "handle": handle,
                    "date": post["record"]["created_at"],
                    "text": post["record"]["text"],
                })

            cursor = response['cursor']
            if not cursor:
                break

    # build the frame once, after all handles are processed; naming the
    # columns explicitly keeps the sort below valid when nothing was collected
    df = pd.DataFrame(data, columns=["handle", "date", "text"])

    return df.sort_values(by="date", ascending=False)

# retrieve the posts for every handle in `handles` into a single DataFrame
# (columns: handle, date, text)
df = retrieve_posts_to_df(handles)

In [30]:
# character ranges treated as emojis / pictographic symbols
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # Flags
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F900-\U0001F9FF"  # Supplemental symbols
    "\U00002600-\U000026FF"  # Misc symbols
    "\U00002500-\U00002BEF"  # Box drawing, shapes, arrows, math symbols (also spans the dingbat / misc-symbol ranges above)
    "]+",
    flags=re.UNICODE,
)
# a '#' followed by one or more word characters
hashtag_pattern = re.compile(r"#\w+")
# anything starting with http://, https:// or www. up to the next whitespace
url_pattern = re.compile(r"http[s]?://\S+|www\.\S+")


def clean_text(text):
    """Strip emojis, hashtags and URLs from a post.

    String input has every match of the three module-level patterns removed
    (emojis first, then hashtags, then URLs) and is returned with leading and
    trailing whitespace trimmed. Whitespace left inside the text by a removal
    is not collapsed. Any non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    for pattern in (emoji_pattern, hashtag_pattern, url_pattern):
        text = pattern.sub("", text)
    return text.strip()

# run the cleaner over every post text
df["text"] = df["text"].apply(clean_text)

# keep only the rows whose text is a string (drops missing / non-string values)
is_string = df["text"].apply(lambda value: isinstance(value, str))
df = df[is_string]


In [31]:
# split the posts into individual sentences and sample (up to) 250 of them
SAMPLE_SIZE = 250
SAMPLE_SEED = 42

all_sentences = [sentence for post in df["text"] for sentence in sent_tokenize(post, language="english")]

# pull a random sample and convert it to a df
# - a dedicated seeded generator makes the sample reproducible across re-runs
#   without touching the global random state
# - capping the size avoids the ValueError that random.sample raises when
#   fewer than SAMPLE_SIZE sentences are available
rng = random.Random(SAMPLE_SEED)
annotation_sample = rng.sample(all_sentences, min(SAMPLE_SIZE, len(all_sentences)))
annotation_df = pd.DataFrame({"sentence_id": range(len(annotation_sample)),
                              "sentence": annotation_sample,
                              "label": ""})

In [32]:
# print a preview of the sampled sentences (rows at positions 1 to 19)
# NOTE(review): the slice starts at 1, so the first sampled sentence is not shown — confirm this is intended
for sentence in annotation_df["sentence"].iloc[1:20]:
    print(sentence)

Since Labour took office, 10,000 more children have been plunged into poverty by the refusal to scrap the two-child benefit cap.
I'm looking for a Senior Caseworker to join my team based in Sheerness.
I get messages about this from across the city, including where people have been hurt.
This , as always, we celebrate your contribution.
I marked  at Cancer Research UK’s parliamentary drop in, speaking to the charity about new diagnostic and treatment technologies.
This , I hope every child has the chance to get to ‘know themselves to grow themselves’.
A useful resource to find out about roadworks is here.
We'll hear from planners, the council and police leaders as well as women themselves.
It was the start of a strong working relationship between both Speakers representing their respective Houses.
I was reassured today to hear the government reiterate its commitment to legislation that will protect leaseholders and ensure they get fair treatment.
Action, not delay.
We need sanctions on 

In [33]:
# write the cleaned posts and the annotation sample to the data folder as CSV,
# leaving out the DataFrame index
for frame, destination in (
    (df, "../data/british_mps_posts.csv"),
    (annotation_df, "../data/sentences_for_annotation.csv"),
):
    frame.to_csv(destination, index=False)