# Daten aus BlueSky

BlueSky erlaubt Zugriffe auf seinen Inhalte durch API.

Wie die [Dokumentation](https://docs.bsky.app/) zu sehen, gibt es dafür eine praktische Library "[atproto](https://pypi.org/project/atproto/)", durch die man die Authentifizierung und die damit verbundene Session verwalten kann. 
Hier verwenden wir jedoch die schlichte Library "requests" statt "atproto", damit wir den Authentifizierungsprozess und dann den Prozess des API-Requests sehen können.


## Posts suchen

Wir verwenden die Funktion Post-Suche. 

>This endpoint is part of the Bluesky application Lexicon APIs (app.bsky.*). Public endpoints which don't require authentication can be >made directly against the public Bluesky AppView API: https://public.api.bsky.app. Authenticated requests are usually made to the user's >PDS, with automatic service proxying. Authenticated requests can be used for both public and non-public endpoints.

https://docs.bsky.app/docs/api/app-bsky-feed-search-posts





In [16]:
from dotenv import load_dotenv
import os, time, json, datetime as dt
import requests, pandas as pd

In [17]:
load_dotenv()

True

In [18]:
HANDLE = os.getenv("BSKY_HANDLE")
APP_PW = os.getenv("BSKY_APP_PASSWORD")

APPVIEW = "https://public.api.bsky.app"


In [19]:
def resolve_handle_to_did(handle: str) -> str:
    r = requests.get(f"{APPVIEW}/xrpc/com.atproto.identity.resolveHandle",
                     params={"handle": handle}, timeout=20)
    r.raise_for_status()
    return r.json()["did"]

def get_pds_from_did(did: str) -> str:
    r = requests.get(f"https://plc.directory/{did}", timeout=20)
    r.raise_for_status()
    doc = r.json()
    for s in doc.get("service", []):
        if s.get("id","").endswith("#atproto_pds"):
            return s["serviceEndpoint"].rstrip("/")
    raise RuntimeError("PDS endpoint not found in DID doc")

def create_session(pds: str, identifier: str, app_pw: str) -> str:
    r = requests.post(f"{pds}/xrpc/com.atproto.server.createSession",
                      json={"identifier": identifier, "password": app_pw},
                      timeout=30)
    print("createSession:", r.status_code, r.text[:200])
    r.raise_for_status()
    return r.json()["accessJwt"]

def search_posts(q: str, limit=1000, access=None, pds=None):
    headers = {"Authorization": f"Bearer {access}"} if access else {}
    out, cursor = [], None
    while len(out) < limit:
        params = {"q": q, "limit": 100}
        if cursor: params["cursor"] = cursor
        
        url = f"{pds}/xrpc/app.bsky.feed.searchPosts" if pds else f"{APPVIEW}/xrpc/app.bsky.feed.searchPosts"
        r = requests.get(url, headers=headers, params=params, timeout=30)
        if r.status_code == 403 and access:
            r = requests.get(f"{APPVIEW}/xrpc/app.bsky.feed.searchPosts",
                             headers=headers, params=params, timeout=30)
        if r.status_code == 429:
            time.sleep(1.5); continue
        r.raise_for_status()
        data = r.json()
        posts = data.get("posts", [])
        out.extend(posts)
        cursor = data.get("cursor")
        if not cursor or not posts: break
        time.sleep(0.3)
    return out[:limit]

In [20]:
did = resolve_handle_to_did(HANDLE)          
pds = get_pds_from_did(did)                  
access = create_session(pds, HANDLE, APP_PW) 
rows = search_posts("Rechtsextremismus",
                    limit=10000, access=access, pds=pds)
print("Fetched:", len(rows))

createSession: 200 {"did":"did:plc:z4gbcpr5mbkblvaybyj7lb4o","didDoc":{"@context":["https://www.w3.org/ns/did/v1","https://w3id.org/security/multikey/v1","https://w3id.org/security/suites/secp256k1-2019/v1"],"id":"did:p
Fetched: 9709


In [21]:
def create_dataframe(posts, since=None, until=None):

    seen = set()
    out = []

    def within_window(rec_created_at: str):
        if not (since or until):
            return True
        t = dt.datetime.fromisoformat(rec_created_at.replace("Z","+00:00"))
        ok = True
        if since: ok &= (t >= since)
        if until: ok &= (t <  until)
        return ok

    for p in posts:
        uri = p.get("uri")
        if uri in seen: continue
        rec = p.get("record", {})
        if not within_window(rec.get("createdAt", "1970-01-01T00:00:00Z")):
            continue
        out.append({
            "uri": uri,
            "cid": p.get("cid"),
            "author": p.get("author", {}).get("handle"),
            "text": rec.get("text", ""),
            "createdAt": rec.get("createdAt"),
            "reply_parent": (rec.get("reply",{}) or {}).get("parent",{}).get("uri"),
            "langs": rec.get("langs"),
            "labels": [l.get("val") for l in (p.get("labels") or [])],
            "likeCount": (p.get("likeCount") or 0),
            "repostCount": (p.get("repostCount") or 0),
            "quoteCount": (p.get("quoteCount") or 0),
            "indexedAt": p.get("indexedAt")
        })

        seen.add(uri)
        
    return pd.DataFrame(out)

In [26]:
until = dt.datetime.utcnow().replace(tzinfo=dt.timezone.utc)
since = until - dt.timedelta(days=150)
df = create_dataframe(rows, since, until)

In [None]:
df.head()

In [28]:
df.to_csv("test_data.csv")