In [None]:
from time import sleep
from pathlib import Path

import polars as pl
from httpx import HTTPError, HTTPTransport
from swiftshadow.classes import ProxyInterface

from scrp.client import RedditScraper, RateLimitError
from scrp.model import ChildrenT3

In [None]:
# Module-wide proxy pool; get_scraper() draws a proxy from it for each new client.
# autoUpdate=False presumably defers loading the proxy list to the explicit
# async_update() call below -- confirm against the swiftshadow documentation.
PROXY_MANAGER = ProxyInterface(protocol="http", autoRotate=True, autoUpdate=False)
# NOTE(review): top-level `await` needs an environment with a running event loop
# (e.g. a notebook cell); it is a syntax error in a plain Python script.
await PROXY_MANAGER.async_update()

def get_scraper():
    """Create a RedditScraper that sends traffic through a proxy from PROXY_MANAGER.

    A proxy is requested from the shared manager, announced on stdout, and
    mounted as the transport for URLs matching the proxy's own protocol prefix.

    Returns:
        A new RedditScraper configured with the proxied transport.
    """
    selected = PROXY_MANAGER.get()
    proxy_url = selected.as_string()
    print(f"using new proxy '{proxy_url}'")

    # The mount key is the proxy's protocol scheme (e.g. "http://").
    scheme_prefix = f"{selected.protocol}://"
    proxied_transport = HTTPTransport(proxy=proxy_url)

    return RedditScraper(mounts={scheme_prefix: proxied_transport})

In [None]:
def on_fail(attempt: int, delay: int | None = None) -> RedditScraper | None:
    """Back off after a failed request and hand out a fresh scraper.

    Args:
        attempt: Number of failures already seen in a row. Values above 5
            mean the retry budget is exhausted.
        delay: Seconds to wait before retrying; 60 is used when None.

    Returns:
        A new scraper from get_scraper(), or None when the caller should
        give up (in which case no waiting happens).
    """
    # Bail out before sleeping once too many attempts have been made.
    if attempt > 5:
        return None

    wait_seconds = 60 if delay is None else delay
    sleep(wait_seconds)

    return get_scraper()


def scrape_posts(term: str, posts_in_hundreds: int = 500) -> pl.DataFrame:
    """Scrape search results for *term* page by page into one DataFrame.

    Each iteration requests up to 100 results, continuing from the cursor
    returned by the previous successful page. On a rate-limit error, any other
    HTTP error, or a page without usable rows, the scraper is replaced via
    on_fail() (which sleeps and switches proxy) and the same cursor is retried.
    The consecutive-failure counter resets after every successful page.

    Note that failed attempts also consume iterations of the page budget.

    Args:
        term: Search term passed to the scraper.
        posts_in_hundreds: Maximum number of requests (pages of up to 100
            results) to attempt, including retries.

    Returns:
        A DataFrame built from the ``data`` of every ChildrenT3 child received.

    Raises:
        RuntimeError: If not a single row was scraped.
    """
    scraper = get_scraper()

    retry_attempts = 0
    after, df = None, None

    for _ in range(posts_in_hundreds):
        # `failure` holds the log-message prefix when this iteration must be
        # retried; `delay` is the server-suggested wait, if any.
        failure, delay, data = None, None, []

        try:
            # Pass the stored cursor so each request continues where the last
            # successful page ended instead of re-fetching the first page.
            search = scraper.search(term, limit=100, after=after, show="all")
        except RateLimitError as e:
            failure, delay = "getting rate limit errors", e.retry_after
        except HTTPError:
            failure = "getting errors"
        else:
            data = [
                children.data
                for children in search.data.children
                if isinstance(children, ChildrenT3)
            ]
            if not data:
                failure = "stopped receiving data"

        if failure is not None:
            scraper = on_fail(retry_attempts, delay=delay)
            retry_attempts += 1
            print(f"{failure} after {retry_attempts} retries.")
            if scraper is None:
                # Retry budget exhausted; keep whatever was collected so far.
                break
            continue

        retry_attempts = 0

        if df is None:
            df = pl.DataFrame(data)
        else:
            df = df.vstack(pl.DataFrame(data))

        print(f"scraped {len(data)} ({df.height}) rows for term '{term}'.")

        after = search.data.after
        if after is None:
            print("no more data.")
            break

    if df is None:
        raise RuntimeError("no data was scraped.")

    print(f"finished scraping {df.height} posts for term '{term}'.")

    return df

In [None]:
# Search terms to scrape; each one is written to its own parquet file.
SEARCH_TERMS = [
    "Asylansøgere",
    "Flygtninge",
    "Indvandrere",
    "Migranter",
    "Udlændinge",
]

for term in SEARCH_TERMS:
    df = scrape_posts(term, posts_in_hundreds=500)
    # Tag every row with the term that produced it.
    df = df.with_columns(search_term=pl.lit(term))

    # File name encodes the term and the row count, lower-cased.
    path = Path(f"output/{term}_{df.height}.parquet".lower())

    # Never overwrite an existing result file.
    if path.exists():
        print("failed to write parquet, file already exists.")
        continue

    path.parent.mkdir(parents=True, exist_ok=True)
    # Let polars open the file itself; opening it separately in text mode
    # first would only hold a second, unused handle on the same path.
    df.write_parquet(path)