In [None]:
from pathlib import Path
from time import sleep

import polars as pl
from httpx import HTTPError, HTTPTransport
from swiftshadow.classes import ProxyInterface
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from scrp.client import RedditScraper
from scrp.model import ChildrenT3, RedditListing

In [None]:
# Module-level proxy pool; get_scraper() pulls one proxy from it per scraper.
# NOTE(review): autoUpdate=False presumably defers loading the proxy list so it
# can be awaited explicitly on the next line -- confirm against swiftshadow docs.
PROXY_MANAGER = ProxyInterface(protocol="http", autoRotate=True, autoUpdate=False)
# Top-level `await` is only valid inside a notebook/IPython cell; this line
# must run before the first call to PROXY_MANAGER.get().
await PROXY_MANAGER.async_update()


def get_scraper():
    """Create a ``RedditScraper`` wired to the next proxy from ``PROXY_MANAGER``.

    Prints the proxy that was selected, then mounts an ``HTTPTransport``
    configured with that proxy under the proxy's own URL scheme.

    Returns:
        A new ``RedditScraper`` instance built with the proxied transport.
    """
    selected = PROXY_MANAGER.get()
    proxy_url = selected.as_string()
    print(f"using new proxy '{proxy_url}'")

    # NOTE(review): the mount key is derived from the proxy protocol ("http://"
    # for the pool configured in this notebook). If RedditScraper issues
    # https:// requests they may not match this mount -- verify against the
    # httpx mounts documentation and the RedditScraper implementation.
    proxied_transport = HTTPTransport(proxy=proxy_url)
    scheme_prefix = f"{selected.protocol}://"
    return RedditScraper(mounts={scheme_prefix: proxied_transport})

In [None]:
def on_fail(attempt: int, reason: str) -> RedditScraper:
    """Handle a failed scrape attempt by swapping in a fresh scraper.

    Args:
        attempt: Number of failures recorded before this one.
        reason: Short description of the failure, echoed in the output.

    Returns:
        A new scraper from ``get_scraper()`` while ``attempt`` is at most 5.

    Raises:
        RuntimeError: When ``attempt`` is greater than 5.
    """
    may_continue = attempt <= 5
    if may_continue:
        print(f"Getting new scraper; {reason}")
        return get_scraper()

    # Too many failures already: give up instead of rotating again.
    raise RuntimeError(f"Failed to scrape comments; {reason}")


# Maximum number of failed page fetches tolerated by scrape() before it stops.
_MAX_FAILURES = 5
# Base delay in seconds between failures; multiplied by the failure count.
_BACKOFF_SECONDS = 30


@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=4),
    retry=retry_if_exception_type(HTTPError),
    before_sleep=lambda _: print("Failed to scrape comments; retrying..."),
    reraise=True,
)
def _search_page(
    scraper: RedditScraper, term: str, after: str | None
) -> RedditListing:
    """Fetch one page of search results, retrying once on ``HTTPError``.

    The exception must propagate out of this function for the ``@retry``
    policy to see it; with ``reraise=True`` the last ``HTTPError`` is raised
    to the caller once the attempts are used up.
    """
    return scraper.search(term, limit=100, after=after, show="all")


def try_scrape_comments(
    scraper: RedditScraper, term: str, after: str | None = None
) -> RedditListing | None:
    """Fetch one page of search results for ``term``, or ``None`` on failure.

    Args:
        scraper: Scraper used to issue the search request.
        term: Search term to query.
        after: Pagination cursor taken from the previous listing; ``None``
            requests the first page.

    Returns:
        The listing returned by ``scraper.search``, or ``None`` if the request
        still raised ``HTTPError`` after the retry policy was exhausted.
    """
    # The HTTPError is swallowed here, outside the retried function, so that
    # the retry policy actually gets a chance to run first.
    try:
        return _search_page(scraper, term, after)
    except HTTPError:
        return None


def scrape(term: str, posts_in_hundreds: int = 500) -> pl.DataFrame:
    """Scrape search results for ``term`` page by page into one DataFrame.

    Args:
        term: Search term to query.
        posts_in_hundreds: Maximum number of pages to fetch; each request asks
            for up to 100 posts.

    Returns:
        A DataFrame holding the ``data`` payload of every ``ChildrenT3`` child
        collected, with the most recently fetched page stacked on top.

    Raises:
        RuntimeError: If no page yielded any data (``on_fail`` may also raise
            ``RuntimeError`` when its own attempt limit is exceeded).
    """
    scraper = get_scraper()

    retry_attempts = 0
    after = None
    df = None

    # `and` with explicit comparisons: the previous `a & b < 5` parsed as
    # `(a & b) < 5`, so the page budget never terminated the loop.
    while posts_in_hundreds > 0 and retry_attempts < _MAX_FAILURES:
        # Pass the cursor along so each iteration fetches the next page.
        response = try_scrape_comments(scraper, term, after)

        failure_reason = None
        data = []
        if response is None:
            failure_reason = "HTTPError"
        else:
            data = [
                child.data
                for child in response.data.children
                if isinstance(child, ChildrenT3)
            ]
            if not data:
                failure_reason = "API returned no data"

        if failure_reason is not None:
            # Rotate to a new scraper and back off linearly before retrying
            # the same page (the cursor is left untouched).
            scraper = on_fail(retry_attempts, failure_reason)
            retry_attempts += 1
            sleep(_BACKOFF_SECONDS * retry_attempts)
            continue

        page = pl.DataFrame(data)
        df = page if df is None else page.vstack(df)

        print(f"Scraped {len(data)} ({df.height}) rows for term '{term}'.")

        after = response.data.after
        if after is None:
            print("Pagination stopped.")
            break

        posts_in_hundreds -= 1

    if df is None:
        raise RuntimeError("No data was scraped.")

    if retry_attempts >= _MAX_FAILURES:
        print("Failed to scrape comments; exited early...")

    print(f"finished scraping {df.height} posts for term '{term}'.")

    return df

In [None]:
# Search terms to scrape; each term is written to its own parquet file.
SEARCH_TERMS = [
    "Anden etnicitet",
    "Asylansøgere",
    "Asylpolitik",
    "Blackface i Danmark",
    "Flygtninge",
    "Ghettoloven",
    "Hyggeracisme",
    "Indvandrere",
    "Islam i Danmark",
    "Islamisme",
    "Migranter",
    "Migrantkrise",
    "Udlændigepolitik",
    "Udlændinge",
    # NOTE(review): possibly meant "Ulovlig indvandring" -- confirm spelling.
    "Ulovlig invandring",
    "Parallelsamfund",
    "Perkere",
    "Racisme",
    "Racismeparagraffen",
    "Statsborgerskab",
    "Strukturel racisme",
]

for term in SEARCH_TERMS:
    df = scrape(term, posts_in_hundreds=50)
    df = df.with_columns(search_term=pl.lit(term))

    # File name encodes the term and the row count, lower-cased with
    # underscores instead of spaces.
    path = Path(f"output/{term}_{df.height}.parquet".lower().replace(" ", "_"))

    if path.exists():
        print("failed to write parquet, file already exists.")
        continue

    path.parent.mkdir(parents=True, exist_ok=True)
    # Let polars open the target itself: pre-opening it with open(path, "w")
    # left an empty file behind whenever the write failed, which then made
    # later runs skip the term as "already exists".
    df.write_parquet(path)