In [None]:
from pathlib import Path
from time import sleep
from typing import Iterable

import polars as pl
from httpx import HTTPError, HTTPTransport
from swiftshadow.classes import ProxyInterface
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from scrp.client import RedditScraper
from scrp.model import ChildrenT1, RedditListing, RedditChildren

In [None]:
# Notebook-wide proxy pool; get_scraper() draws one proxy from it per scraper.
# Automatic updating is switched off in the constructor (autoUpdate=False) and
# the pool is refreshed explicitly on the next line instead.
PROXY_MANAGER = ProxyInterface(protocol="http", autoRotate=True, autoUpdate=False)
# Top-level `await` relies on the notebook's already-running event loop; this
# statement is not valid in a plain Python script.
await PROXY_MANAGER.async_update()


def get_scraper():
    """Create a RedditScraper that is mounted onto a proxy taken from the pool.

    A proxy is requested from ``PROXY_MANAGER``, announced on stdout, and
    wrapped in an ``HTTPTransport`` that is handed to the scraper via its
    ``mounts`` argument.

    Returns:
        A new ``RedditScraper`` configured with the selected proxy.
    """
    proxy = PROXY_MANAGER.get()
    proxy_url = proxy.as_string()
    print(f"using new proxy '{proxy_url}'")

    # NOTE(review): the mount pattern is built from the proxy's own protocol,
    # so only requests whose URL scheme equals that protocol are matched by
    # this mount — confirm the scraper's requests use the same scheme,
    # otherwise they would not go through the proxy.
    transport = HTTPTransport(proxy=proxy_url)
    mounts = {f"{proxy.protocol}://": transport}

    return RedditScraper(mounts=mounts)

In [None]:
def _reply_children(replies) -> list:
    """Return the child nodes carried by a comment's ``replies`` value.

    NOTE(review): the model for ``replies`` is not visible from this notebook.
    It is assumed to be either a listing-like object exposing
    ``.data.children`` (the same shape ``scrape`` reads from the response) or
    a plain sequence of child nodes — confirm against ``scrp.model``. Any
    other value (e.g. an empty string) yields no children.
    """
    if isinstance(replies, (list, tuple)):
        return list(replies)

    data = getattr(replies, "data", None)
    children = getattr(data, "children", None)
    return list(children) if children else []


def unnest(comments: list[RedditChildren]) -> list[ChildrenT1]:
    """Flatten a nested comment tree into a flat list of comment nodes.

    The tree is walked level by level: every ``ChildrenT1`` node found on the
    current level is collected, and the children of its ``data.replies`` (when
    present) form the next level. Nodes that are not ``ChildrenT1`` instances
    are skipped and not descended into.

    The previous implementation reset its accumulator right before returning
    (so it could only ever return an empty list) and never reset its
    "no replies" counter, which made the exit condition unreachable for most
    inputs.

    Args:
        comments: Top-level children of a comment listing.

    Returns:
        Every ``ChildrenT1`` node in the tree, top-level comments first,
        followed by each deeper level in turn. The nodes are returned as-is
        (their ``data.replies`` attribute is left untouched).
    """
    flattened: list[ChildrenT1] = []
    level = list(comments)

    # Terminates once a level produces no further children.
    while level:
        next_level = []

        for child in level:
            if not isinstance(child, ChildrenT1):
                continue

            flattened.append(child)

            replies = child.data.replies
            if replies is None:
                continue
            next_level.extend(_reply_children(replies))

        level = next_level

    return flattened


def on_fail(attempt: int, delay: int, reason: str) -> RedditScraper:
    """React to a failed scrape: give up, or back off and rotate the scraper.

    Args:
        attempt: Number of attempts made so far.
        delay: Seconds to sleep before a new scraper is created.
        reason: Short description of the failure, included in the error text.

    Returns:
        A fresh scraper obtained from ``get_scraper()``.

    Raises:
        RuntimeError: If ``attempt`` is greater than 5.
    """
    exhausted = attempt > 5
    if exhausted:
        message = f"Failed to scrape comments; {reason}"
        raise RuntimeError(message)

    # Back off first, then switch to a new scraper for the next try.
    sleep(delay)
    fresh_scraper = get_scraper()
    return fresh_scraper


@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=4),
    retry=retry_if_exception_type(HTTPError),
    before_sleep=lambda _: print("Failed to scrape comments; retrying..."),
    reraise=True,
)
def _scrape_comments_with_retry(
    scraper: RedditScraper, permalink: str
) -> list[RedditListing]:
    """Fetch the comment listings for ``permalink``, retrying on ``HTTPError``.

    The exception must propagate out of this function for the retry policy to
    see it: at most two attempts are made with an exponential wait in between,
    and the last ``HTTPError`` is re-raised once the attempts are used up.
    """
    return scraper.comments(permalink, limit=100, sort="controversial")


def try_scrape_comments(
    scraper: RedditScraper, permalink: str
) -> list[RedditListing] | None:
    """Scrape the comments of one post, returning ``None`` on HTTP failure.

    Previously the ``HTTPError`` was caught inside the retried function
    itself, so the retry policy never observed a failure and never retried.
    The retry now wraps the raw request, and only the final failure is
    translated into ``None`` for the caller.

    Args:
        scraper: Scraper used to issue the request.
        permalink: Permalink of the post whose comments are requested.

    Returns:
        The listings returned by ``scraper.comments`` (requested with
        ``limit=100`` and ``sort="controversial"``), or ``None`` when the
        request still fails with ``HTTPError`` after the retries.
    """
    try:
        return _scrape_comments_with_retry(scraper, permalink)
    except HTTPError:
        return None


def scrape(permalinks: Iterable[str]) -> pl.DataFrame:
    """Scrape the comments of every post in ``permalinks`` into one frame.

    Posts are processed one after another. When a request fails or yields no
    comments, the scraper is replaced through ``on_fail`` (which also sleeps),
    an additional back-off growing with the number of consecutive failures is
    applied, and the same post is tried again. After 5 consecutive failures
    the loop stops early and whatever was collected so far is returned.

    Fixes over the previous version: the accumulator frame is initialised
    before it is read (it used to raise ``UnboundLocalError``), and the
    iterator is advanced after each successful post (the same permalink used
    to be scraped forever). Because the current permalink is ``None`` after a
    complete run, the closing message now reports the number of posts scraped
    instead of a permalink.

    Args:
        permalinks: Permalinks of the posts to scrape.

    Returns:
        A frame with the scraped comments; rows of the most recently scraped
        post come first.

    Raises:
        RuntimeError: If no comments were scraped at all.
    """
    max_attempts = 5

    permalinks = iter(permalinks)
    permalink = next(permalinks, None)

    scraper = get_scraper()

    df: pl.DataFrame | None = None
    retry_attempts = 0
    scraped_posts = 0

    while permalink and retry_attempts < max_attempts:
        failure = None
        comments = []

        response = try_scrape_comments(scraper, permalink)
        if response is None:
            failure = "HTTPError"
        else:
            # The listing at index 1 is the one whose children are unnested
            # (presumably the comment listing — confirm against the API).
            comments = unnest(response[1].data.children)
            # NOTE(review): an empty result is treated as a failure, so a post
            # that genuinely has no comments also consumes retries.
            if len(comments) == 0:
                failure = "API returned no data"

        if failure is not None:
            scraper = on_fail(retry_attempts, 60, failure)
            retry_attempts += 1
            sleep(30 * retry_attempts)
            continue

        retry_attempts = 0

        # NOTE(review): assumes the frame built from the comment nodes exposes
        # a top-level "replies" column — confirm against the model.
        batch = pl.DataFrame(comments).drop("replies")
        df = batch if df is None else batch.vstack(df)

        # Progress output only; fall back to the full permalink when it has
        # fewer path segments than expected instead of aborting the run.
        parts = permalink.split("/")
        short = parts[4] if len(parts) > 4 else permalink
        print(f"scraped {len(comments)} ({df.height}) rows for permalink '{short}'.")

        scraped_posts += 1
        permalink = next(permalinks, None)

    if df is None:
        raise RuntimeError("No data was scraped.")

    if retry_attempts >= max_attempts:
        print("Failed to scrape comments; exited early...")

    print(f"Finished scraping {df.height} comments from {scraped_posts} post(s).")

    return df

In [None]:
# Name of the dataset; the posts are read from output/<file>.parquet.
file = "combined"
path = Path(f"output/{file}.parquet".lower())

# Fail fast when the input is missing, otherwise load the posts.
if not path.exists():
    raise ValueError("failed to read parquet, file does not exist.")

df_posts = pl.read_parquet(path)

In [None]:
# Hand the "permalink" column of the loaded posts to the scraper.
permalinks = df_posts.get_column("permalink")
df = scrape(permalinks)

In [None]:
# The scraped comments are stored next to the input as output/<file>_comments.parquet.
path = Path(f"output/{file}_comments.parquet".lower())

# Never overwrite an existing result.
if not path.exists():
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_parquet creates the file itself. Pre-opening the path in text mode
    # only produced an unused handle and left an empty file behind whenever
    # the write failed, which then blocked every later run.
    df.write_parquet(path)
else:
    print("failed to write parquet, file already exists.")