In [None]:
from pathlib import Path
from time import sleep
from typing import Iterable

import polars as pl
from httpx import HTTPError, HTTPTransport
from swiftshadow.classes import ProxyInterface

from scrp.client import RateLimitError, RedditScraper
from scrp.model import ChildrenT1

In [None]:
PROXY_MANAGER = ProxyInterface(protocol="http", autoRotate=True, autoUpdate=False)
await PROXY_MANAGER.async_update()

def get_scraper():
    """Build a RedditScraper whose traffic is mounted on a proxy from PROXY_MANAGER.

    A proxy is requested from the shared manager on every call, its address
    is printed, and a scraper is returned with an ``HTTPTransport`` for that
    proxy mounted under the ``"<proxy.protocol>://"`` key.
    """
    proxy = PROXY_MANAGER.get()
    proxy_url = proxy.as_string()
    print(f"using new proxy '{proxy_url}'")

    # NOTE(review): the mount key is derived from the proxy's own protocol.
    # If RedditScraper forwards `mounts` to an httpx client, only requests
    # whose URL scheme matches this key would go through the proxy -- confirm
    # that https traffic is actually proxied.
    transport = HTTPTransport(proxy=proxy_url)
    mounts = {f"{proxy.protocol}://": transport}

    return RedditScraper(mounts=mounts)

In [None]:
def on_fail(attempt: int, delay: int | None = None) -> RedditScraper | None:
    """Back off after a failed request and hand out a fresh scraper.

    Args:
        attempt: Zero-based count of retries already performed for the
            current failure streak.
        delay: Seconds to sleep before retrying. Defaults to 60 when None.

    Returns:
        A new scraper from ``get_scraper()``, or None once five retries
        have already been used, in which case no sleep happens.
    """
    # `attempt` is zero-based, so values 0..4 are the five permitted retries.
    # The previous `attempt > 5` check allowed a sixth retry, contradicting
    # the "after 5 retries" messages printed by the caller.
    if attempt >= 5:
        return None

    if delay is None:
        delay = 60

    sleep(delay)

    return get_scraper()


def scrape_comments(permalinks: Iterable[str]) -> pl.DataFrame:
    """Scrape the comments of every permalink into a single DataFrame.

    For each permalink the scraper's ``comments`` endpoint is queried with
    ``limit=10``. The second element of the response is taken as the comment
    listing, and only children that are ``ChildrenT1`` instances are kept.

    Failure handling:
        * A ``RateLimitError`` or ``HTTPError`` triggers ``on_fail`` (sleep,
          then a new scraper) and the *same* permalink is requested again,
          so a transient error does not drop that permalink.
        * An empty result also triggers ``on_fail``, but the loop then moves
          on to the next permalink.
        * The retry counter is shared and only reset after a permalink
          yields data. Once ``on_fail`` returns None the scrape stops and
          whatever was collected so far is returned.

    Args:
        permalinks: Permalinks of the posts whose comments are scraped.

    Returns:
        All scraped comment rows stacked vertically.

    Raises:
        RuntimeError: If no comment was scraped at all.
    """
    scraper = get_scraper()

    retry_attempts = 0
    df = None

    for permalink in permalinks:
        # Keep requesting this permalink until it succeeds or retries run out.
        while True:
            try:
                search = scraper.comments(permalink, limit=10)
                break
            except RateLimitError as e:
                delay = e.retry_after
                give_up_message = "still getting rate limit errors after 5 retries."
            except HTTPError:
                delay = None
                give_up_message = "still getting errors after 5 retries."

            scraper = on_fail(retry_attempts, delay=delay)
            retry_attempts += 1
            if scraper is None:
                print(give_up_message)
                break

        if scraper is None:
            # Retries exhausted: stop scraping, keep what was collected.
            break

        comments = search[1]
        data = [
            child.data
            for child in comments.data.children
            if isinstance(child, ChildrenT1)
        ]

        if not data:
            # NOTE(review): a post without comments cannot be told apart from
            # a blocked response here, so an empty result is counted as a
            # failure (proxy rotation + back-off) before moving on.
            scraper = on_fail(retry_attempts)
            retry_attempts += 1
            if scraper is None:
                print("stopped receiving data after 5 retries.")
                break
            continue

        retry_attempts = 0

        if df is None:
            df = pl.DataFrame(data)
        else:
            df = df.vstack(pl.DataFrame(data))

        print(f"scraped {len(data)} ({df.height}) rows for permalink '{permalink}'.")

        # The listing's `after` cursor is deliberately not used as a stop
        # condition: it only says this post's listing has no further page,
        # and breaking on it ended the scrape after the first permalink.

    if df is None:
        raise RuntimeError("no data was scraped.")

    print(f"finished scraping {df.height} comments.")

    return df

In [None]:
# Base name of the dataset; the posts are read from output/<file>.parquet.
file = "python_1000"

path = Path(f"output/{file}.parquet".lower())

# Without the posts file there is nothing to scrape comments for, so the
# session is ended.
if not path.exists():
    print("failed to read parquet, file does not exist.")
    exit()
else:
    df = pl.read_parquet(path)

In [None]:
# Scrape the comments for every post loaded above; `df` is rebound to the
# resulting comments frame.
permalinks = df.select("permalink").to_series()
df = scrape_comments(permalinks)

path = Path(f"output/{file}_comments.parquet".lower())

# An existing output file is never overwritten.
if not path.exists():
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_parquet is given the path and writes the file itself. The former
    # surrounding open(path, "w") only held an unused text-mode handle on the
    # same file while it was being written.
    df.write_parquet(path)
else:
    print("failed to write parquet, file already exists.")