In [None]:
from time import sleep
from pathlib import Path

import polars as pl
from httpx import HTTPError, HTTPTransport
from swiftshadow.classes import ProxyInterface

from scrp.client import RedditScraper
from scrp.model import ChildrenT3

ModuleNotFoundError: No module named 'scrp'

In [2]:
# Search terms available to the scraping cell, which picks its term from this list.
SEARCH_TERMS = [
    "Asylansøgere",
    "Flygtninge",
    "Indvandrere",
    "Migranter",
    "Udlændinge",
]

# Shared proxy pool; `get_scraper` takes one proxy from it per scraper.
# NOTE(review): autoUpdate=False presumably keeps the constructor from loading
# the proxy list itself, leaving that to the awaited call below — confirm
# against the swiftshadow documentation.
PROXY_MANAGER = ProxyInterface(protocol="http", autoRotate=True, autoUpdate=False)
# Top-level await: relies on the notebook kernel allowing `await` in a cell.
await PROXY_MANAGER.async_update()

2025-02-24 17:42:31,322 - swiftshadow [INFO]:Cache Expired


In [3]:
def get_scraper():
    """Create a `RedditScraper` whose traffic is mounted on a proxy from `PROXY_MANAGER`.

    Takes one proxy from the shared pool, announces it on stdout and mounts
    an `HTTPTransport` (5 connection retries) for that proxy.

    Returns:
        A new `RedditScraper` configured with the proxied transport.
    """
    chosen = PROXY_MANAGER.get()
    proxy_url = chosen.as_string()
    print(f'Using new proxy "{proxy_url}"')

    # NOTE(review): the mount pattern is the proxy's own scheme, so the proxied
    # transport only applies to request URLs with that scheme — confirm this
    # matches the URLs RedditScraper actually requests.
    mount_pattern = f"{chosen.protocol}://"
    proxied_transport = HTTPTransport(proxy=proxy_url, retries=5)

    return RedditScraper(mounts={mount_pattern: proxied_transport})

In [4]:
def on_fail(
    attempt: int, max_retries: int = 5, delay: float = 65
) -> RedditScraper | None:
    """Back off after a failed request and hand out a scraper on a fresh proxy.

    Args:
        attempt: Number of retries already made for the current page
            (0 for the first failure).
        max_retries: Retry budget. Once `attempt` reaches it, the caller is
            told to give up.
        delay: Seconds to sleep before building the replacement scraper.

    Returns:
        A new scraper from `get_scraper()`, or `None` when the retry budget
        is exhausted.
    """
    # `attempt` is 0-based, so `>=` yields exactly `max_retries` retries.
    # The previous `attempt > 5` allowed a sixth retry.
    if attempt >= max_retries:
        return None
    sleep(delay)
    return get_scraper()


def search_term(term: str, max_pages: int = 1000) -> pl.DataFrame:
    """Scrape paginated search results for `term` into a single DataFrame.

    Pages of up to 100 results are requested one after another, following the
    `after` cursor returned with each page. On an `HTTPError` or an empty
    page, `on_fail` is asked for a replacement scraper and the same page is
    requested again; when `on_fail` returns `None` scraping stops and whatever
    was collected so far is returned.

    Args:
        term: Search query passed to the scraper.
        max_pages: Upper bound on the number of requests (including retries).

    Returns:
        All collected `ChildrenT3` payloads stacked into one DataFrame.

    Raises:
        RuntimeError: If not a single row could be scraped.
    """
    scraper = get_scraper()

    retry_attempts = 0
    after, df = None, None

    for _ in range(max_pages):
        try:
            # Send the cursor of the last accepted page. Passing `after=None`
            # here would re-fetch the first page on every iteration.
            search = scraper.search(term, limit=100, after=after, show="all")
        except HTTPError:
            scraper = on_fail(retry_attempts)
            retry_attempts += 1
            if scraper is None:
                print("still getting errors after 5 retries.")
                break
            continue

        data = [
            child.data
            for child in search.data.children
            if isinstance(child, ChildrenT3)
        ]

        if not data:
            # Empty page: keep the current cursor so the retry asks for the
            # same page again instead of skipping ahead or starting over.
            scraper = on_fail(retry_attempts)
            retry_attempts += 1
            if scraper is None:
                print("no more data after 5 retries.")
                break
            continue

        # The page is accepted: reset the retry budget and advance the cursor.
        retry_attempts = 0
        after = search.data.after

        page = pl.DataFrame(data)
        df = page if df is None else df.vstack(page)

        print(f"scraped {len(data)} ({df.height}) rows for term '{term}'.")

        if after is None:
            print("no more data.")
            break

    if df is None:
        raise RuntimeError("no data was scraped.")

    return df

In [None]:
# Scrape the first configured term and tag every row with the query it came from.
term = SEARCH_TERMS[0]

df = search_term(term)
df = df.with_columns(search_term=pl.lit(term))

# The file name encodes the term and the row count, lower-cased.
path = Path(f"output/{term}_{df.height}.parquet".lower())

if not path.exists():
    path.parent.mkdir(parents=True, exist_ok=True)
    # Let polars open the file itself. Wrapping this in `open(path, "w")`
    # held an unused text handle on the same path and created the file before
    # the write, so a failed write left an empty file that blocked later runs
    # through the `exists()` guard above.
    df.write_parquet(path)
else:
    print("failed to write parquet, file already exists.")

Using new proxy "http://119.3.113.150:9094"
