In [None]:
# !pip install apify-client aiohttp asyncio pandas python-dotenv tqdm beautifulsoup4 pyarrow

In [3]:
#!pip install nest_asyncio



In [6]:
import os
import asyncio
import logging
from logging.handlers import RotatingFileHandler

import nest_asyncio
nest_asyncio.apply()

import pandas as pd
from apify_client import ApifyClient
from bs4 import BeautifulSoup
from tqdm import tqdm


# -----------------------------
# 1. API Token
# -----------------------------
# SECURITY: the Apify token was previously hard-coded on this line, exposing the
# credential to anyone who can read the notebook. It is now taken from the
# APIFY_API_TOKEN environment variable; the old token should be revoked.
# An unset variable yields an empty string, so authentication then fails at
# request time rather than at import time.
API_TOKEN = os.environ.get("APIFY_API_TOKEN", "")

# Free actor used for scraping.
# NOTE(review): the captured output of this notebook ends with
# "ApifyApiError: Actor with this name was not found" - confirm that this
# actor ID still exists before relying on it.
ACTOR_ID = "pocesar/stepstone-scraper"


# -----------------------------
# 2. Search terms
# -----------------------------
# Job titles to search for. Each entry is passed as the "query" input of its
# own actor run and is also stored in the "search_term" column of the result.
# The list mixes English and German titles.
SEARCH_TERMS = [
    "Business Transformation Analyst",
    "Digital Process Analyst",
    "AI Project Manager",
    "AI Product Manager",
    "AI Governance Analyst",
    "AI Automation Specialist",
    "Prompt Engineer",
    "Junior Automation Specialist",
    "Prozessmanager/in",
    "KI-Manager/in",
    "Prozessmanager/in – RPA",
    "KI-Prompter",
    "Prozessmanager/in – RPA (Junior)"
]

# Keys copied from every scraped job item into the result rows (read with
# `job.get(field)`, so a key missing from an item becomes None).
# NOTE(review): presumably these match the actor's output schema - confirm
# against the actor's documentation.
FIELDS = [
    "title",
    "company",
    "location",
    "employmentType",
    "seniority",
    "url",
    "skills",
    "languages",
    "education",
    "experience",
    "description"
]


# -----------------------------
# 3. Logging
# -----------------------------
# Dedicated logger for the scraper; INFO and above are written to a rotating
# log file (about 2 MB per file, 3 backups kept).
logger = logging.getLogger("stepstone_scraper")
logger.setLevel(logging.INFO)

# The notebook cell may be re-run many times in one kernel. Close and detach
# any handlers left over from a previous run so that file handles on
# scraper.log are released instead of being silently dropped while still open.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

handler = RotatingFileHandler(
    "scraper.log",
    maxBytes=2_000_000,
    backupCount=3,
    encoding="utf-8"
)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)

logger.addHandler(handler)


# -----------------------------
# 4. HTML cleaner
# -----------------------------
def clean_html(text):
    """Strip HTML markup from *text* and return the visible text.

    Text fragments are stripped of surrounding whitespace and joined with a
    single space. Any falsy input (``None``, empty string, ...) yields ``None``.
    """
    if text:
        parsed = BeautifulSoup(text, "html.parser")
        return parsed.get_text(separator=" ", strip=True)
    return None


# -----------------------------
# 5. Async actor runner
# -----------------------------
# Module-level Apify client authenticated with API_TOKEN; run_actor_async uses
# it to start actor runs and to read their datasets.
client = ApifyClient(API_TOKEN)

async def run_actor_async(search_term: str):
    """Run the Stepstone actor for one search term and return its items.

    The Apify client used here is synchronous, so the actor call and the
    dataset download are executed in the event loop's default thread pool.
    Calling them directly inside the coroutine would block the loop and make
    all searches run one after another.

    Args:
        search_term: Job title passed to the actor as its ``query`` input.

    Returns:
        A ``(search_term, items)`` tuple, where ``items`` is the list of
        records read from the run's default dataset (empty if the client
        returned no run object).

    Raises:
        Whatever the Apify client raises (e.g. when the actor does not exist);
        errors are not swallowed here.
    """
    logger.info(f"Запуск скрапинга для: {search_term}")

    def _run_and_fetch():
        # Blocking section: start the actor, wait for it, download the dataset.
        run = client.actor(ACTOR_ID).call(
            run_input={
                "query": search_term,      # the actor's search-query parameter
                "location": "Germany",
                "maxItems": 150
            }
        )
        # NOTE(review): the client's `call` is documented as possibly
        # returning None instead of a run dict - guard so that subscripting
        # does not fail with an opaque TypeError.
        if run is None:
            logger.warning("No run object returned for search term: %s", search_term)
            return []

        dataset_id = run["defaultDatasetId"]
        return client.dataset(dataset_id).list_items().items

    loop = asyncio.get_running_loop()
    items = await loop.run_in_executor(None, _run_and_fetch)

    logger.info(f"Получено {len(items)} вакансий для: {search_term}")
    return search_term, items


# -----------------------------
# 6. Async scraping
# -----------------------------
async def scrape_all_async():
    """Scrape every entry of SEARCH_TERMS and collect the jobs in a DataFrame.

    One ``run_actor_async`` awaitable is created per search term; results are
    consumed in completion order while a tqdm progress bar is advanced.

    Returns:
        A DataFrame with one row per job. Columns are the FIELDS keys followed
        by ``description_clean`` (the description with HTML removed) and
        ``search_term`` (the term that produced the row). The columns are
        present even when no job was found, so the saved files always have a
        header and downstream column access does not fail on an empty result.

    Raises:
        Any exception raised by ``run_actor_async`` propagates to the caller.
    """
    tasks = [run_actor_async(term) for term in SEARCH_TERMS]
    results = []

    for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Scraping Stepstone"):
        search_term, items = await coro

        for job in items:
            row = {field: job.get(field) for field in FIELDS}
            row["description_clean"] = clean_html(job.get("description"))
            row["search_term"] = search_term
            results.append(row)

    # Same order as the keys of every row dict; stating it explicitly keeps
    # the schema stable when `results` is empty.
    columns = [*FIELDS, "description_clean", "search_term"]
    return pd.DataFrame(results, columns=columns)


# -----------------------------
# 7. Save results
# -----------------------------
def save_results(df):
    """Persist *df* as ``results.csv`` (UTF-8) and ``results.parquet``.

    Both files are written without the index column; the number of saved rows
    is logged afterwards.
    """
    export_options = {"index": False}
    df.to_csv("results.csv", encoding="utf-8", **export_options)
    df.to_parquet("results.parquet", **export_options)

    row_count = len(df)
    logger.info(f"Сохранено {row_count} строк")


# -----------------------------
# 8. Entry point
# -----------------------------
# Script / notebook-cell entry point: scrape, preview, save.
if __name__ == "__main__":
    logger.info("Старт асинхронного скрапинга Stepstone")

    # Drive the whole scrape to completion on an event loop.
    # NOTE(review): presumably the nest_asyncio.apply() call next to the
    # imports is what allows asyncio.run() inside an already-running notebook
    # loop - confirm before removing it.
    df = asyncio.run(scrape_all_async())

    # Quick visual sanity check: first rows and total row count.
    print(df.head())
    print(f"Всего строк: {len(df)}")

    # Write results.csv / results.parquet.
    save_results(df)

    logger.info("Готово!")


Scraping Stepstone:   0%|                                                                       | 0/13 [00:02<?, ?it/s]
  for group in groupby(strings, lambda s: s[0] == first[0])) \
  for group in groupby(strings, lambda s: s[0] == first[0])) \


ApifyApiError: Actor with this name was not found