In [1]:
import csv
import re
import time
from pathlib import Path
from urllib.parse import urlparse, urlunparse

import requests
from bs4 import BeautifulSoup

In [2]:
# Input: CSV listing the project pages to fetch. Output: CSV of scraped details.
INPUT_CSV = "kpo_project_links.csv"
OUTPUT_CSV = "kpo_projects_details_dirty.csv"

# HTTP headers attached to every request made by the scraper session.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; KPO-scraper/0.3;)"}

# Matches every character that is NOT a digit, comma, dot or hyphen.
MONEY_RE = re.compile(r"[^\d,.-]")
# Matches any run of whitespace characters.
SPACE_RE = re.compile(r"\s+")

In [3]:
def normalize_url(u: str) -> str:
    """Return ``u`` reduced to scheme, host and path, always ending in '/'.

    Surrounding whitespace is stripped first; path parameters, the query
    string and the fragment are discarded.
    """
    parts = urlparse(u.strip())
    bare = urlunparse(parts._replace(params="", query="", fragment=""))
    return bare if bare.endswith("/") else bare + "/"

In [4]:
def t(text: str | None) -> str:
    """Clean up a text fragment.

    Non-breaking spaces become ordinary spaces, every whitespace run
    (including newlines) collapses to a single space, and the result is
    trimmed. ``None`` and the empty string both yield "".
    """
    if text:
        return SPACE_RE.sub(" ", text.replace("\xa0", " ")).strip()
    return ""

In [5]:
def parse_money_pl(s: str | None) -> float | None:
    """Parse a Polish-formatted amount into a float.

    Example: '2 825 534 826,90 zł' -> 2825534826.90

    Returns None for empty input or when the cleaned text is not a valid
    number.
    """
    if not s:
        return None

    # Keep only digits, commas, dots and hyphens.
    digits = MONEY_RE.sub("", t(s))

    # A lone comma located after any hyphen is taken as the decimal
    # separator; dots are then thousands separators and are removed.
    comma_is_decimal = (
        digits.count(",") == 1 and digits.rfind(",") > digits.rfind("-")
    )
    if comma_is_decimal:
        digits = digits.replace(".", "").replace(",", ".")

    try:
        return float(digits)
    except ValueError:
        return None

In [6]:
def extract_pairs_from_info_item(item_div) -> list[tuple[str, str]]:
    """Collect (label, value) pairs from one 'single-project-info-item' element.

    The expected markup repeats the pattern
        <div class="...-text-up">label</div>
        <div class="...-text-down">value</div>
    (sometimes with extra classes such as 'big' or 'blue').

    Args:
        item_div: element exposing ``select``; its matches must expose
            ``get_text`` and ``find_next`` (presumably a bs4 Tag - confirm
            against the caller).

    Returns:
        List of (label, value) tuples, both cleaned with ``t``.
    """
    labels = item_div.select(".single-project-info-item-text-up")
    values = item_div.select(".single-project-info-item-text-down")

    if labels and values:
        # Pair positionally; zip stops at the shorter of the two lists.
        return [
            (t(lab.get_text()), t(val.get_text()))
            for lab, val in zip(labels, values)
        ]

    # Reached only when at least one list is empty. With no labels the loop
    # does nothing; with no values inside the item, look for the next value
    # element after each label instead.
    # NOTE(review): find_next is presumably not limited to item_div - confirm.
    found = []
    for lab in labels:
        label_text = t(lab.get_text())
        follower = lab.find_next(class_="single-project-info-item-text-down")
        value_text = t(follower.get_text()) if follower else ""
        found.append((label_text, value_text))
    return found

In [7]:
def parse_project_page(html: str) -> dict:
    """Extract the project fields from the HTML of a single project page.

    Args:
        html: page markup as text.

    Returns:
        Dict with the keys title, category, beneficiary, project_value_raw,
        project_value, eu_funding_raw, eu_funding, wojewodztwo, powiat,
        program, dzialanie, fundusz and perspektywa. Missing text fields are
        "" and unparseable amounts are None.
    """
    soup = BeautifulSoup(html, "html.parser")

    heading = soup.select_one(".single-project-title h2.big-title")
    title = t(heading.get_text()) if heading else ""

    # Non-empty category labels, comma-separated ("" when there are none).
    category_texts = (
        t(node.get_text())
        for node in soup.select(".single-project-cat .categories span")
    )
    category = ", ".join(text for text in category_texts if text)

    # Lower-cased label -> value; a later duplicate label overwrites an earlier one.
    fields = {}
    for info_item in soup.select(".single-project-info-item"):
        fields.update(
            (label.lower(), value)
            for label, value in extract_pairs_from_info_item(info_item)
            if label and value
        )

    def field(name: str) -> str:
        # Look up a scraped field, defaulting to the empty string.
        return fields.get(name, "")

    project_value_raw = field("wartość projektu")
    eu_funding_raw = field("dofinansowanie z ue")

    return {
        "title": title,
        "category": category,
        "beneficiary": field("nazwa beneficjenta"),
        "project_value_raw": project_value_raw,
        "project_value": parse_money_pl(project_value_raw),
        "eu_funding_raw": eu_funding_raw,
        "eu_funding": parse_money_pl(eu_funding_raw),
        "wojewodztwo": field("województwo"),
        "powiat": field("powiat"),
        "program": field("program"),
        # Accept the label spelled with or without the Polish diacritic.
        "dzialanie": field("działanie") or field("dzialanie"),
        "fundusz": field("fundusz"),
        "perspektywa": field("perspektywa"),
    }

In [8]:
def iter_input_links(path: str):
    """Yield (project_id, url) for every data row of the input CSV.

    Args:
        path: CSV file with a required ``url`` column and an optional
            ``project_id`` column.

    Yields:
        Tuples of the stripped project id ("" when absent) and the URL
        passed through ``normalize_url``.

    Raises:
        KeyError: if the file has no ``url`` column.
    """
    # utf-8-sig drops a leading BOM (otherwise it would be glued onto the
    # first header name) and still reads BOM-less UTF-8. newline="" is what
    # the csv module expects from the file objects it reads.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # DictReader returns None for cells missing from a short row,
            # so coerce to "" before calling string methods.
            project_id = (row.get("project_id") or "").strip()
            url = normalize_url(row["url"] or "")
            yield project_id, url

In [9]:
def load_already_done(path: str) -> set[str]:
    """Return the project_id values already stored in the CSV at ``path``.

    A missing file, or a file without a ``project_id`` column, yields an
    empty set.
    """
    target = Path(path)
    if not target.exists():
        return set()

    with target.open("r", encoding="utf-8-sig") as handle:
        return {
            record["project_id"]
            for record in csv.DictReader(handle)
            if "project_id" in record
        }

In [10]:
def main(request_delay: float = 0.6, timeout: float = 20):
    """Scrape every project listed in INPUT_CSV and append it to OUTPUT_CSV.

    Projects whose id is already present in OUTPUT_CSV are skipped, so an
    interrupted run can be resumed. Each row is flushed to disk as soon as
    it is written.

    Args:
        request_delay: seconds to wait after every HTTP attempt.
        timeout: per-request timeout in seconds.
    """
    done_ids = load_already_done(OUTPUT_CSV)
    print(f"Znaleziono {len(done_ids)} już zapisanych projektów - pomijam je.")

    out_path = Path(OUTPUT_CSV)
    # An existing but empty file still needs the header row.
    write_header = not out_path.exists() or out_path.stat().st_size == 0

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session, \
            open(OUTPUT_CSV, "a", newline="", encoding="utf-8-sig") as out:
        session.headers.update(HEADERS)

        writer = csv.writer(out)
        if write_header:
            writer.writerow([
                "project_id","url","title","category","beneficiary",
                "project_value_raw","project_value",
                "eu_funding_raw","eu_funding",
                "wojewodztwo","powiat","program",
                "dzialanie","fundusz","perspektywa"
            ])

        for idx, (project_id, url) in enumerate(iter_input_links(INPUT_CSV), start=1):
            if project_id in done_ids:
                continue  # do not re-download projects that are already saved

            try:
                resp = session.get(url, timeout=timeout)
            except requests.RequestException as e:
                print(f"[{idx}] {project_id} – błąd sieci: {e}")
                # Keep the delay on failures too, so errors never turn
                # into back-to-back requests.
                time.sleep(request_delay)
                continue
            if resp.status_code != 200:
                print(f"[{idx}] {project_id} – HTTP {resp.status_code}")
                time.sleep(request_delay)
                continue

            details = parse_project_page(resp.text)
            writer.writerow([
                project_id,url,details["title"],details["category"],
                details["beneficiary"],details["project_value_raw"],
                details["project_value"] if details["project_value"] is not None else "",
                details["eu_funding_raw"],
                details["eu_funding"] if details["eu_funding"] is not None else "",
                details["wojewodztwo"],details["powiat"],details["program"],
                details["dzialanie"],details["fundusz"],details["perspektywa"]
            ])
            out.flush()
            # Remember the id so a duplicate later in the input is not
            # fetched and written again. Empty ids are not tracked, so rows
            # without an id are all still processed.
            if project_id:
                done_ids.add(project_id)
            print(f"[{idx}] OK: {project_id}")
            time.sleep(request_delay)

    print("Gotowe!")

if __name__ == "__main__":
    main()

Znaleziono 0 już zapisanych projektów - pomijam je.
[1] OK: 1709965
[2] OK: 1691583
[3] OK: 1757648
[4] OK: 1735570
[5] OK: 1736517
[6] OK: 1681147
[7] OK: 1670468
[8] OK: 1670198
[9] OK: 1746793
[10] OK: 1670486
[11] OK: 1699531
[12] OK: 1681140
[13] OK: 1670448
[14] OK: 1757692
[15] OK: 1670472
[16] OK: 1674610
[17] OK: 1736502
[18] OK: 1746326
[19] OK: 1670466
[20] OK: 1670492
[21] OK: 1757687
[22] OK: 1670470
[23] OK: 1736181
[24] OK: 1670460
[25] OK: 1709911
[26] OK: 1670196
[27] OK: 1670462
[28] OK: 1757689
[29] OK: 1721468
[30] OK: 1705696
[31] OK: 1757630
[32] OK: 1670456
[33] OK: 1674614
[34] OK: 1687061
[35] OK: 1670452
[36] OK: 1691577
[37] OK: 1705563
[38] OK: 1757623
[39] OK: 1695245
[40] OK: 1736174
[41] OK: 1677664
[42] OK: 1736506
[43] OK: 1757381
[44] OK: 1691575
[45] OK: 1681122
[46] OK: 1681118
[47] OK: 1735346
[48] OK: 1691573
[49] OK: 1757626
[50] OK: 1674612
[51] OK: 1681120
[52] OK: 1681124
[53] OK: 1684445
[54] OK: 1695302
[55] OK: 1670446
[56] OK: 1670488
[57] 