In [1]:
import requests
from bs4 import BeautifulSoup
import os
import csv
import time

# Crawl cancer.gov: for every cancer type listed on /types, locate its
# health-professional (HP) page, download each PDQ document linked from it as
# plain text, and write a CSV index describing the saved files.

BASE_URL = "https://www.cancer.gov"
START_URL = f"{BASE_URL}/types"
CSV_FILE = "pdq_index_requests.csv"
TEXT_FOLDER = "pdq_texts_requests"
# Seconds to wait for a single HTTP response. Without a timeout `requests`
# waits forever, so one stalled connection would hang the whole crawl.
REQUEST_TIMEOUT = 10
# Suffix shared by all PDQ document URLs; stripped to build file names.
PDQ_SUFFIX = "-pdq"

os.makedirs(TEXT_FOLDER, exist_ok=True)

headers = {
    "User-Agent": "Mozilla/5.0"
}


def _fetch_soup(url):
    """Download *url* and return its parsed HTML.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the response body.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status (so error pages are never parsed as content).
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


# Fetch the /types landing page. A failure here is fatal on purpose: without
# the list of cancer types there is nothing to crawl.
soup = _fetch_soup(START_URL)

# Collect the unique cancer-type links, preserving their order on the page.
option_tags = soup.select("option[data-link^='/types/']")
unique_links = []
seen = set()

for opt in option_tags:
    link = opt.get("data-link")
    if link and link not in seen:
        seen.add(link)
        unique_links.append(BASE_URL + link)

results = []

for type_url in unique_links:
    print(f"\n🔍 Обработка: {type_url}")
    type_name = type_url.rstrip("/").split("/")[-1]

    try:
        # Fetch the cancer-type page.
        soup = _fetch_soup(type_url)

        # Find the first link pointing to the HP version of this type.
        hp_url = None
        for a in soup.select("a[href*='/hp']"):
            href = a.get("href", "")
            if "/types/" in href and "/hp" in href:
                hp_url = href if href.startswith("http") else BASE_URL + href
                break

        if not hp_url:
            print("❌ HP-версия не найдена")
            continue

        print(f"🩺 Найдена HP-версия: {hp_url}")
        time.sleep(0.5)

        # Load the HP page.
        hp_soup = _fetch_soup(hp_url)

        # Find every PDQ document linked from the HP page.
        pdq_links = hp_soup.select("a[href*='/hp/'][href$='-pdq']")
        print(f"📄 Найдено PDQ-документов: {len(pdq_links)}")

        for link in pdq_links:
            pdq_url = link.get("href")
            pdq_url = pdq_url if pdq_url.startswith("http") else BASE_URL + pdq_url
            pdq_title = link.get_text(strip=True)
            # Strip only the trailing "-pdq"; str.replace would also remove
            # any occurrence of it in the middle of the slug.
            pdq_slug = pdq_url.split("/")[-1]
            if pdq_slug.endswith(PDQ_SUFFIX):
                pdq_slug = pdq_slug[:-len(PDQ_SUFFIX)]
            text_filename = f"{type_name}__{pdq_slug}.txt"
            text_path = os.path.join(TEXT_FOLDER, text_filename)

            try:
                pdq_soup = _fetch_soup(pdq_url)

                # Try the known PDQ content containers in order of preference.
                content_div = (
                    pdq_soup.find("div", id="pdq-body") or
                    pdq_soup.find("div", id="main-content") or
                    pdq_soup.find("div", class_="pdq-inner") or
                    pdq_soup.find("article")
                )

                if not content_div:
                    print(f"⚠️ Контент не найден на {pdq_url}")
                    continue

                text = content_div.get_text(separator="\n", strip=True)
                if not text:
                    print(f"⚠️ Пустой текст на {pdq_url}")
                    continue

                with open(text_path, "w", encoding="utf-8") as f:
                    f.write(text)

                results.append({
                    "type": type_name,
                    "title": pdq_title,
                    "url": pdq_url,
                    "file": text_filename
                })

            except Exception as e:
                # Best effort: one broken document must not abort the rest.
                print(f"❌ Ошибка на {pdq_url}: {e}")
                continue

        # Pause between cancer types to avoid hammering the server.
        time.sleep(1)

    except Exception as e:
        # Best effort: report the failure and move on to the next type.
        print(f"❌ Ошибка на {type_url}: {e}")

# Write the CSV index (skipped entirely when nothing was downloaded).
if results:
    with open(CSV_FILE, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["type", "title", "url", "file"])
        writer.writeheader()
        writer.writerows(results)

print(f"\n✅ Готово! Найдено {len(results)} PDQ-документов. Сохранено в {CSV_FILE} и папке {TEXT_FOLDER}/")


🔍 Обработка: https://www.cancer.gov/types/leukemia
🩺 Найдена HP-версия: https://www.cancer.gov/types/leukemia/hp
📄 Найдено PDQ-документов: 12

🔍 Обработка: https://www.cancer.gov/types/aya
❌ HP-версия не найдена

🔍 Обработка: https://www.cancer.gov/types/adrenocortical
🩺 Найдена HP-версия: https://www.cancer.gov/types/adrenocortical/hp
📄 Найдено PDQ-документов: 2

🔍 Обработка: https://www.cancer.gov/types/lymphoma
🩺 Найдена HP-версия: https://www.cancer.gov/types/lymphoma/hp
📄 Найдено PDQ-документов: 10

🔍 Обработка: https://www.cancer.gov/types/anal
🩺 Найдена HP-версия: https://www.cancer.gov/types/anal/hp
📄 Найдено PDQ-документов: 2

🔍 Обработка: https://www.cancer.gov/types/gi-neuroendocrine-tumors
🩺 Найдена HP-версия: https://www.cancer.gov/types/gi-neuroendocrine-tumors/hp
📄 Найдено PDQ-документов: 2

🔍 Обработка: https://www.cancer.gov/types/brain
🩺 Найдена HP-версия: https://www.cancer.gov/types/brain/hp
📄 Найдено PDQ-документов: 9

🔍 Обработка: https://www.cancer.gov/types/ski