In [1]:
import asyncio
import re
import json
import fitz
import pytesseract
from datetime import datetime
from bs4 import BeautifulSoup, Tag
from playwright.async_api import async_playwright
import requests
import fitz
import pytesseract

REFERENCE_HEADINGS = ["references", "bibliography", "works cited", "reference", "literature cited"]
RE_BRACKET_CITATION = re.compile(r"\[\s*\d+(?:\s*[,;]\s*\d+)*\s*\]")
RE_AUTHOR_YEAR = re.compile(r"\([A-Za-z]+(?:\s+et\s+al)?(?:,\s*\d{4})\)")
RE_FIG_TABLE = re.compile(r"^(figure|fig\.|table)\s*\d+", re.IGNORECASE | re.MULTILINE)

async def fetch_url_async(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, timeout=60000)
        html = await page.content()
        await browser.close()
        return html, "text/html"

def strip_references(text):
    lower = text.lower()
    min_index = None
    for h in REFERENCE_HEADINGS:
        for sep in ["\n", "\r\n"]:
            idx = lower.find(sep + h)
            if idx != -1 and (min_index is None or idx < min_index):
                min_index = idx
        idx = lower.find(h + "\n")
        if idx != -1 and (min_index is None or idx < min_index):
            min_index = idx
    return text[:min_index].strip() if min_index is not None else text.strip()

def clean_text(text):
    t = RE_BRACKET_CITATION.sub("", text)
    t = RE_AUTHOR_YEAR.sub("", t)
    t = RE_FIG_TABLE.sub("", t)
    t = re.sub(r'[\n\t]+', ' ', t)
    t = re.sub(r'\s{2,}', ' ', t)
    t = re.sub(r"[^A-Za-z0-9\s\.,;:\-\'\"\(\)\[\]\?\!]", "", t)
    return t.strip()

async def process_url_async(url):
    try:
        if url.lower().endswith(".pdf"):
            text = await fetch_pdf_text(url)  # Use aiohttp + fitz
            title = "PDF Document"
            src_type = "pdf"
        else:
            html, _ = await fetch_url_async(url)
            title, text = extract_text_from_html(html)
            src_type = "page"

        record = {
            "title": title or "Untitled",
            "link": url,
            "text": text,
            "source_type": src_type,
            "accessed_at": datetime.utcnow().isoformat() + "Z"
        }
        save_json_to_txt(record)
    except Exception as e:
        print(f"Error processing {url}: {e}")

async def fetch_pdf_text(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
            r.raise_for_status()
            pdf_bytes = await r.read()
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    pages_text = []
    for page in doc:
        text = page.get_text()
        if not text.strip():
            pix = page.get_pixmap(dpi=200)
            text = pytesseract.image_to_string(pix.get_pil_image())
        pages_text.append(text)
    return "\n\n".join(pages_text)

def extract_text_from_html(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "header", "footer", "nav", "aside", "form", "noscript"]):
        tag.decompose()

    junk_keywords = ["cookie", "subscribe", "advert", "promo", "share", "popup", "overlay"]
    for div in soup.find_all("div"):
        try:
            for c in div.get("class") or []:
                if isinstance(c, str) and any(k in c.lower() for k in junk_keywords):
                    div.decompose()
                    break
        except Exception:
            continue

    text = soup.get_text(separator="\n")
    text = re.sub(r"\s{2,}", "\n", text).strip()
    text = strip_references(text)
    text = clean_text(text)

    title = soup.title.string.strip() if soup.title and soup.title.string else (soup.find("h1").get_text().strip() if soup.find("h1") else "Untitled")
    return title, text


def save_json_to_txt(record, filename="data_output.txt"):
    with open(filename, "a", encoding="utf8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

async def process_url_async(url):
    try:
        if url.lower().endswith(".pdf"):
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            text = extract_pdf_bytes(r.content)
            title = "PDF Document"
            src_type = "pdf"
        else:
            html, _ = await fetch_url_async(url)
            title, text = extract_text_from_html(html)
            src_type = "page"

        record = {
            "title": title or "Untitled",
            "link": url,
            "text": text,
            "source_type": src_type,
            "accessed_at": datetime.utcnow().isoformat() + "Z"
        }
        save_json_to_txt(record)
    except Exception as e:
        print(f"Error processing {url}: {e}")

async def main():
    urls = [
        "https://www.cdc.gov/tb/treatment/index.html",
        "https://www.mayoclinic.org/diseases-conditions/lung-cancer/diagnosis-treatment/drc-20374627",
        "https://www.cancer.org/cancer/understanding-cancer/what-is-cancer.html",
        "https://ajronline.org/doi/full/10.2214/AJR.07.3896",
        "https://pmc.ncbi.nlm.nih.gov/articles/PMC3876596/",
        "https://jphe.amegroups.org/article/view/3668/pdf",
        "https://journals.plos.org/plosone/article?id=10.1371%2Fjournal.pone.0161176",
        "https://jamanetwork.com/journals/jama/fullarticle/2777242",
        "https://pmc.ncbi.nlm.nih.gov/articles/PMC11003524/",
        "https://www.sciencedirect.com/science/article/pii/S2950162824000195?utm_source=chatgpt.com",
        "https://www.mdpi.com/2075-4418/15/7/908",
        "https://pmc.ncbi.nlm.nih.gov/articles/PMC12000946/",
        "https://ascopubs.org/doi/10.1200/GO.21.00100",
        "https://ccts.amegroups.org/article/view/46726/html",
        "https://pmc.ncbi.nlm.nih.gov/articles/PMC8113854/",
        "https://bmccancer.biomedcentral.com/articles/10.1186/s12885-024-13350-y",
        "https://www.e-emj.org/journal/view.php?number=1607",
        "https://arxiv.org/pdf/2102.10919",
        "https://arxiv.org/pdf/2102.10919",
        "https://arxiv.org/pdf/2007.14895",
        "https://www.cancer.gov/about-cancer/treatment/types",
        "https://www.cancer.gov/about-cancer/treatment/types/photodynamic-therapy",
        "https://www.cancer.gov/about-cancer/treatment/types/immunotherapy",
        "https://www.cancer.gov/about-cancer/treatment/types/hyperthermia",
        "https://www.cancer.gov/about-cancer/treatment/types/hormone-therapy",
        "https://www.cancer.gov/about-cancer/treatment/types/chemotherapy",
        "https://www.cancer.gov/about-cancer/treatment/types/targeted-therapies",
        "https://www.cancer.gov/about-cancer/treatment/types/surgery",
        "https://www.cancer.gov/about-cancer/treatment/types/stem-cell-transplant",
        "https://www.cancer.gov/about-cancer/treatment/types/radiation-therapy",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types.html",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types/stem-cell-transplant.html",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types/targeted-therapy.html",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types/chemotherapy.html",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types/angiogenesis-inhibitors.html",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types/hyperthermia.html",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types/lasers-in-cancer-treatment.html",
        "https://www.cancer.org/cancer/managing-cancer/treatment-types/tumor-treating-fields.html",
        "https://www.nccih.nih.gov/health/cancer-and-complementary-health-approaches-what-you-need-to-know",
        "https://www.who.int/news-room/fact-sheets/detail/cancer?utm_source=chatgpt.com"
    ]
    for u in urls:
        await process_url_async(u)
        await asyncio.sleep(1)

if __name__ == "__main__":
    await main()

Error processing https://arxiv.org/pdf/2102.10919: Page.goto: Download is starting
Call log:
  - navigating to "https://arxiv.org/pdf/2102.10919", waiting until "load"

Error processing https://arxiv.org/pdf/2102.10919: Page.goto: Download is starting
Call log:
  - navigating to "https://arxiv.org/pdf/2102.10919", waiting until "load"

Error processing https://arxiv.org/pdf/2007.14895: Page.goto: Download is starting
Call log:
  - navigating to "https://arxiv.org/pdf/2007.14895", waiting until "load"

