In [15]:
import codecs, json, random
from pathlib import Path
import asyncio
from crawl4ai import *
from tqdm import tqdm
import warnings
import time
import datetime


In [16]:
# Root folder for this notebook: the kernel's current working directory.
# `cwd` is a classmethod, so it is called on the class directly.
BASE = Path.cwd()
BASE


PosixPath('/Users/white/shad/Scrapper')

In [17]:
# Input resources and crawl output live in two sub-folders of BASE.
RESOURCES_DIR = BASE / "resources"
SCRAPPED_DATA_DIR = BASE / "scrapped_data"

# Report whether each expected folder is present (nothing is created here).
for directory in (RESOURCES_DIR, SCRAPPED_DATA_DIR):
    print(f"isdir({directory}) = {directory.is_dir()}")


isdir(/Users/white/shad/Scrapper/resources) = True
isdir(/Users/white/shad/Scrapper/scrapped_data) = True


In [18]:
# Path of the JSON file that lists the pages to crawl.
url_fname = RESOURCES_DIR / "web_urls.json"
url_file_found = url_fname.is_file()
print(f"isfile({url_fname}) = {url_file_found}")


isfile(/Users/white/shad/Scrapper/resources/web_urls.json) = True


In [19]:
# Read the file as UTF-8 text (undecodable bytes are silently dropped because
# of errors="ignore") and parse the JSON document in one step.
url_data = json.loads(url_fname.read_text(encoding="utf-8", errors="ignore"))


In [20]:
# Group every document name under the URL it points to, so that each URL is
# kept once even when several names share it.
# url_data is iterated as a mapping: key = document name, value = URL.
names_by_url = {}
for doc_name, doc_url in url_data.items():
    names_by_url.setdefault(doc_url, set()).add(doc_name)

# Keep a single display name per URL: the longest variant. Ties in length are
# broken alphabetically so the choice does not depend on set iteration order,
# which can differ between kernel runs (string hash randomization).
# str.split() + " ".join() collapses runs of whitespace and trims both ends.
url_dict = {
    doc_url: " ".join(
        min(name_variants, key=lambda name: (-len(name), name)).split()
    )
    for doc_url, name_variants in names_by_url.items()
}


In [21]:
# Drop the raw mapping loaded from the JSON file; the cells visible below
# only use url_dict / url_list.
# NOTE(review): once this has run, the cell that builds url_dict cannot be
# re-run without reloading url_data first, and running this cell twice raises
# NameError.
del url_data


In [22]:
# Number of distinct URLs left after grouping names by URL.
document_count = len(url_dict)
print(f"There are {document_count} documents.")


There are 1283 documents.


In [23]:
# Alphabetically ordered URLs; iterating a dict yields its keys, so no
# explicit .keys() / list() conversion is needed before sorting.
url_list = sorted(url_dict)


In [24]:
# Spot-check a few random entries (URL, chosen name, blank separator line).
# The sample size is capped at the number of URLs because random.sample
# raises ValueError when k is larger than the population.
for doc_url in random.sample(population=url_list, k=min(3, len(url_list))):
    print(doc_url)
    print(url_dict[doc_url])
    print()


https://fen.nsu.ru/fen.phtml?topic=specc
Структура кафедр факультета Естественных Наук на осенний и весенний семестры

https://www.nsu.ru/n/economics-department/prepodavateli/3540521/
Профессия экономиста в нефтегазовой промышленности: Образование, исследования и достижения

https://education.nsu.ru/journalism_master_producing/
Магистратура Новосибирского государственного университета: Искусство и технологии видеопроизводства



In [25]:
# Browser-level settings: only the crawler's own logging is turned off.
browser_config = BrowserConfig(verbose=False)

# Markdown conversion: content is passed through a pruning filter and links
# are left out of the generated markdown.
pruning_filter = PruningContentFilter(threshold=0.6)
markdown_generator = DefaultMarkdownGenerator(
    content_filter=pruning_filter,
    options={"ignore_links": True},
)

# Per-request crawl settings shared by every URL.
run_config = CrawlerRunConfig(
    markdown_generator=markdown_generator,
    word_count_threshold=10,  # Minimum words per content block
    excluded_tags=["form", "header"],
    exclude_external_links=True,  # Remove external links
    remove_overlay_elements=True,  # Remove popups/modals
    process_iframes=True,
    verbose=False,
)


In [26]:
async def main():
    """Crawl every URL in ``url_list`` and stream the results to a dated JSONL file.

    Each successfully crawled page is written as one JSON line with the keys
    ``url``, ``name``, ``content``, ``date`` and ``collection_date``. The file
    is flushed after every record, so an interrupted run keeps the records
    written so far. Unsuccessful results and exceptions are counted and
    reported but never stop the loop; a summary is printed at the end.

    Reads the notebook-level names ``SCRAPPED_DATA_DIR``, ``url_list``,
    ``url_dict``, ``browser_config`` and ``run_config``.

    Note: the output file is opened in ``"w"`` mode, so a second run on the
    same calendar day overwrites the file produced by the first one.

    Returns:
        None.
    """
    # 1. Build the output file name from the current date.
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    filename = f"web_scrapped_{current_date}.jsonl"
    file_path = SCRAPPED_DATA_DIR.joinpath(filename)

    # Create the output folder when it is missing so that open() below does
    # not fail with FileNotFoundError on a fresh checkout.
    SCRAPPED_DATA_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Saving data to {file_path}")

    success_count = 0
    fail_count = 0

    # 2. Collect the data.
    with open(file_path, mode="w", encoding="utf-8") as fp:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            for doc_url in tqdm(url_list):
                try:
                    result = await crawler.arun(url=doc_url, config=run_config)

                    if result.success:
                        jsonified_result = {
                            "url": doc_url,
                            "name": url_dict[doc_url],
                            "content": result.markdown.fit_markdown,
                            "date": None,  # Web pages often have no explicit publication date
                            "collection_date": int(time.time()),
                        }
                        fp.write(
                            json.dumps(jsonified_result, ensure_ascii=False) + "\n"
                        )
                        fp.flush()  # Persist each record immediately
                        success_count += 1
                    else:
                        fail_count += 1
                        # Emit a warning but keep the script running.
                        warnings.warn(
                            f"FAIL {doc_url}: Status={result.status_code}, Error={result.error_message}"
                        )
                except Exception as e:
                    # Best effort: one broken page must not abort the whole crawl.
                    fail_count += 1
                    print(f"EXCEPTION {doc_url}: {e}")

    # 3. Final report.
    print("-" * 40)
    print("🎉 Готово!")
    print(f"Всего URLs: {len(url_list)}")
    print(f"✅ Успешно: {success_count}")
    if fail_count > 0:
        print(f"⚠️ Ошибок: {fail_count} (см. предупреждения выше)")
    else:
        print("Ошибок: 0")
    print(f"Файл: {file_path}")


In [27]:
# Run the crawl. A bare top-level `await` is used here because this is a
# notebook cell; main() returns None, so the cell shows only printed output.
await main()


Saving data to /Users/white/shad/Scrapper/scrapped_data/web_scrapped_2026-01-20.jsonl


Error: Failed on navigating ACS-GOTO:
Page.goto: Timeout 60000ms exceeded.
Call log:
  - navigating to "http://www.ict.nsc.ru/", waiting until "domcontentloaded"


Code context:
 713                                   tag="GOTO",
 714                                   params={"url": url},
 715                               )
 716                               response = None
 717                           else:
 718 ‚Üí                             raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
 719   
 720                       # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
 721                       # Walk the redirect chain.  Playwright returns only the last
 722                       # hop, so we trace the `request.redirected_from` links until the
 723                       # first response that differs from the final one and s

----------------------------------------
🎉 Готово!
Всего URLs: 1283
✅ Успешно: 1265
⚠️ Ошибок: 18 (см. предупреждения выше)
Файл: /Users/white/shad/Scrapper/scrapped_data/web_scrapped_2026-01-20.jsonl



