In [28]:
import codecs, json, random
from pathlib import Path
import asyncio
from crawl4ai import *
from tqdm import tqdm
import warnings

In [29]:
# Root folder of the project: the directory the notebook is executed from.
BASE = Path.cwd()
BASE

PosixPath('/home/thinkingduck/Life/Menon/Scrapper')

In [30]:
# Sub-directories of BASE used by the rest of the notebook.
MENO_RESOURCES = BASE / "resources"
MENO_DATABASE = BASE / "meno_database"

# Report whether each directory actually exists.
print(f"isdir({MENO_RESOURCES}) = {MENO_RESOURCES.is_dir()}")
print(f"isdir({MENO_DATABASE}) = {MENO_DATABASE.is_dir()}")

isdir(/home/thinkingduck/Life/Menon/Scrapper/resources) = True
isdir(/home/thinkingduck/Life/Menon/Scrapper/meno_database) = True


In [31]:
# JSON file in the resources directory that holds the URL data to crawl.
url_fname = MENO_RESOURCES / "web_urls.json"
print(f"isfile({url_fname}) = {url_fname.is_file()}")

isfile(/home/thinkingduck/Life/Menon/Scrapper/resources/web_urls.json) = True


In [32]:
# Read the file as UTF-8 text (bytes that cannot be decoded are dropped, per
# errors="ignore") and parse the text as JSON.
url_data = json.loads(url_fname.read_text(encoding="utf-8", errors="ignore"))

In [33]:
# Build `url_dict`: one display name per unique URL.
#
# `url_data` maps document name -> URL, and several names may point at the same
# URL. Step 1 groups all name variants by URL; step 2 keeps the longest variant
# and collapses its runs of whitespace into single spaces.
names_by_url = dict()
for doc_name, doc_url in url_data.items():
    names_by_url.setdefault(doc_url, set()).add(doc_name)

# The key (-len, name) picks the longest variant and breaks length ties
# alphabetically. Without the tie-break the winner would depend on set
# iteration order, which is not stable between kernel runs.
url_dict = {
    doc_url: " ".join(min(name_variants, key=lambda it: (-len(it), it)).split())
    for doc_url, name_variants in names_by_url.items()
}

In [34]:
# Drop the raw name -> URL mapping; only `url_dict` is used from here on.
# NOTE: after this cell, the cell that builds `url_dict` cannot be re-run
# without reloading `url_data` first.
del url_data

In [35]:
# Number of unique URLs (one document per URL).
print(f"There are {len(url_dict)} documents.")

There are 1283 documents.


In [36]:
# Unique URLs in sorted order, so the crawl visits them in a stable sequence.
url_list = sorted(url_dict)

In [37]:
# Spot-check up to three random URL/name pairs. The sample size is capped at
# len(url_list) because random.sample raises ValueError when asked for more
# items than the population holds.
for doc_url in random.sample(population=url_list, k=min(3, len(url_list))):
    print(doc_url)
    print(url_dict[doc_url])
    print("")

https://www.nsu.ru/n/economics-department/research/grants/
Образование и Научная Деятельность Экономического Факультета Новосибирского Государственного Университета

https://fen.nsu.ru/fen.phtml?topic=prg_bac_060301_bio_new
Программа обучения на факультете естественных наук: основные дисциплины и практики

http://sesc.nsu.ru/education/sections/fiziki/
Справочник преподавателей кафедры физики Специализированного учебно-научного центра Новосибирского государственного университета



In [38]:
# Browser-level settings for the crawler (non-verbose).
browser_config = BrowserConfig(verbose=False)

# Markdown generation: a pruning content filter with threshold 0.6 and the
# "ignore_links" option switched on.
_pruning_filter = PruningContentFilter(threshold=0.6)
_markdown_generator = DefaultMarkdownGenerator(
    content_filter=_pruning_filter,
    options={"ignore_links": True},
)

# Settings applied to every individual crawl run.
run_config = CrawlerRunConfig(
    markdown_generator=_markdown_generator,
    word_count_threshold=10,  # Minimum words per content block
    excluded_tags=["form", "header"],
    exclude_external_links=True,  # Remove external links
    remove_overlay_elements=True,  # Remove popups/modals
    process_iframes=True,
    verbose=False,
)

In [39]:
async def main():
    """Crawl every URL in ``url_list`` and store the results as JSON lines.

    URLs are fetched one at a time through a single ``AsyncWebCrawler``
    configured with ``browser_config``; each request uses ``run_config``.

    For every successful crawl one JSON object with the keys ``url``,
    ``name`` and ``content`` is written as a line of
    ``actual_knowledge.jsonl`` inside ``MENO_DATABASE``. A failed crawl is
    reported with ``warnings.warn`` and the loop moves on to the next URL.

    The output file is opened with ``mode="w"``, so previous content is
    overwritten on every call.
    """
    output_path = MENO_DATABASE.joinpath("actual_knowledge.jsonl")
    # buffering=0 makes the underlying file unbuffered, so each record is
    # handed to the OS as soon as it is written.
    with codecs.open(
        output_path,
        mode="w",
        encoding="utf-8",
        buffering=0,
    ) as fp:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            for doc_url in tqdm(url_list):
                result = await crawler.arun(url=doc_url, config=run_config)
                if result.success:
                    jsonified_result = {
                        "url": doc_url,
                        "name": url_dict[doc_url],
                        # NOTE(review): presumably the markdown left after the
                        # pruning content filter; confirm against crawl4ai docs.
                        "content": result.markdown.fit_markdown,
                    }
                    # ensure_ascii=False keeps non-ASCII text readable in the file.
                    fp.write(json.dumps(jsonified_result, ensure_ascii=False) + "\n")
                else:
                    # The stdlib function is warnings.warn; "warnings.warning"
                    # does not exist and would raise AttributeError on the
                    # first failed URL, aborting the whole crawl.
                    warnings.warn(
                        f"{doc_url}: Status code = {result.status_code}, crawl failed: {result.error_message}"
                    )

In [None]:
# Run the crawl. `main` opens its output file with mode="w", so running this
# cell rewrites actual_knowledge.jsonl from scratch.
await main()