In [2]:
import codecs, json, random
from pathlib import Path
import asyncio
from crawl4ai import *
from tqdm import tqdm
import warnings
import time

In [3]:
# Project root: the current working directory of the kernel process.
# `cwd` is a classmethod, so no throwaway Path instance is needed.
BASE = Path.cwd()
BASE

PosixPath('/home/thinkingduck/Life/Menon/Scrapper')

In [4]:
# Input (resources) and output (meno_database) directories, both directly
# under BASE.
MENO_RESOURCES = BASE / "resources"
MENO_DATABASE = BASE / "meno_database"

# Report whether each directory is actually present on disk.
print(f"isdir({MENO_RESOURCES}) = {MENO_RESOURCES.is_dir()}")
print(f"isdir({MENO_DATABASE}) = {MENO_DATABASE.is_dir()}")

isdir(/home/thinkingduck/Life/Menon/Scrapper/resources) = True
isdir(/home/thinkingduck/Life/Menon/Scrapper/meno_database) = True


In [5]:
# JSON file that lists the pages to crawl; report whether it exists.
url_fname = MENO_RESOURCES / "web_urls.json"
print(f"isfile({url_fname}) = {url_fname.is_file()}")

isfile(/home/thinkingduck/Life/Menon/Scrapper/resources/web_urls.json) = True


In [6]:
# Read the whole file as UTF-8 text and parse it as JSON.
# NOTE(review): errors="ignore" silently drops bytes that are not valid
# UTF-8, so a damaged file would yield shortened names/URLs without any
# error -- confirm this leniency is intended.
url_data = json.loads(url_fname.read_text(encoding="utf-8", errors="ignore"))

In [7]:
# Group every document name under the URL it points to; several names in
# url_data may refer to the same URL.
names_by_url = dict()
for doc_name, doc_url in url_data.items():
    names_by_url.setdefault(doc_url, set()).add(doc_name)

# Keep one canonical name per URL: the longest variant, with every run of
# whitespace collapsed to a single space. Ties on length are broken
# alphabetically -- the variants live in a set, and the iteration order of a
# set of strings can differ between interpreter runs, which would otherwise
# make the chosen name irreproducible.
url_dict = {
    doc_url: " ".join(min(name_variants, key=lambda it: (-len(it), it)).split())
    for doc_url, name_variants in names_by_url.items()
}

In [8]:
# Drop the raw name -> URL mapping; only url_dict is used from here on.
# After this, re-running the cell that builds url_dict requires re-running
# the JSON load first.
del url_data

In [9]:
# Number of distinct URLs (one entry per URL after merging name variants).
print(f"There are {len(url_dict)} documents.")

There are 1283 documents.


In [10]:
# Alphabetically sorted URLs; iterating a dict yields its keys.
url_list = sorted(url_dict)

In [11]:
# Spot-check a few random URL -> name pairs. The sample size is capped at the
# number of available URLs because random.sample raises ValueError when asked
# for more items than the population holds.
for doc_url in random.sample(population=url_list, k=min(3, len(url_list))):
    print(doc_url)
    print(url_dict[doc_url])
    print("")

https://museum.nsu.ru/
Список ректоров Новосибирского государственного университета от 1959 года до наших дней

https://nsu.ru/n/research/divisions/mathematical-sciences/1719941/
Моделирование и развитие технологий волоконно-оптической связи в Новосибирском государственном университете

https://www.nsu.ru/n/research/divisions/physics/1719857/
Центр фотоники Астон-Новосибирского государственного университета: Научные исследования и образовательные программы в области оптики и лазерных технологий



In [12]:
# Browser settings handed to AsyncWebCrawler.
browser_config = BrowserConfig(verbose=False)

# Markdown generation: links are ignored and a pruning content filter
# (threshold 0.6) is attached.
pruning_filter = PruningContentFilter(threshold=0.6)
md_generator = DefaultMarkdownGenerator(
    content_filter=pruning_filter,
    options={"ignore_links": True},
)

# Settings passed to every individual crawl.
run_config = CrawlerRunConfig(
    markdown_generator=md_generator,
    word_count_threshold=10,  # Minimum words per content block
    excluded_tags=["form", "header"],
    exclude_external_links=True,  # Remove external links
    remove_overlay_elements=True,  # Remove popups/modals
    process_iframes=True,
    verbose=False,
)

In [15]:
async def main():
    """Crawl every URL in ``url_list`` and store the pages as JSON Lines.

    Each successfully crawled page becomes one line of
    ``actual_knowledge.jsonl`` inside ``MENO_DATABASE`` with the keys
    ``url``, ``name`` (looked up in ``url_dict``), ``content``
    (``result.markdown.fit_markdown``), ``date`` (always ``None``) and
    ``collection_date`` (Unix time in whole seconds). The file is opened in
    write mode, so an existing file is overwritten.

    A URL that cannot be crawled -- either because the crawler reports
    ``success`` as false or because ``arun`` raises -- is reported through
    ``warnings.warn`` and skipped, so one bad URL does not abort the
    remaining ones.
    """
    file_path = MENO_DATABASE.joinpath("actual_knowledge.jsonl")
    with open(file_path, mode="w", encoding="utf-8") as fp:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            for doc_url in tqdm(url_list):
                try:
                    result = await crawler.arun(url=doc_url, config=run_config)
                except Exception as err:
                    # Only Exception is caught: task cancellation and
                    # KeyboardInterrupt still stop the run as before.
                    warnings.warn(
                        f"{doc_url}: crawl raised {type(err).__name__}: {err}"
                    )
                    continue

                if not result.success:
                    warnings.warn(
                        f"{doc_url}: Status code = {result.status_code}, crawl failed: {result.error_message}"
                    )
                    continue

                jsonified_result = {
                    "url": doc_url,
                    "name": url_dict[doc_url],
                    "content": result.markdown.fit_markdown,
                    "date": None,
                    "collection_date": int(time.time()),
                }
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                fp.write(json.dumps(jsonified_result, ensure_ascii=False) + "\n")
                # Push each record to the OS right away so that a killed
                # kernel loses at most the page currently in flight.
                fp.flush()

In [16]:
# Run the crawl over all URLs; the progress bar recorded below shows that
# this run took about 48 minutes for 1283 URLs.
await main()

100%|██████████| 1283/1283 [48:23<00:00,  2.26s/it] 
