In [1]:
from src.services.loaders.web import SeticsLoader
from src.configs.env_config import config
from src.services.processors import SeticsDocumentCleaner
from src.services.utils import DocumentJsonToolkit
from pathlib import Path
import json

In [2]:
# Root of the English Advanced Designer user manual (version 2.3); used as the crawl starting point.
base_url = "https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/"
# A page under the manual root, handed to the loader as `check_url` during authentication.
protected_url = "https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/introduction"
# Login page of the support portal, handed to the loader as `login_url`.
login_url = "https://support.setics-sttar.com/en/support/login"

In [3]:
# Single cleaner instance; the pipeline cell below calls `cleaner.clean_documents(...)`
# on the freshly loaded documents.
cleaner = SeticsDocumentCleaner()

In [4]:
async with SeticsLoader() as service:
    output_path = Path("_dev_nb/output_data/web_loader")
    output_path.mkdir(parents=True, exist_ok=True)

    await service.authenticate(
        username=config.SETICS_USER,
        password=config.SETICS_PWD,
        login_url=login_url,
        check_url=protected_url,
    )

    # get all urls from Setics
    print("Discovering URLs...")
    urls = await service.discover_urls(base_url=base_url, max_depth=4)

    # filter 'en' only urls
    print("Filtering URLs...")
    en_urls = [url for url in urls if "/en/" in url]

    # ensure files are created
    urls_file = output_path / "setics_urls.json"
    docs_file = output_path / "setics_docs_raw.json"
    clean_file = output_path / "setics_docs_clean.json"

    # archive the urls
    with open(urls_file, "w") as f:
        json.dump(urls, f, indent=2)

    print(f"Found {len(urls)} URLs, saved to {urls_file}")

    # load documents from the filtered urls
    print("Loading documents...")
    docs = await service.load_documents(urls=en_urls)

    # save the documents
    DocumentJsonToolkit.documents_to_json(documents=docs, filename=docs_file)

    print(f"Loaded {len(docs)} documents, saved to {docs_file}")

    # clean the documents
    print("Cleaning documents...")
    cleaned_docs = await cleaner.clean_documents(documents=docs)

    # save the cleaned documents
    DocumentJsonToolkit.documents_to_json(documents=cleaned_docs, filename=clean_file)

    print(f"Cleaned {len(cleaned_docs)} documents, saved to {clean_file}")

Discovering URLs...
Filtering URLs...
Found 525 URLs, saved to _dev_nb/output_data/web_loader/setics_urls.json
Loading documents...


Fetching pages: 100%|##########| 273/273 [01:48<00:00,  2.52it/s]


Loaded 273 documents, saved to _dev_nb/output_data/web_loader/setics_docs_raw.json
Cleaning documents...
Cleaned 273 documents, saved to _dev_nb/output_data/web_loader/setics_docs_clean.json


In [5]:
# Scratch: reload the raw documents written by the pipeline cell instead of re-crawling.
# setics_docs_uri = Path("_dev_nb/output_data/web_loader") / "setics_docs_raw.json"
# setics_docs = DocumentJsonToolkit.json_to_documents(setics_docs_uri)
# len(setics_docs)

In [6]:
# sample_docs = setics_docs[:10]

In [7]:
# raw_data = [doc.page_content for doc in sample_docs]
# raw_data

In [8]:
# cleaner = SeticsDocumentCleaner()
# cleaned_docs = await cleaner.clean_documents(documents=sample_docs)

In [9]:
# raw_clean = [doc.page_content for doc in cleaned_docs]
# raw_clean

In [10]:
# for idx, doc in enumerate(cleaned_docs):
#     print(doc.page_content)
#     print("\n\n===PAGE BREAK===\n\n")