In [3]:
from src.services.loaders.web import SeticsLoader
from src.configs.env_config import config
from src.services.cleaners import SeticsDocumentCleaner
from src.services.utils import documents_to_json
from pathlib import Path
import json

In [4]:
# Root URLs of the two Setics Sttar 2.3 manuals that get crawled.
base_url_stad = "https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/"
base_url_stpl = "https://docs.setics-sttar.com/planner-user-manual/2.3/en/"

# Authentication endpoints: the login form, and the page passed to
# `authenticate` as its `check_url`.
login_url = "https://support.setics-sttar.com/en/support/login"
protected_url = (
    "https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/introduction"
)

In [5]:
# One cleaner instance, created once and reused for every manual processed
# by the scraping loop in the next cell.
cleaner = SeticsDocumentCleaner()

In [6]:
for name, base_url in [("stad", base_url_stad), ("stpl", base_url_stpl)]:
    print(f"Processing base URL: {base_url}")

    async with SeticsLoader() as service:
        output_path = Path("_dev_nb/output_data/web_loader")
        output_path.mkdir(parents=True, exist_ok=True)

        await service.authenticate(
            username=config.SETICS_USER,
            password=config.SETICS_PWD,
            login_url=login_url,
            check_url=protected_url,
        )

        # get all urls from Setics
        print("Discovering URLs...")
        urls = await service.discover_urls(base_url=base_url, max_depth=4)

        # # filter 'en' only urls
        # print("Filtering URLs...")
        # en_urls = [url for url in urls if "/en/" in url]

        # ensure files are created
        urls_file = output_path / f"setics_{name}_urls.json"
        docs_file = output_path / f"setics_{name}_docs_raw.json"
        clean_file = output_path / f"setics_{name}_docs_clean.json"

        # archive the urls
        with open(urls_file, "w") as f:
            json.dump(urls, f, indent=2)

        print(f"Found {len(urls)} URLs, saved to {urls_file}")

        # load documents from the filtered urls
        print("Loading documents...")
        docs = await service.load_documents(urls=urls)

        # save the documents
        documents_to_json(documents=docs, filename=docs_file)

        print(f"Loaded {len(docs)} documents, saved to {docs_file}")

        # clean the documents
        print("Cleaning documents...")
        cleaned_docs = await cleaner.clean_documents(documents=docs)

        # save the cleaned documents
        documents_to_json(documents=cleaned_docs, filename=clean_file)

        print(f"Cleaned {len(cleaned_docs)} documents, saved to {clean_file}")

Processing base URL: https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/
Discovering URLs...
Found 525 URLs, saved to _dev_nb/output_data/web_loader/setics_stad_urls.json
Loading documents...


Fetching pages: 100%|##########| 525/525 [03:18<00:00,  2.64it/s]


Loaded 525 documents, saved to _dev_nb/output_data/web_loader/setics_stad_docs_raw.json
Cleaning documents...
Cleaned 525 documents, saved to _dev_nb/output_data/web_loader/setics_stad_docs_clean.json
Processing base URL: https://docs.setics-sttar.com/planner-user-manual/2.3/en/
Discovering URLs...
Found 960 URLs, saved to _dev_nb/output_data/web_loader/setics_stpl_urls.json
Loading documents...


Fetching pages: 100%|##########| 960/960 [07:31<00:00,  2.13it/s]


Loaded 960 documents, saved to _dev_nb/output_data/web_loader/setics_stpl_docs_raw.json
Cleaning documents...
Cleaned 960 documents, saved to _dev_nb/output_data/web_loader/setics_stpl_docs_clean.json


In [None]:
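# NOTE: the commented-out cells below call `DocumentJsonToolkit`, which the
# import cell above does not import (it imports `documents_to_json` instead);
# update the calls or add the import before re-enabling them.
# stad_uri = Path("_dev_nb/output_data/web_loader") / "setics_stad_docs_raw.json"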
# stpl_uri = Path("_dev_nb/output_data/web_loader") / "setics_stpl_docs_raw.json"

# for name, file in [("stad", stad_uri), ("stpl", stpl_uri)]:
#     print(f"Processing base URL: {file}")

#     async with SeticsLoader() as service:
#         output_path = Path("_dev_nb/output_data/web_loader")
#         output_path.mkdir(parents=True, exist_ok=True)

#         # ensure files are created
#         urls_file = output_path / f"setics_{name}_urls.json"
#         docs_file = output_path / f"setics_{name}_docs_raw.json"
#         clean_file = output_path / f"setics_{name}_docs_clean.json"

#         # load documents from the filtered urls
#         print("Loading documents...")
#         docs = DocumentJsonToolkit.json_to_documents(filename=file)

#         # clean the documents
#         print("Cleaning documents...")
#         cleaned_docs = await cleaner.clean_documents(documents=docs)

#         # save the cleaned documents
#         DocumentJsonToolkit.documents_to_json(
#             documents=cleaned_docs, filename=clean_file
#         )

#         print(f"Cleaned {len(cleaned_docs)} documents, saved to {clean_file}")

In [None]:
# setics_docs_uri = Path("_dev_nb/output_data/web_loader") / "setics_stad_docs_raw.json"
# setics_docs = DocumentJsonToolkit.json_to_documents(setics_docs_uri)
# len(setics_docs)

In [None]:
# sample_docs = setics_docs[:2]

In [None]:
# raw_data = [doc.page_content for doc in sample_docs]
# raw_data

In [None]:
# cleaner = SeticsDocumentCleaner()
# cleaned_docs = await cleaner.clean_documents(documents=sample_docs)

In [None]:
# raw_clean = [doc.page_content for doc in cleaned_docs]
# raw_clean

In [None]:
# for idx, doc in enumerate(cleaned_docs):
#     print(doc.page_content)
#     print("\n\n===PAGE BREAK===\n\n")