In [1]:
from src.services.loaders.web.web_image_loader import WebImageLoader
from src.configs.env_config import config
import json
from pathlib import Path
from src.services.utils import DocumentJsonToolkit

In [2]:
# Documentation roots for the two Setics Sttar manuals (Advanced Designer and
# Planner, per the URL paths). Neither is referenced by the visible cells.
base_url_stad = (
    "https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/"
)
base_url_stpl = (
    "https://docs.setics-sttar.com/planner-user-manual/2.3/en/"
)

# Passed to the loader factory below as `login_url` and `check_url` respectively.
login_url = (
    "https://support.setics-sttar.com/en/support/login"
)
protected_url = (
    "https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/introduction"
)

In [3]:
# Directory holding the web-loader artifacts; created (with any missing parents)
# if absent, and left untouched when it already exists.
output_path = Path("_dev_nb/output_data/web_loader")
output_path.mkdir(exist_ok=True, parents=True)

# JSON file that the preview cell below reads the stad URL list from.
urls_file = output_path.joinpath("setics_stad_urls.json")
# Path for a raw image-documents JSON; not referenced by the visible cells.
stad_img_json = output_path.joinpath("setics_images_docs_raw.json")

In [4]:
# Quick look at the stad URL list: parse the JSON file and keep only the
# first 10 entries, then display them as the cell output.
stad_urls_text = urls_file.read_text(encoding="utf-8")
urls = json.loads(stad_urls_text)[:10]
urls

['https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/topology',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/endpoint-support-context-menu',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/search-by-cost-effectiveness-using-actual-costs',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/naming-rules-syntax',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/duct-assembly-datasheet',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/cable-system-commands',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/splicing-plans-options',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/start-network-optimization',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/installing-a-workstation-license',
 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/to

In [5]:
loader = await WebImageLoader.create_protected_loader(
    username=config.SETICS_USER,
    password=config.SETICS_PWD,
    login_url=login_url,
    check_url=protected_url,
)

In [6]:
for target in ["stad", "stpl"]:
    print(f"Loading {target} URLs...")
    urls = json.loads(
        (output_path / f"setics_{target}_urls.json").read_text(encoding="utf-8")
    )

    print(f"Found {len(urls)} URLs for {target}", "Parsing images...", sep="\n")

    docs = await loader.download_and_parse_images(urls=urls)

    print(f"Parsed {len(docs)} documents")

    img_file = output_path / f"setics_{target}_img_docs.json"

    print(f"Saving documents to {img_file}")
    DocumentJsonToolkit.documents_to_json(documents=docs, filename=img_file)

Loading stad URLs...
Found 525 URLs for stad
Parsing images...
Parsed 424 documents
Saving documents to _dev_nb/output_data/web_loader/setics_stad_img_docs.json
Loading stpl URLs...
Found 960 URLs for stpl
Parsing images...
Parsed 761 documents
Saving documents to _dev_nb/output_data/web_loader/setics_stpl_img_docs.json


In [7]:
# await loader.initialize()
# await loader.authenticate(
#     username=config.SETICS_USER,
#     password=config.SETICS_PWD,
#     login_url=login_url,
#     check_url=protected_url,
# )

In [8]:
# image_urls_by_page = await loader.extract_image_urls(urls=urls)
# image_urls_by_page

In [9]:
# img1 = image_urls_by_page[:2]
# documents = await loader.download_and_parse_images(image_refs=img1)

# for doc in documents:
#     print(doc.page_content)
#     print("\n\n===\n\n")

In [10]:
# documents = await loader.download_and_parse_images(urls=urls)
# len(documents)

In [11]:
# for i, doc in enumerate(documents):
#     print(f"=== Document {i} - length: {len(doc.page_content)} ===\n\n")
#     print(doc.page_content)

In [12]:
# custom_prompt_template = """You are an advanced image analysis assistant tasked with extracting and describing visual content.

# ## Part 1: Concise Summary
# Provide a concise summary of the image optimized for retrieval (1-2 sentences).

# ## Part 2: Visual Description
# Describe the image in detail, including:
# - Type of image (diagram, screenshot, photograph, chart, etc.)
# - Key visual elements and their arrangement
# - UI elements if this is a screenshot
# - Any notable visual patterns or structures

# ## Part 3: Text Content
# Extract ALL text visible in the image, preserving:
# - Headers and titles
# - Menu items and navigation elements
# - Labels and annotations
# - Table contents
# - Text in diagrams or charts
# - Button text and UI elements
# - Any other textual information

# You MUST ensure that your final output is contained within 1800 characters. Output should be in markdown format, without explanatory text, without any unnecessary whitespace (no extra line breaks or indentations) and without markdown delimiter ``` at the beginning.
# """

# custom_prompt = PromptTemplate.from_template(custom_prompt_template)

In [13]:
# custom_prompt_template = """You are an advanced image analysis assistant tasked with extracting and describing visual content for a retrieval-augmented generation system.

# ## Part 1: Concise Summary (50-75 words)
# Provide a concise summary that captures the image's core purpose and content. Focus on what the image represents functionally rather than just visually.

# ## Part 2: Visual Description (100-150 words)
# Describe the image with these key aspects:
# - Type of image (screenshot, diagram, chart, photo, etc.)
# - Main subject and its purpose or function
# - Key visual elements and their relationships
# - For screenshots: interface purpose, main controls, and data being displayed
# - For diagrams/charts: what information is being conveyed and how

# ## Part 3: Text Content (remaining space)
# Extract ALL visible text in the image, prioritizing by importance:
# 1. Headers, titles and key labels
# 2. Navigation elements and structural information
# 3. Table headers and important cell data
# 4. Button text and interactive elements
# 5. Supporting text and annotations

# IMPORTANT:
# - Ensure high semantic density by focusing on meaningful content
# - Avoid redundancy between sections
# - Organize text content logically by visual hierarchy
# - Include semantic markers for context (e.g., "Button:", "Menu:", "Header:")
# - Prioritize completeness of text extraction over visual description if space is limited

# Format output as compact markdown with minimal formatting, without explanatory text, and without markdown delimiter ``` at the beginning.
# Total output must be under 1800 characters.
# """

# custom_prompt = PromptTemplate.from_template(custom_prompt_template)

In [14]:
# custom_prompt_template = """You are an advanced image analysis assistant tasked with extracting and describing visual content for a retrieval-augmented generation system.

# ## Part 1: Concise Summary (50-75 words)
# Provide a concise summary that captures the image's core purpose and content. Focus on what the image represents functionally rather than just visually.

# ## Part 2: Visual Description (100-150 words)
# Describe the image with these key aspects:
# - Type of image (screenshot, diagram, chart, photo, etc.)
# - Main subject and its purpose or function
# - Key visual elements and their relationships
# - For screenshots: interface purpose, main controls, and data being displayed
# - For diagrams/charts: what information is being conveyed and how

# ## Part 3: Text Content (remaining space)
# Extract ALL visible text in the image, prioritizing by importance:
# 1. Headers, titles and key labels
# 2. Navigation elements and structural information
# 3. Table headers and important cell data
# 4. Button text and interactive elements
# 5. Supporting text and annotations

# IMPORTANT FORMATTING INSTRUCTIONS:
# - Use proper markdown tables for any tabular data (with | and - formatting)
# - Ensure high semantic density by focusing on meaningful content
# - Avoid redundancy between sections
# - Organize text content logically by visual hierarchy
# - Include semantic markers for context (e.g., "Button:", "Menu:", "Header:")
# - Prioritize completeness of text extraction over visual description if space is limited

# Format output as compact markdown with minimal formatting, without explanatory text, and without markdown delimiter ``` at the beginning.
# Total output must be under 1800 characters.
# """

# custom_prompt = PromptTemplate.from_template(custom_prompt_template)

In [15]:
# custom_prompt_template = """You are an advanced image analysis assistant tasked with extracting and describing visual content for a retrieval-augmented generation system.

# ## Part 1: Concise Summary (50-75 words)
# Provide a concise summary that captures the image's core purpose and content. Focus on what the image represents functionally rather than just visually.

# ## Part 2: Visual Description (100-150 words)
# Describe the image with these key aspects:
# - Type of image (screenshot, diagram, chart, photo, etc.)
# - Main subject and its purpose or function
# - Key visual elements and their relationships
# - For screenshots: interface purpose and main controls
# - For diagrams/charts: what information is being conveyed and how

# ## Part 3: Text Content (remaining space)
# Extract ALL visible text in the image, prioritizing by importance:
# 1. Headers, titles and key labels
# 2. Table data (using proper markdown tables)
# 3. Critical UI elements and buttons
# 4. Supporting text and annotations

# IMPORTANT FORMATTING INSTRUCTIONS:
# - Use proper markdown tables for tabular data (with | and - formatting)
# - If needed, abbreviate long table headers to save space
# - For large tables, prioritize headers and most important rows
# - Include semantic markers for non-table elements (e.g., "Button:", "Header:")
# - Be extremely concise in all sections while preserving critical information

# Format output as compact markdown with minimal formatting.
# Output must be without explanatory text, and without markdown delimiter ``` at the beginning.
# Total output must be under 2500 characters.
# """

# custom_prompt = PromptTemplate.from_template(custom_prompt_template)

In [16]:
# custom_prompt_template = """You are an advanced image analysis assistant tasked with extracting and describing visual content for a retrieval-augmented generation system.

# ## Part 1: Concise Summary (50-75 words)
# Provide a concise summary that captures the image's core purpose and content. Focus on what the image represents functionally rather than just visually. Identify the primary information being conveyed.

# ## Part 2: Visual Description (100-150 words)
# Describe the image with these key aspects:
# - Type of image (screenshot, diagram, chart, photo, etc.) or combination if mixed format
# - Main subject and its purpose or function
# - Key visual elements, their relationships, and hierarchical organization
# - For screenshots: interface purpose and main controls
# - For diagrams/charts: what information is being conveyed and key relationships
# - For flowcharts/processes: sequence, decision points, and connections between elements

# ## Part 3: Text Content (remaining space)
# Extract ALL visible text in the image, prioritizing by importance:
# 1. Headers, titles and key labels
# 2. Table data (using proper markdown tables)
# 3. Critical UI elements and buttons
# 4. Supporting text and annotations

# IMPORTANT FORMATTING INSTRUCTIONS:
# - Use proper markdown tables for tabular data (with | and - formatting)
# - For complex or multiple tables, include a brief label before each table
# - If needed, abbreviate long table headers to save space
# - For large tables, prioritize headers and most important rows
# - Include semantic markers for non-table elements (e.g., "Button:", "Header:")
# - For diagrams with connected elements, indicate relationships with "→" or similar notation
# - If text is unclear or possibly inaccurate due to image quality, indicate with [?]
# - For mathematical equations or special notation, use markdown's math formatting
# - Be extremely concise in all sections while preserving critical information

# Format output as compact markdown with minimal formatting.
# Output must be without explanatory text, and without markdown delimiter ``` at the beginning.
# Total output must be under 2500 characters.
# """

# custom_prompt = PromptTemplate.from_template(custom_prompt_template)

In [17]:
# async def download_and_parse_image(image_ref: Dict[str, str]) -> Document:
#     image_url = image_ref["url"]
#     llm = ChatOpenAI(model="gpt-4o-mini", api_key=config.OPENAI_API_KEY)
#     parser = LLMImageBlobParser(model=llm, prompt=custom_prompt)
#     try:
#         async with aiohttp.ClientSession() as session:
#             async with session.get(image_url) as response:
#                 if response.status != 200:
#                     print(f"Failed to download image: {response.status}")
#                     return None

#                 # Get binary data
#                 image_data = await response.read()

#                 # Create blob object
#                 blob = Blob(data=image_data, metadata=image_ref)

#                 # Parse with LLMImageBlobParser
#                 documents = parser.parse(blob=blob)[0]

#                 return documents

#     except Exception as e:
#         print(f"Error processing image: {str(e)}")
#         return None

In [18]:
# img1 = image_urls_by_page[0]
# document = await download_and_parse_image(image_ref=img1)

In [19]:
# print(document.page_content)

In [20]:
# len(document.page_content)