## Data Loading
1. Prepare a list of URLs by parsing the watsonx Orchestrate ADK docs sitemap and adding a list of IBM Developer tutorials about watsonx Orchestrate.
2. Crawl the URLs.
3. Parse the HTML with BeautifulSoup
4. Locate the main "content" of the HTML using an appropriate selector based on the URL
5. Convert the HTML to markdown
6. Find the link (`<a href="...">`) tags in the content and collect their URLs. These 'edges' are stored as metadata on each document
7. Split the text into chunks and create embeddings for each chunk
8. Load the documents into AstraDB

In [None]:
import requests
import time
from typing import Iterable, AsyncIterator
import os
from urllib.parse import urljoin
import aiohttp
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from transformers import AutoTokenizer
from langchain_astradb import AstraDBVectorStore
from langchain_ibm import WatsonxEmbeddings
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

In [None]:
# Pull settings from a local .env file (if any) into the process environment
load_dotenv()

# Astra DB connection settings
ASTRA_DB_API_ENDPOINT = os.environ.get("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_COLLECTION = os.environ.get("ASTRA_DB_COLLECTION")

# watsonx.ai credentials
WATSONX_APIKEY = os.environ.get("WATSONX_APIKEY")
WATSONX_PROJECT_ID = os.environ.get("WATSONX_PROJECT_ID")

# Location of the ChromeDriver binary; None when the variable is not set
CHROME_DRIVER_PATH = os.environ.get("CHROME_DRIVER_PATH")

In [None]:
def load_pages_from_sitemap(sitemap_url):
    """Yield the page URLs listed in the XML sitemap at ``sitemap_url``.

    The sitemap is downloaded with a browser-like User-Agent and each
    ``<url><loc>...</loc></url>`` entry is yielded as a string. Because this
    is a generator, the download happens when iteration starts.

    Args:
        sitemap_url: Address of the sitemap XML document.

    Yields:
        The text of each ``<loc>`` element with surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(
        sitemap_url,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 "
            "Firefox/58.0",
        },
        timeout=30,
    )
    # Fail loudly instead of parsing an error page into an empty URL list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="xml")
    for url_tag in soup.find_all("url"):
        loc = url_tag.find("loc")
        if loc is None:
            # Malformed entry without a location: nothing to crawl.
            continue
        # Pretty-printed sitemaps put newlines/indentation around the URL.
        page_url = loc.text.strip()
        if page_url:
            yield page_url

In [None]:
# Sitemaps whose pages are all crawled (watsonx Orchestrate ADK documentation)
SITEMAPS = ["https://developer.watson-orchestrate.ibm.com/sitemap.xml"]

# IBM Developer pages that are crawled individually
IBM_DEVELOPER_URLS = [
    "https://developer.ibm.com/tutorials/agentic-workflows-watsonx-orchestrate-no-code",
    "https://developer.ibm.com/tutorials/agentic-rag-watsonx-orchestrate-astradb",
    "https://developer.ibm.com/tutorials/getting-started-with-watsonx-orchestrate",
    "https://developer.ibm.com/tutorials/local-ai-agent-workflow-mcp-watsonx-orchestrate",
    "https://developer.ibm.com/tutorials/develop-langflow-tools-watsonx-orchestrate-granite",
    "https://developer.ibm.com/tutorials/build-mcp-tools-mcp-gateway-watsonx-orchestrate-agents",
    "https://developer.ibm.com/learningpaths/get-started-watsonx-orchestrate",
    "https://developer.ibm.com/tutorials/create-maximo-agent-watsonx-orchestrate",
    "https://developer.ibm.com/tutorials/ai-agents-llms-watsonx-orchestrate-ai-gateway",
    "https://developer.ibm.com/tutorials/connect-mcp-tools-watsonx-orchestrate-adk",
    "https://developer.ibm.com/tutorials/agentsops-telemetry-langfuse-watsonx-orchestrate",
    "https://developer.ibm.com/tutorials/integrate-watsonx-orchestrate-agents-custom-ui",
    "https://developer.ibm.com/tutorials/develop-agents-no-code-watsonx-orchestrate",
    "https://developer.ibm.com/articles/ai-financial-app-eu-act",
    "https://developer.ibm.com/articles/multi-agent-orchestration-watsonx-orchestrate",
    "https://developer.ibm.com/blogs/watsonx-orchestrate-june2025",
    "https://developer.ibm.com/articles/awb-watsonx-enterprise-data-and-ai-platform",
    "https://developer.ibm.com/tutorials/awb-building-decision-automation-skills-with-watsonx-orchestrate",
    "https://developer.ibm.com/articles/awb-enterprise-generative-ai-virtual-assistants-ibm-watsonx",
    "https://developer.ibm.com/tutorials/awb-custom-skills-openapi-watsonx-orchestrate",
    "https://developer.ibm.com/articles/awb-overview-ibm-watsonx-orchestrate",
    "https://developer.ibm.com/articles/use-generative-ai-intelligent-workflow-automation-watsonx",
    "https://developer.ibm.com/tutorials/awb-automate-talent-acquisition-watsonx-orchestrate-aws",
    "https://developer.ibm.com/articles/awb-ibm-watsonx-orchestrate-on-aws",
]

# Flatten the pages of every sitemap into a single list of URLs to crawl
WXO_URLS = [
    page_url
    for sitemap in SITEMAPS
    for page_url in load_pages_from_sitemap(sitemap)
]

print(f"WXO_URLS length = {len(WXO_URLS)}")
print(f"IBM_DEVELOPER_URLS length = {len(IBM_DEVELOPER_URLS)}")

In [None]:
# Build list of href links for provided HTML
def extract_links(html_input: BeautifulSoup, url: str):
    """Collect the distinct web links referenced by ``html_input``.

    Every ``<a href="...">`` tag is resolved against ``url`` so relative links
    become absolute. Targets that are not http(s) pages (``mailto:``,
    ``javascript:``, ``tel:`` ...) are dropped because they cannot point at
    another crawled page.

    Args:
        html_input: Parsed HTML to scan for anchor tags.
        url: Address of the page the HTML came from, used as the base when
            resolving relative links.

    Returns:
        A sorted list of unique absolute URLs. Sorting keeps the result
        stable between runs (plain set ordering is not).
    """
    links = set()
    for tag in html_input.find_all("a", href=True):
        raw_href = tag["href"].strip()
        if not raw_href:
            continue
        href = urljoin(url, raw_href)
        if not href.startswith(("http://", "https://")):
            continue
        if href not in links:
            # Report each link once, when it is really added.
            links.add(href)
            print(f"  Adding link: {href}")
    return sorted(links)

In [None]:
# Tokenizer of the embedding model, so text length is measured in that model's tokens
tokenizer = AutoTokenizer.from_pretrained(
    "ibm-granite/granite-embedding-278m-multilingual"
)


def length_function(text):
    """Return how many token ids ``tokenizer.encode`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


# Splitter whose chunk size and overlap are measured with length_function
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=100,
    length_function=length_function,
)

In [None]:
# Locate the main content of the HTML, according to the URL
def select_content(soup: BeautifulSoup, url: str) -> BeautifulSoup:
    """Return the part of ``soup`` that holds the page's main content.

    The container is chosen from the URL prefix:

    * watsonx Orchestrate docs: ``<div id="content-container">``
    * IBM Developer tutorials: ``<section class="content-data">``
    * anything else: the whole document

    Args:
        soup: Parsed HTML of the page.
        url: Address the page was loaded from.

    Returns:
        The matching container, or ``soup`` itself when the URL has no known
        container or the expected container is missing from the page.
    """
    if url.startswith("https://developer.watson-orchestrate.ibm.com"):
        content = soup.find("div", id="content-container")
    elif url.startswith("https://developer.ibm.com/tutorials/"):
        content = soup.find("section", class_="content-data")
    else:
        return soup

    if content is None:
        # find() yields None when nothing matches; returning that would make
        # callers fail on .get_text(). Use the whole page, as for unknown URLs.
        print(f"  Content container not found for {url}; using the whole page")
        return soup
    return content

In [None]:
# Load watsonx Orchestrate documentation URLs
# Uses a simple asynchronous HTTP request
async def load_wxo_html_pages(urls: Iterable[str]) -> AsyncIterator[Document]:
    """Fetch each URL over HTTP and yield one ``Document`` per text chunk.

    For every page that answers with status 200 the main content is selected,
    its text is split with ``text_splitter`` and each chunk is yielded with
    metadata ``id`` (``chunk<i>_<url>``), ``url`` and ``hyperlinks`` (the links
    found in the page content). Pages that fail are reported with ``print``
    and skipped so one bad URL does not stop the crawl.

    Args:
        urls: Page addresses to crawl, fetched one after another.

    Yields:
        A ``Document`` for each chunk of each successfully loaded page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"
    }
    # aiohttp expects a ClientTimeout object; a bare number is deprecated.
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
        for url in urls:
            try:
                print(f"Crawling page: {url}")
                async with session.get(url) as response:
                    if response.status != 200:
                        print(f"Skipping {url}: HTTP status {response.status}")
                        continue
                    html = await response.text()

                # The response is no longer needed once the body has been read.
                content_root = select_content(BeautifulSoup(html, "html.parser"), url)
                if content_root is None:
                    print(f"Skipping {url}: main content not found")
                    continue

                text = content_root.get_text(separator=" ", strip=True)
                if not text:
                    continue

                hyperlinks = extract_links(content_root, url)
                for i, chunk in enumerate(text_splitter.split_text(text)):
                    # NOTE(review): chunk is already plain text, so md() has no
                    # HTML to convert here; converting the selected HTML before
                    # splitting may be what was intended - confirm.
                    yield Document(
                        page_content=md(chunk),
                        metadata={
                            "id": f"chunk{i}_{url}",
                            "url": url,
                            "hyperlinks": hyperlinks,
                        },
                    )
            except Exception as e:
                # Best-effort crawl: report the failure and move on.
                print(f"Failed to load {url}: {e}")

In [None]:
# Load the IBM Developer tutorial URLs
# Because these pages use AJAX requests to load the content, a simple HTTP request will not suffice
# Instead, a chrome driver is used which supports JavaScript
# This uses the selenium python library which is not asynchronous
def load_ibm_developer_html_pages(urls: Iterable[str]):
    """Render each URL in headless Chrome and return its text chunks.

    Each page is opened in the browser, given a fixed time to finish loading,
    and its rendered HTML is reduced to the main content. The text is split
    with ``text_splitter`` and every chunk becomes a ``Document`` with metadata
    ``id`` (``chunk<i>_<url>``), ``url`` and ``hyperlinks``. Pages that fail
    are reported with ``print`` and skipped.

    Args:
        urls: Page addresses to crawl, loaded one after another.

    Returns:
        A list with one ``Document`` per chunk of every page that loaded.
    """
    # Set up Chrome options
    options = Options()
    options.add_argument("--headless")  # Run in headless mode (no browser window)
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")

    # Path to your ChromeDriver, taken from the CHROME_DRIVER_PATH setting
    # Download chrome driver here: https://googlechromelabs.github.io/chrome-for-testing/#stable
    service = Service(CHROME_DRIVER_PATH)

    # Initialize the browser
    driver = webdriver.Chrome(service=service, options=options)
    docs = []

    try:
        for url in urls:
            try:
                print(f"Crawling page: {url}")
                # Load the website
                driver.get(url)

                # Wait for JavaScript and AJAX to finish (adjust time or use WebDriverWait for smarter waits)
                time.sleep(5)

                # Get the fully rendered HTML
                html = driver.page_source

                content_root = select_content(BeautifulSoup(html, "html.parser"), url)
                if content_root is None:
                    print(f"Skipping {url}: main content not found")
                    continue

                text = content_root.get_text(separator=" ", strip=True)
                if not text:
                    continue

                hyperlinks = extract_links(content_root, url)
                for i, chunk in enumerate(text_splitter.split_text(text)):
                    # NOTE(review): chunk is already plain text, so md() has no
                    # HTML to convert here; converting the selected HTML before
                    # splitting may be what was intended - confirm.
                    docs.append(
                        Document(
                            page_content=md(chunk),
                            metadata={
                                "id": f"chunk{i}_{url}",
                                "url": url,
                                "hyperlinks": hyperlinks,
                            },
                        )
                    )
            except Exception as e:
                # Best-effort crawl: report the failure and move on.
                print(f"Failed to load {url}: {e}")
    finally:
        # Close the browser even if the crawl is interrupted, so no Chrome
        # process is left running.
        driver.quit()

    return docs

In [None]:
# Embedding model served by watsonx.ai. The service endpoint defaults to the
# us-south region and can be overridden with the WATSONX_URL environment variable.
embeddings = WatsonxEmbeddings(
    model_id="ibm/granite-embedding-278m-multilingual",
    url=os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com"),
    apikey=WATSONX_APIKEY,
    project_id=WATSONX_PROJECT_ID,
    # NOTE(review): presumably caps each input at 512 tokens - confirm against
    # the watsonx embeddings documentation.
    params={"truncate_input_tokens": 512}
)

# Astra DB collection that stores the chunks and their embeddings.
# NOTE(review): pre_delete_collection=True presumably drops any existing
# collection of this name, so each run starts from an empty store - confirm.
vectorstore = AstraDBVectorStore(
    embedding=embeddings,
    collection_name=ASTRA_DB_COLLECTION,
    pre_delete_collection=True,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

In [None]:
# Load the watsonx Orchestrate documents
not_found = 0
found = 0
BATCH_SIZE = 50

docs = []
async for doc in load_wxo_html_pages(WXO_URLS):
    if doc.page_content.startswith("\n# Page Not Found"):
        not_found += 1
        continue

    docs.append(doc)
    found += 1

    if len(docs) >= BATCH_SIZE:
        vectorstore.add_documents(docs)
        docs.clear()

if docs:
    vectorstore.add_documents(docs)
    
print(f"{not_found} URLs were not found")
print(f"{found} URLs were added")

In [None]:
# Load IBM Developer tutorial documents
BATCH_SIZE = 50
# Title that marks a "page not found" placeholder page
NOT_FOUND_TITLE = "Page Not Found"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Track pages by URL so the summary counts pages, not chunks
not_found_urls = set()
found_urls = set()

docs = []
for doc in load_ibm_developer_html_pages(IBM_DEVELOPER_URLS):
    page_url = doc.metadata["url"]

    # Drop the remaining chunks of a page already identified as not found
    if page_url in not_found_urls:
        continue

    # Only the first chunk seen for a page can carry its title. Leading newline
    # and heading characters are ignored so both a markdown heading
    # ("\n# Page Not Found") and plain text ("Page Not Found") are recognised.
    if page_url not in found_urls and doc.page_content.lstrip("\n# ").startswith(NOT_FOUND_TITLE):
        not_found_urls.add(page_url)
        continue

    docs.append(doc)
    found_urls.add(page_url)

    # Write to the vector store in batches to bound memory and request size
    if len(docs) >= BATCH_SIZE:
        vectorstore.add_documents(docs)
        docs.clear()

# Flush the final partial batch
if docs:
    vectorstore.add_documents(docs)

not_found = len(not_found_urls)
found = len(found_urls)
print(f"{not_found} URLs were not found")
print(f"{found} URLs were added")