In [1]:
import os
import random
import time
from urllib.parse import urlsplit

import fitz  # PyMuPDF
import requests

In [2]:
def extract_links_from_pdf(pdf_path):
    """Collect the URI links annotated on the pages of a PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list of unique ``(uri, page_number)`` tuples with 1-based page
        numbers, sorted by page and then by URI so repeated runs produce the
        same order. Link entries without a non-empty ``"uri"`` are skipped.
    """
    links = set()
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for link in page.get_links():
                uri = link.get("uri")
                if uri:
                    links.add((uri, page_num))
    # A set has no stable order; sort so downstream output is reproducible.
    return sorted(links, key=lambda item: (item[1], item[0]))


def _is_pdf_link(link):
    """Return True when the URL, or its path without query/fragment, ends in .pdf."""
    if link.lower().endswith(".pdf"):
        return True
    try:
        path = urlsplit(link).path
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): nothing more to inspect.
        return False
    return path.lower().endswith(".pdf")


def classify_links(links):
    """Split extracted links into PDFs, SAP notes and everything else.

    Args:
        links: Iterable of ``(url, page)`` pairs.

    Returns:
        A dict with the keys ``"pdfs"``, ``"sap_notes"`` and ``"others"``,
        each holding the ``(url, page)`` pairs of that category in input
        order. A link containing both ``sap.com`` and ``note`` (compared
        case-insensitively) is a SAP note, even if it also ends in ``.pdf``.
    """
    pdf_links, sap_notes, others = [], [], []
    for link, page in links:
        lowered = link.lower()
        if "sap.com" in lowered and "note" in lowered:
            sap_notes.append((link, page))
        elif _is_pdf_link(link):
            # Also catches ".../file.pdf?download=1" and ".../file.pdf#page=3".
            pdf_links.append((link, page))
        else:
            others.append((link, page))
    return {"pdfs": pdf_links, "sap_notes": sap_notes, "others": others}

In [3]:
def is_sap_sso_redirect(content):
    """Check a response body for the markers this notebook treats as an SSO hand-off.

    Args:
        content: Response body as text.

    Returns:
        True only when ``content`` contains both ``authn.hana.ondemand.com``
        and ``saml2/sp/mds`` (matched case-sensitively) and also contains
        ``<form`` in any letter case; False otherwise.
    """
    required_markers = ("authn.hana.ondemand.com", "saml2/sp/mds")
    if any(marker not in content for marker in required_markers):
        return False
    return "<form" in content.lower()


def _filename_from_url(url):
    """Derive a non-empty, directory-free file name from a URL."""
    parts = urlsplit(url)
    # Use the last non-empty path segment so ".../enEN/" maps to "enEN"
    # instead of an empty name; query string and fragment are excluded.
    segments = [segment for segment in parts.path.split("/") if segment]
    name = segments[-1] if segments else (parts.hostname or "")
    # Links come from arbitrary PDFs: keep the name inside save_dir.
    name = name.replace("\\", "_")
    if name in ("", ".", ".."):
        name = "download"
    return name


def download_document(url, save_dir="./linked_docs"):
    """Fetch ``url`` and store the response body under ``save_dir``.

    Args:
        url: Address to download.
        save_dir: Target directory; created if it does not exist.

    Returns:
        The path of the written file on success, the string
        ``"LOGIN_REQUIRED"`` when an HTML response is recognised by
        ``is_sap_sso_redirect``, or ``None`` for a non-200 status or any
        error while requesting or writing (the error is printed, not raised).
    """
    os.makedirs(save_dir, exist_ok=True)
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None

        content_type = response.headers.get("Content-Type", "")
        if "text/html" in content_type.lower() and is_sap_sso_redirect(response.text):
            return "LOGIN_REQUIRED"

        filename = os.path.join(save_dir, _filename_from_url(url))
        with open(filename, "wb") as f:
            f.write(response.content)
        return filename
    except Exception as exc:
        # Stay best-effort for callers, but say why the download failed.
        print(f"Download error for {url}: {exc}")
        return None


In [4]:
def retry_download(url, retries=3):
    """Call ``download_document`` up to ``retries`` times with backoff.

    Args:
        url: Address to download.
        retries: Maximum number of attempts; zero or less makes no attempt.

    Returns:
        The first truthy result of ``download_document`` (a file path or
        ``"LOGIN_REQUIRED"``), or ``None`` if every attempt returned a falsy
        value.
    """
    for attempt in range(retries):
        result = download_document(url)
        if result:
            return result
        # Back off only between attempts; sleeping after the last failure
        # would just delay the None result.
        if attempt < retries - 1:
            time.sleep(2 ** attempt + random.random())
    return None


def process_extracted_links(pdf_path):
    """Extract, classify and download every link found in a PDF.

    Args:
        pdf_path: Path of the PDF whose links are processed.

    Returns:
        A tuple ``(downloaded_files, unavailable_docs, edge_metadata)``:
        the saved file paths, the URLs that failed or need a login, and one
        dict per ``(url, page)`` link with the keys ``source``, ``link``,
        ``type``, ``status`` and ``source_page``. ``status`` is one of
        ``"downloaded"``, ``"login_required"`` or ``"unavailable"``.
    """
    classified = classify_links(extract_links_from_pdf(pdf_path))

    downloaded_files = []
    unavailable_docs = []
    edge_metadata = []
    # The same URL can be linked from several pages; fetch it only once so a
    # dead link does not go through the full retry/backoff cycle per page.
    results_by_url = {}

    for category, links in classified.items():
        for url, page in links:
            print(f"Processing: {url}")
            if url not in results_by_url:
                results_by_url[url] = retry_download(url)
            path = results_by_url[url]

            if path == "LOGIN_REQUIRED":
                print("🔐 SAP login required. Skipping.")
                status = "login_required"
                unavailable_docs.append(url)
            elif path:
                print(f"Downloaded: {path}")
                status = "downloaded"
                downloaded_files.append(path)
            else:
                print("Failed to download.")
                status = "unavailable"
                unavailable_docs.append(url)

            # One metadata record per link occurrence; only the status varies.
            edge_metadata.append({
                "source": pdf_path,
                "link": url,
                "type": category,
                "status": status,
                "source_page": page
            })

    return downloaded_files, unavailable_docs, edge_metadata


In [5]:
def load_downloaded_documents(download_dir="./linked_docs"):
    """Load everything under ``download_dir`` through ``SimpleDirectoryReader``.

    Args:
        download_dir: Directory handed to the reader as ``input_dir``.

    Returns:
        Whatever the reader's ``load_data()`` call returns.
    """
    # NOTE(review): SimpleDirectoryReader is not imported in the visible
    # cells (presumably llama_index) — confirm and add it to the import cell.
    return SimpleDirectoryReader(input_dir=download_dir).load_data()

In [6]:
if __name__ == "__main__":
    # Driver cell: run the link pipeline on one input PDF and print the results.
    # NOTE(review): absolute, machine-specific path — consider a configurable
    # input directory defined in a config cell.
    main_pdf_path = "/workspace/OllamaGraphRAGPoC/input-dir/split_5.pdf"

    # Unpacks, in order: saved file paths, URLs that failed or need a login,
    # and one metadata dict per processed link.
    downloaded, unavailable, metadata = process_extracted_links(main_pdf_path)

    print("\n Downloaded files:", downloaded)
    print("\n Unavailable or Login Required Documents:", unavailable)
    print("\n Link Metadata:")
    # One metadata dict per line.
    for m in metadata:
        print(m)

Processing: http://me.sap.com/notes/3372365
Downloaded: ./linked_docs/3372365
Processing: https://me.sap.com/notes/3102813
Downloaded: ./linked_docs/3102813
Processing: https://www.sap.com/dmc/exp/2014-09-02-hana-hardware/enEN/
Failed to download.
Processing: https://docs.vmware.com/en/VMware-vSphere/8.0/vsphere-vm-administration/GUID-789C3913-1053-4850-A0F0-E29C3D32B6DA.html
Downloaded: ./linked_docs/GUID-789C3913-1053-4850-A0F0-E29C3D32B6DA.html
Processing: https://docs.vmware.com/en/VMware-Cloud-Foundation/5.1/vcf-design/GUID-5B0A8D19-E82C-49B6-BA36-D72FF0A4F9C7.html
Downloaded: ./linked_docs/GUID-5B0A8D19-E82C-49B6-BA36-D72FF0A4F9C7.html
Processing: https://docs.vmware.com/en/VMware-Cloud-Foundation/5.1/vcf-design/GUID-A550B597-463F-403F-BE9A-BFF3BECB9523.html
Downloaded: ./linked_docs/GUID-A550B597-463F-403F-BE9A-BFF3BECB9523.html
Processing: https://www.sap.com/dmc/exp/2014-09-02-hana-hardware/enEN/
Failed to download.
Processing: https://docs.vmware.com/en/VMware-Cloud-Foundatio