In [1]:
from dotenv import load_dotenv
load_dotenv()
from bs4 import BeautifulSoup
from bs4.element import Comment

def extract_leaves_text(html_file):
    """
    Read an HTML file and return the stripped text of its visible leaf text nodes.

    A text node is treated as a leaf when it is visible and no *other* visible
    text node sits directly under the same parent tag. Results are returned in
    the order ``soup.find_all(string=True)`` yields the nodes.

    Args:
        html_file (str): Path to the HTML file (read as UTF-8).

    Returns:
        list[str]: Whitespace-stripped text of every leaf node.
    """
    # Text whose direct parent is one of these is never considered visible.
    non_rendered_parents = ["style", "script", "head", "title", "meta", "[document]"]

    with open(html_file, "r", encoding="utf-8") as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, "html.parser")

    def is_visible(node):
        """True for non-comment, non-blank text outside the non-rendered parents."""
        if node.parent.name in non_rendered_parents:
            return False
        if isinstance(node, Comment):
            return False
        return bool(node.string and node.string.strip())

    def is_leaf(node):
        """True when `node` is visible and has no other visible direct sibling text."""
        if not is_visible(node):
            return False
        direct_strings = node.parent.find_all(string=True, recursive=False)
        # NOTE(review): `!=` compares text values, not node identity, so sibling
        # text nodes with identical text do not disqualify each other. Kept
        # as-is; compute_branch_embeddings_with_leaves relies on the same rule.
        return not any(other != node and is_visible(other) for other in direct_strings)

    return [node.strip() for node in soup.find_all(string=True) if is_leaf(node)]

# Visible leaf texts of about.html; these are the strings embedded in the next cell.
leaves = extract_leaves_text('about.html')

In [2]:
from openai import OpenAI
from Embedding import get_embedding, batch_get_embeddings
from tqdm import tqdm
# No key is hard-coded; presumably the client falls back to the environment
# populated by load_dotenv() when api_key is None — confirm.
client = OpenAI(api_key=None)

# One get_embedding call per leaf text, in the same order as `leaves`.
leaf_embeddings = [get_embedding(client, leaf) for leaf in tqdm(leaves)]

100%|██████████| 93/93 [00:30<00:00,  3.01it/s]


In [4]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import numpy as np

def compute_branch_embeddings_with_leaves(html_file, leaf_embeddings):
    """
    Pair each branch (tag) of an HTML file with the mean embedding of its leaf texts.

    Leaf text nodes are detected with the same visibility/leaf rules used by
    ``extract_leaves_text`` and are matched to ``leaf_embeddings`` by position.
    A tag is reported as a branch only when at least one leaf text node is its
    *direct* child; leaves nested deeper are attributed to their own parent
    tag and are not rolled up into ancestors.

    Args:
        html_file (str): Path to HTML file (read as UTF-8).
        leaf_embeddings (list[np.array]): One embedding per leaf text, in the
            order the leaves appear in ``soup.find_all(string=True)``.

    Returns:
        list of dicts, one per branch, in ``soup.find_all(True)`` order: [
            {
                "branch_html": str,
                "average_embedding": np.array,
                "leaf_texts": list[str]
            },
            ...
        ]

    Raises:
        ValueError: If the number of embeddings differs from the number of
            leaf text nodes found in the file.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, "html.parser")

    def is_visible(element):
        if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
            return False
        if isinstance(element, Comment):
            return False
        if not element.string or not element.string.strip():
            return False
        return True

    def is_leaf(element):
        """Check if element is a visible leaf text node."""
        if not is_visible(element):
            return False
        # NOTE(review): `!=` compares text values, not node identity. Kept
        # unchanged so the leaf set stays identical to extract_leaves_text,
        # which the positional embedding match below depends on.
        for sibling in element.parent.find_all(string=True, recursive=False):
            if sibling != element and is_visible(sibling):
                return False
        return True

    # Step 1: Collect all leaf text nodes in order
    leaf_nodes = [node for node in soup.find_all(string=True) if is_leaf(node)]

    if len(leaf_nodes) != len(leaf_embeddings):
        raise ValueError(f"Number of leaf embeddings ({len(leaf_embeddings)}) does not match number of leaves ({len(leaf_nodes)}).")

    # Step 2: Map leaf nodes to embeddings by index.
    # Keyed by id(node) rather than the node itself: text nodes hash and compare
    # as plain strings, so two different nodes with the same text would share
    # one dict entry and the later embedding would overwrite the earlier one.
    # `leaf_nodes` keeps the nodes alive, so the ids stay unique.
    embedding_by_node_id = {
        id(node): embedding for node, embedding in zip(leaf_nodes, leaf_embeddings)
    }

    branch_data = []

    # Step 3: Walk DOM and collect branch-level embeddings.
    # Only direct text children can belong to `tag`, so a non-recursive scan is
    # enough; membership in the id map is exactly "this node passed is_leaf".
    for tag in soup.find_all(True):
        leaf_texts = []
        embeddings = []

        for child in tag.find_all(string=True, recursive=False):
            node_id = id(child)
            if node_id in embedding_by_node_id:
                leaf_texts.append(child.strip())
                embeddings.append(embedding_by_node_id[node_id])

        if embeddings:
            avg_embedding = np.mean(embeddings, axis=0)
            branch_data.append({
                "branch_html": str(tag),
                "average_embedding": avg_embedding,
                "leaf_texts": leaf_texts
            })

    return branch_data

In [5]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Return the cosine of the angle between two vectors.

    Args:
        vec1: Array-like vector.
        vec2: Array-like vector of the same length.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero norm (avoids dividing by zero).
    """
    a = np.array(vec1)
    b = np.array(vec2)
    magnitudes = (np.linalg.norm(a), np.linalg.norm(b))
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0
    return np.dot(a, b) / (magnitudes[0] * magnitudes[1])


def find_top_branches(html_file, leaf_embeddings, query_embedding, top_n=10):
    """
    Rank the branches of an HTML file by cosine similarity to a query embedding.

    Branches and their average embeddings come from
    ``compute_branch_embeddings_with_leaves``; each one is scored against the
    query and the best ``top_n`` are returned, highest similarity first.

    Args:
        html_file (str): Path to HTML file.
        leaf_embeddings (list[np.array]): List of embeddings for leaf texts.
        query_embedding (np.array): Query embedding vector.
        top_n (int): Number of top branches to return.

    Returns:
        list of dict: Each dict contains:
            {
                "branch_html": str,
                "average_embedding": np.array,
                "leaf_texts": list[str],
                "similarity": float
            }
        An empty list is returned when the file yields no branches.
    """
    candidates = compute_branch_embeddings_with_leaves(html_file, leaf_embeddings)
    if not candidates:
        return []

    # Attach the score to each branch dict so it travels with the result.
    for entry in candidates:
        entry["similarity"] = cosine_similarity(query_embedding, entry["average_embedding"])

    # Best match first.
    ranked = sorted(candidates, key=lambda entry: entry["similarity"], reverse=True)
    return ranked[:top_n]


In [6]:
# Natural-language question to match against the page's branches.
query = "What is TechVision’s security record?"
# Embedded with the same helper and client as the leaf texts — presumably the
# same model, so the vectors are comparable; confirm in Embedding.get_embedding.
query_embedding = get_embedding(client, text=query)

In [7]:
top_branches = find_top_branches("about.html", leaf_embeddings, query_embedding, top_n=10)

# Report each hit with a 1-based rank, followed by a blank separator line.
for rank, branch in enumerate(top_branches, start=1):
    print(f"Rank {rank}:")
    print("Similarity:", branch["similarity"])
    print("Branch HTML:", branch["branch_html"])
    print("Leaf texts:", branch["leaf_texts"])
    print()


Rank 1:
Similarity: 0.6902452688942217
Branch HTML: <h1 class="page-title">About TechVision</h1>
Leaf texts: ['About TechVision']

Rank 2:
Similarity: 0.5996240670410955
Branch HTML: <span class="logo-text">TechVision</span>
Leaf texts: ['TechVision']

Rank 3:
Similarity: 0.5996240670410955
Branch HTML: <h3 class="footer-title">TechVision</h3>
Leaf texts: ['TechVision']

Rank 4:
Similarity: 0.5914024293601443
Branch HTML: <p>© 2025 TechVision. All rights reserved.</p>
Leaf texts: ['© 2025 TechVision. All rights reserved.']

Rank 5:
Similarity: 0.5097741600488883
Branch HTML: <p class="mission-description">
                            At TechVision, we believe technology should empower, not complicate. 
                            Our mission is to deliver innovative solutions that transform how businesses 
                            operate, helping them thrive in an increasingly digital world.
                        </p>
Leaf texts: ['At TechVision, we believe technology should empo