# In [3]:
# 1. Import the HTML splitter
from langchain_text_splitters import HTMLHeaderTextSplitter
# HTMLHeaderTextSplitter is a LangChain tool that splits HTML documents on header tags such as <h1> and <h2>.

# 2. Create an example HTML string
# A small stand-in for the markup of a real web page. It nests <h2> and <h3>
# sections (with <p> paragraphs) under a single <h1>, so every header level the
# splitter is configured for in step 3 appears at least once. The previous
# version held a literal "..." placeholder instead of these sections, which
# ended up in the split text and left no <h2>/<h3> headers to split on.
html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>Foo</h1>
        <p>Some intro text about Foo.</p>
        <div>
            <h2>Bar main section</h2>
            <p>Some intro text about Bar.</p>
            <h3>Bar subsection 1</h3>
            <p>Some text about the first subtopic of Bar.</p>
            <h3>Bar subsection 2</h3>
            <p>Some text about the second subtopic of Bar.</p>
        </div>
        <div>
            <h2>Baz</h2>
            <p>Some text about Baz</p>
        </div>
        <br>
        <p>Some concluding text about Foo</p>
    </div>
</body>
</html>
"""

# 3. Choose the header tags to split on
# Each entry is a (tag, label) pair: the tag is the HTML header to split on and
# the label is the name that header level is given in the results
# (<h1> -> "Header 1", <h2> -> "Header 2", and so on).
headers_to_split_on = list(
    zip(
        ("h1", "h2", "h3"),
        ("Header 1", "Header 2", "Header 3"),
    )
)

# 4. Build the splitter and run it over the HTML string
# The splitter is configured with the (tag, label) pairs chosen in step 3.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# split_text returns a list of sections; each one carries a chunk of text
# together with the headers it was found under, named by the step-3 labels.
html_header_splits = html_splitter.split_text(html_string)
# What comes back depends on the contents of html_string. For example, the text
# under <h1>Foo</h1> is reported as:
#     Header 1: Foo -> text: "Some intro text about Foo."
# Sections under any <h2> / <h3> headers present in the markup are labelled
# "Header 2" / "Header 3" in the same way.

# 5. Apply the same approach to a real web page
# Target page: the Stanford Encyclopedia of Philosophy entry on Gödel.
url = "https://plato.stanford.edu/entries/goedel/"

# 6. Extend the header list down to <h4>
# Same (tag, label) pairs as before plus one deeper level, so the page is
# also split on <h4> sub-subsections. Dict items keep insertion order, so the
# resulting list runs h1 -> h4.
headers_to_split_on = list(
    {
        "h1": "Header 1",
        "h2": "Header 2",
        "h3": "Header 3",
        "h4": "Header 4",
    }.items()
)

# 7. Build a splitter for the extended header list and split the web page
# Rebinding html_splitter / html_header_splits replaces the values produced
# from the example string earlier in this cell.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# Here the input is a URL rather than an HTML string: split_text_from_url
# downloads the page and then splits it on the configured headers.
html_header_splits = html_splitter.split_text_from_url(url)
# Result: the Gödel article broken into sections, each labelled with the
# header(s) it appears under.


