#### How to split by HTML Header

HTMLHeaderTextSplitter is a "structure-aware" chunker that splits text at the HTML element level and adds metadata for each header "relevant" to any given chunk. It can return chunks element by element or combine elements with the same metadata, with the objectives of:

(a) Keeping related text grouped (more or less) semantically, and

(b) Preserving context-rich information encoded in document structures. It can be used with other text splitters as part of a chunking pipeline.

In [1]:
from langchain_text_splitters import HTMLHeaderTextSplitter
# Sample input for the splitter: a complete HTML page held in a single string.
# It contains an inline <style> block, one <h2> title, and five <h3> sections,
# each followed by a <table>. The table bodies hold template-style placeholders
# ({% for ... %} / {{ ... }}) that are NOT rendered here, so they reach the
# splitter as literal text.
html_string= """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Spice Purchase Report</title>
    <style>
        /* General Styling */
        body {
            font-family: 'Arial', sans-serif;
            background-color: #f4f4f4;
            margin: 0;
            padding: 20px;
            color: #333;
        }
        
        h2 {
            text-align: center;
            color: #D32F2F; /* Red Heading */
            margin-bottom: 10px;
        }

        h3 {
            color: #D32F2F; /* Red Subheading */
            border-bottom: 2px solid #D32F2F;
            padding-bottom: 5px;
            margin-top: 30px;
        }

        /* Table Styling */
        table {
            width: 100%;
            border-collapse: collapse;
            background: white;
            margin-top: 15px;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1);
        }

        th, td {
            padding: 12px;
            text-align: center;
            border-bottom: 1px solid #ddd;
        }

        th {
            background-color: #D32F2F; /* Red Header */
            color: white;
            font-weight: bold;
        }

        tr:nth-child(even) {
            background-color: #f2f2f2;
        }

        tr:hover {
            background-color: #ddd;
            transition: 0.3s;
        }

        /* Back Button */
        .back-btn {
            display: block;
            width: 150px;
            margin: 30px auto;
            padding: 10px 20px;
            background-color: #FF5722;
            color: #FFFFFF;
            border: none;
            border-radius: 5px;
            cursor: pointer;
            text-align: center;
            font-size: 16px;
            text-decoration: none;
        }

        .back-btn:hover {
            background-color: #E64A19;
        }

        /* Responsive Design */
        @media screen and (max-width: 768px) {
            table {
                width: 100%;
                display: block;
                overflow-x: auto;
            }

            th, td {
                white-space: nowrap;
            }
        }
    </style>
</head>
<body>
    <h2>Spice Purchase Report</h2>

    <!-- Kashmiri Chilli -->
    <h3>Kashmiri Chilli</h3>
    <table>
        <thead>
            <tr>
                <th>Material</th>
                <th>Total Quantity</th>
                <th>Weighted Average SHU</th>
                <th>Weighted Average Color</th>
            </tr>
        </thead>
        <tbody>
            {% for i in range(kashmiri_shu_data[0]|length) %}
            <tr>
                <td>{{ kashmiri_shu_data[0][i] }}</td>
                <td>{{ kashmiri_shu_data[1][i] }}</td>
                <td>{{ kashmiri_shu_data[2][i] }}</td>
                <td>{{ kashmiri_color_data[2][i] }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>

    <!-- Turmeric -->
    <h3>Turmeric</h3>
    <table>
        <thead>
            <tr>
                <th>Material</th>
                <th>Total Quantity</th>
                <th>Weighted Average Curcuminoid</th>
            </tr>
        </thead>
        <tbody>
            {% for i in range(turmeric_data[0]|length) %}
            <tr>
                <td>{{ turmeric_data[0][i] }}</td>
                <td>{{ turmeric_data[1][i] }}</td>
                <td>{{ turmeric_data[2][i] }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>

    <!-- Red Chilli -->
    <h3>Red Chilli</h3>
    <table>
        <thead>
            <tr>
                <th>Material</th>
                <th>Total Quantity</th>
                <th>Weighted Average SHU</th>
                <th>Weighted Average Color</th>
            </tr>
        </thead>
        <tbody>
            {% for i in range(red_chilli_shu_data[0]|length) %}
            <tr>
                <td>{{ red_chilli_shu_data[0][i] }}</td>
                <td>{{ red_chilli_shu_data[1][i] }}</td>
                <td>{{ red_chilli_shu_data[2][i] }}</td>
                <td>{{ red_chilli_color_data[2][i] }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>

    <!-- Coriander -->
    <h3>Coriander</h3>
    <table>
        <thead>
            <tr>
                <th>Material</th>
                <th>Total Quantity</th>
                <th>Weighted Average Volatile Oil</th>
            </tr>
        </thead>
        <tbody>
            {% for i in range(coriander_data[0]|length) %}
            <tr>
                <td>{{ coriander_data[0][i] }}</td>
                <td>{{ coriander_data[1][i] }}</td>
                <td>{{ coriander_data[2][i] }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>

    <!-- Pepper -->
    <h3>Pepper</h3>
    <table>
        <thead>
            <tr>
                <th>Material</th>
                <th>Total Quantity</th>
                <th>Weighted Average Volatile Oil</th>
            </tr>
        </thead>
        <tbody>
            {% for i in range(pepper_data[0]|length) %}
            <tr>
                <td>{{ pepper_data[0][i] }}</td>
                <td>{{ pepper_data[1][i] }}</td>
                <td>{{ pepper_data[2][i] }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>

    <!-- Back Button -->
    <a href="javascript:history.back()" class="back-btn">← Back</a>

</body>
</html>

"""

# Pair each heading tag with the metadata key its text is recorded under.
headers_to_split_on = [
    ("h1", "Header1"),
    ("h2", "Header2"),
    ("h3", "Header3"),
]

# Build the splitter and break the in-memory HTML into header-scoped Documents.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)

# Bare last expression so the notebook displays the resulting Documents.
html_header_splits

[Document(metadata={}, page_content='Kashmiri Chilli Turmeric Red Chilli Coriander Pepper Back Button'),
 Document(metadata={'Header2': 'Spice Purchase Report'}, page_content='Spice Purchase Report'),
 Document(metadata={'Header2': 'Spice Purchase Report', 'Header3': 'Kashmiri Chilli'}, page_content='Kashmiri Chilli'),
 Document(metadata={'Header2': 'Spice Purchase Report', 'Header3': 'Kashmiri Chilli'}, page_content='Material  \nTotal Quantity  \nWeighted Average SHU  \nWeighted Average Color  \n{% for i in range(kashmiri_shu_data[0]|length) %} {% endfor %}  \n{{ kashmiri_shu_data[0][i] }}  \n{{ kashmiri_shu_data[1][i] }}  \n{{ kashmiri_shu_data[2][i] }}  \n{{ kashmiri_color_data[2][i] }}'),
 Document(metadata={'Header2': 'Spice Purchase Report', 'Header3': 'Turmeric'}, page_content='Turmeric'),
 Document(metadata={'Header2': 'Spice Purchase Report', 'Header3': 'Turmeric'}, page_content='Material  \nTotal Quantity  \nWeighted Average Curcuminoid  \n{% for i in range(turmeric_data[0]|l

In [2]:
# Page whose HTML is split this time, instead of an in-memory string.
url = "https://www.dsgroup.com/"

# Heading tags to split on, each paired with the metadata key used for it.
headers_to_split_on = [
    ("h1", "Header1"),
    ("h2", "Header2"),
    ("h3", "Header3"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_splits = html_splitter.split_text_from_url(url)

# Bare last expression so the notebook displays the resulting Documents.
html_splits

 Document(metadata={'Header1': "DS Group, Home to India's Favourite Brands"}, page_content="DS Group, Home to India's Favourite Brands"),
 Document(metadata={'Header1': "DS Group, Home to India's Favourite Brands"}, page_content='Capturing the hearts of customers by placing them at the heart of everything we do.  \nMilestones  \nMore than 90 years of success, driven by countless achievements.  \nKnow more'),
 Document(metadata={'Header1': "DS Group, Home to India's Favourite Brands", 'Header2': 'Sustainability'}, page_content='Sustainability'),
 Document(metadata={'Header2': 'Sustainability'}, page_content='DS Group illustrates various aspects of its Sustainable Development journey comprising of governance, risks and opportunities, strategy, performance, key highlights and achievements, qualitative and quantitative data on various economic, environmental and social aspects. The report conforms to GRI Standards, ‘In Accordance – Core‘ option. GRI (Global Reorting Initiative) Standards a