# Create your Index

This notebook runs through the creation of an Azure AI Search index using locally stored documents. It uses Beautiful Soup to parse HTML documents, and then LangChain's HTMLHeaderTextSplitter and RecursiveCharacterTextSplitter to chunk the data (PDF, DOCX and TXT files are loaded with the matching LangChain document loaders). The sizes of these chunks can be configured.

Vector embeddings are created using text-embedding-ada-002. Currently the content chunks and their generated summaries are embedded, but more can be added, such as title, if desired (see the commented fields in the index definition for examples).

In [None]:
import os
import hashlib

import openai
from openai import AzureOpenAI, BadRequestError
from tqdm import tqdm
from dotenv import load_dotenv
from bs4 import BeautifulSoup

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, Document, HTMLHeaderTextSplitter, TokenTextSplitter
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader
from langchain.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    CorsOptions,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    VectorSearchProfile,
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,
)

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff


Make sure to set up a .env file first (see .env.template for an example)

In [None]:
load_dotenv()  # read the .env file so the os.getenv(...) lookups in later cells can find their settings

Create a document out of the page, removing all of the table data

Split a document into sections by h2 header.

Within each section, check if there are tables. If there are tables, remove all attributes from the HTML tags but leave the table tags intact, to improve the LLM's ability to parse the structure of the table.

For the parts of the document that are not tables - process as normal, chunking as normal with overlap.

Parse all the tables into Documents

In [None]:
@retry(
    wait=wait_random_exponential(multiplier=1, max=10),
    stop=stop_after_attempt(5),
)
def generate_response(client, content, prompt, deployment_name):
    """Ask a chat deployment to process ``content`` under a system ``prompt``.

    Args:
        client: Object exposing ``chat.completions.create`` (the notebook
            passes an ``AzureOpenAI`` client).
        content: Text sent as the user message.
        prompt: Text sent as the system message.
        deployment_name: Value passed as ``model`` to the completion call.

    Returns:
        The text of the first choice, or ``None`` when the service rejects the
        request with ``BadRequestError`` (the error message and the offending
        content are printed in that case).

    Any other exception propagates to the ``@retry`` decorator, which is
    configured for at most 5 attempts with randomised exponential waits.
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content},
    ]

    try:
        completion = client.chat.completions.create(
            model=deployment_name,
            messages=conversation,
            temperature=0.0,
        )
    except BadRequestError as err:
        # A bad request is reported and swallowed here, so it does not reach
        # the retry decorator.
        print(err.message)
        print(content)
        return None
    else:
        return completion.choices[0].message.content

In [None]:
# System prompt used when extracting the contents of a single HTML table.
TABLE_EXTRACTION_PROMPT = "Given the below table, extract all of the relevant information contained within, and reformat it into a bulleted list. Each row should be one line, under 50 words apiece. Be as concise as possible while including all relevant figures, prices, and limitations of information in the row. \n\nTable:"

# System prompt used when summarising a chunk of page text.
SUMMARY_PROMPT = "Summarize the given text below into 4 sentences or fewer. Use Bullet points to summarize multiple data points efficiently. If the data is in the form of an HTML table, include as much detailed information from the table as possible. \n\nText:"

# Chat deployment that receives the prompts above.
DEPLOYMENT_NAME = "gpt-35-turbo"

# Azure OpenAI client used for summarisation; key and endpoint come from the environment.
CLIENT = AzureOpenAI(
    azure_endpoint=os.environ.get("OPENAI_ENDPOINT"),
    api_key=os.environ.get("OPENAI_API_KEY"),
    api_version="2023-12-01-preview",
)

In [None]:
def _nearest_table_headers(table):
    """Return "Header 3" / "Header 2" metadata for the headings preceding ``table``.

    Only the headings that are actually found are included in the result.
    """
    headers = {}
    h3 = table.find_previous('h3')
    if h3 is not None:
        headers["Header 3"] = h3.text.strip()
    h2 = table.find_previous('h2')
    if h2 is not None:
        headers["Header 2"] = h2.text.strip()
    return headers


def custom_html_chunker(filepath):
    """Split one HTML file into table documents and header-based text chunks.

    Steps:
      1. Parse the file and read its title plus the ``pagePath`` and
         ``updateDate`` meta tags.
      2. Split the ``#universal-content`` element (or the whole page when it is
         absent) at every ``<h2``, and turn each table found in a section into
         its own ``Document`` with all tag attributes stripped and an
         LLM-generated summary.
      3. Remove every table from the page, split the remaining markup on
         h1/h2/h3 headers, re-split oversized chunks, and summarise each chunk.

    Args:
        filepath: Path of the HTML file to read.

    Returns:
        list of ``Document``: the text chunks followed by the table documents.
        Every document carries ``title``, ``source``, ``last_updated`` and
        ``summary`` metadata.
    """
    with open(filepath, "r", encoding='utf-8', errors='ignore') as f:
        soup = BeautifulSoup(f, "html.parser")

    # <title> may be missing, or may have no plain string (empty / nested tags);
    # fall back to the file name in both cases.
    if soup.title is not None and soup.title.string:
        title = soup.title.string
    else:
        title = os.path.basename(filepath)

    # Defaults apply whenever the page has no usable pagePath / updateDate meta
    # tag, including pages that do contain other, unrelated meta tags.
    source = filepath
    last_updated = None
    for tag in soup.find_all('meta'):
        meta_name = tag.get('name')
        meta_content = tag.get('content')
        if meta_content is None:
            continue
        if meta_name == 'pagePath':
            source = meta_content
        elif meta_name == 'updateDate':
            last_updated = meta_content

    containers = soup.select("#universal-content")
    scoped_html = containers[0].prettify() if containers else soup.prettify()
    sections = scoped_html.split("<h2")

    # Split doc by H2, look for tables in each section, create docs of each table
    table_docs = []
    for index, section in enumerate(sections):
        # split() consumed the "<h2" marker of every section except the leading
        # preamble, so only those sections get it restored.
        section_html = section if index == 0 else "<h2" + section
        section_soup = BeautifulSoup(section_html, "html.parser")

        for table in section_soup.find_all("table"):
            metadata = {"title": title, "source": source, "last_updated": last_updated}
            metadata.update(_nearest_table_headers(table))

            # Drop every attribute on the table and its descendant tags while
            # keeping the tag structure itself.
            table.attrs = {}
            for tag in table.find_all(True):
                tag.attrs = {}

            page_content = table.prettify()
            for markup in ("<p>", "</p>", "<br/>", "<br>"):
                page_content = page_content.replace(markup, "")

            # Only the first 10000 characters are sent for summarisation.
            summary = generate_response(CLIENT, page_content[:10000], TABLE_EXTRACTION_PROMPT, DEPLOYMENT_NAME)
            metadata["summary"] = summary if summary is not None else "No summary generated"

            table_docs.append(Document(page_content=page_content, metadata=metadata))

    ## Remove tables from soup
    for tbl in soup.find_all("table"):
        tbl.decompose()

    # Chunk the table-less content
    headers_to_split_on = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3")
    ]

    html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
    html_header_splits = html_splitter.split_text(soup.prettify())

    # Sometimes the header splits results in too large of documents for summarizing
    length_checked_docs = []
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=4000, chunk_overlap=400)
    for doc in html_header_splits:
        # len / 4 is used as a rough size estimate for the chunk.
        if (len(doc.page_content) / 4) > 5000:
            length_checked_docs.extend(text_splitter.split_documents([doc]))
        else:
            length_checked_docs.append(doc)

    for split in length_checked_docs:
        split.metadata["source"] = source
        split.metadata["title"] = title
        split.metadata["last_updated"] = last_updated
        # May be None when the request is rejected; later cells handle that case.
        split.metadata["summary"] = generate_response(CLIENT, split.page_content, SUMMARY_PROMPT, DEPLOYMENT_NAME)

    return length_checked_docs + table_docs
    


## Use BeautifulSoup to parse HTML
This section assumes your data is stored locally for the time being - this will need to be adjusted for scale.
Set your root_dir to where you have saved your documents to be indexed.

In [None]:
from tqdm import tqdm

root_dir = "./data/"

# Loaders for the formats that need no custom chunking, keyed by file suffix.
simple_loaders = {
    '.pdf': PyPDFLoader,
    '.docx': Docx2txtLoader,
    '.txt': lambda path: TextLoader(path, encoding="utf-8"),
}

docs = []    # every chunk produced, across all files
failed = []  # paths of HTML files whose chunking raised
cnt = 0      # number of HTML files attempted

# Walk the data folder and load each supported file
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in tqdm(filenames):
        matched_suffix = next((suffix for suffix in simple_loaders if file.endswith(suffix)), None)
        if matched_suffix is not None:
            loader = simple_loaders[matched_suffix](os.path.join(dirpath, file))
            docs.extend(loader.load_and_split())
            continue

        if not file.endswith('.html'):
            # Any other file type is ignored.
            continue

        # HTML goes through the custom chunker; a failure is recorded and the
        # walk carries on with the next file.
        try:
            docs += custom_html_chunker(os.path.join(dirpath, file))
        except Exception as e:
            print(file)
            print(e)
            failed.append(os.path.join(dirpath, file))
        cnt = cnt + 1

## Define the Search Index

This method creates an Azure AI Search Index using the azure.search.documents SDK. These fields are configurable, but you will need to make sure they are then updated in the `QnA Promptflow` `flow.dag.yaml` file to match, along with any of the `.py` files that rely on those fields.

In [None]:
# Create Search Index
def create_acs_index(
    service_endpoint, index_name, key
):
    """Create or update an Azure AI Search index and print the service's response.

    The index holds the text fields written by the upload cell (``id``,
    ``page_content``, ``title``, ``headers``, ``summary``, ``last_updated``,
    ``source``) plus two 1536-dimension vector fields (``contentVector`` and
    ``contentSummary``), an HNSW vector search profile, one semantic
    configuration and a CORS policy that allows every origin.

    Args:
        service_endpoint: Endpoint passed to ``SearchIndexClient``.
        index_name: Name given to the ``SearchIndex``.
        key: Key wrapped in an ``AzureKeyCredential``.

    Returns:
        None. The result of ``create_or_update_index`` is printed.
    """
    credential = AzureKeyCredential(key)

    # Create a search index
    index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)
    fields = [
        # Document key.
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(
            name="page_content",
            type=SearchFieldDataType.String,
            searchable=True,
            retrievable=True,
        ),
        SearchableField(
            name="title",
            type=SearchFieldDataType.String,
            searchable=True,
            retrievable=True,
        ),
        SearchableField(
            name="headers",
            type=SearchFieldDataType.String,
            searchable=True,
            retrievable=True,
        ),
        SearchableField(
            name="summary",
            type=SearchFieldDataType.String,
            searchable=True,
            retrievable=True,
        ),
        # SearchableField(
        #     name="url",
        #     type=SearchFieldDataType.String,
        #     filterable=True,
        #     searchable=False,
        #     retrievable=True,
        # ),
        # NOTE(review): this field is declared through SearchableField with a
        # DateTimeOffset type, while the upload cell sends the value as a string
        # (possibly empty) - confirm which type the created index ends up with.
        SearchableField(
            name="last_updated",
            type=SearchFieldDataType.DateTimeOffset,
            retrievable=True,
        ),
        SearchableField(
            name="source",
            type=SearchFieldDataType.String,
            filterable=True,
            searchable=False,
            retrievable=True,
        ),
        # Vector field for the chunk content embedding.
        SearchField(
            name="contentVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=int(1536),
            vector_search_profile="my-vector-search-profile",
        ),
        # These are example additional vector fields or string fields that can be added to the index
        # SearchField(
        #     name="contentTitle",
        #     type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        #     searchable=True,
        #     vector_search_dimensions=int(dimension),
        #     vector_search_profile="my-vector-search-profile",
        # ),
        # Vector field for the summary embedding.
        SearchField(
            name="contentSummary",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=int(1536),
            vector_search_profile="my-vector-search-profile",
        ),
        # SearchField(
        #     name="contentDescription",
        #     type=SearchFieldDataType.String,
        #     sortable=True,
        #     filterable=True,
        #     facetable=True,
        #     analyzer_name=analyzer,
        # ),
    ]

    # This configures the vector search algorithm (HNSW with cosine distance).
    # NOTE(review): the original comment said these parameters are the algorithm
    # defaults - confirm against the service documentation.
    vector_search = VectorSearch(
        algorithms=[
            HnswVectorSearchAlgorithmConfiguration(
                name="my-vector-config",
                parameters=HnswParameters(
                    m=4,
                    ef_construction=int(400),
                    ef_search=int(400),
                    metric="cosine",
                ),
            )
        ],
        # The profile name is what the vector fields above refer to.
        profiles=[
            VectorSearchProfile(
                name="my-vector-search-profile", algorithm="my-vector-config"
            )
        ],
    )

    # Semantic configuration: title from "title", content from "page_content" then "summary".
    semantic_config = SemanticConfiguration(
        name="vzw-semantic-config",
        prioritized_fields=PrioritizedFields(
            prioritized_content_fields=[
                SemanticField(field_name="page_content"),
                SemanticField(field_name="summary")
            ],
            title_field=SemanticField(field_name="title"),            
        ),
    )

    # Create the semantic settings with the configuration
    semantic_settings = SemanticSettings(configurations=[semantic_config])


    # CORS: every origin is allowed.
    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
    # No scoring profiles are defined.
    scoring_profiles = []

    # Create the search index with the vector search, semantic, scoring-profile and CORS settings
    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_settings=semantic_settings,
        scoring_profiles=scoring_profiles,
        cors_options=cors_options,
    )
    result = index_client.create_or_update_index(index)
    print(result)

Set your index name, credentials, and create the index

In [None]:

# Azure AI Search connection settings, read from the environment.
endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
search_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# Name of the index to create; later cells upload into the same index.
index_name = "vbg-index-summaries-v2"

create_acs_index(
    service_endpoint=endpoint,
    index_name=index_name,
    key=search_key,
)

This function requests an embedding for a piece of text from the Azure OpenAI service, retrying with exponential backoff when a call fails.

In [None]:

@retry(
    stop=stop_after_attempt(6),
    wait=wait_random_exponential(min=5, max=90),
)
def get_embedding(client, content):
    """Return the embedding vector for ``content``.

    The deployment to call is read from the
    ``OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME`` environment variable on every
    call. The ``@retry`` decorator is configured for at most 6 attempts with
    randomised exponential waits.

    Args:
        client: Object exposing ``embeddings.create`` (the notebook passes an
            ``AzureOpenAI`` client).
        content: Text to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    deployment = os.environ.get('OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME')
    response = client.embeddings.create(model=deployment, input=content)
    return response.data[0].embedding

## Generate the embeddings
This took about 5 minutes for the test dataset.

In [None]:

# Content longer than this many characters is truncated before being embedded.
MAX_EMBEDDING_CHARS = 22000
# Text embedded in place of a summary when a document has none.
NO_SUMMARY_PLACEHOLDER = "No summary generated"

client = AzureOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version=os.getenv('OPENAI_API_VERSION'),
    azure_endpoint=os.getenv('OPENAI_ENDPOINT'),
)

# Add embeddings
for doc in tqdm(docs):
    # Slicing leaves shorter content untouched, so no length check is needed.
    doc.metadata["embedding"] = get_embedding(client, doc.page_content[:MAX_EMBEDDING_CHARS])

    # Documents loaded from PDF/DOCX/TXT files never receive a "summary" key,
    # and HTML chunks can carry None when summarisation was rejected; use the
    # placeholder for a missing, None or empty summary instead of failing.
    summary_text = doc.metadata.get("summary") or NO_SUMMARY_PLACEHOLDER
    doc.metadata["summary_embedding"] = get_embedding(client, summary_text)

## Upload to Azure AI Search

This takes about 3-5 minutes for the test dataset, very dependent on networking

In [None]:
# Metadata keys every document must carry before upload.
props = [
    'Header 1',
    'Header 2',
    'Header 3',
    'source',
    'title',
    'last_updated',
    'summary',
    'embedding',
    'summary_embedding',
]

for doc in docs:
    meta = doc.metadata
    for key in props:
        if key not in meta:
            # Absent keys are back-filled with an empty string.
            meta[key] = ""
        elif meta[key] is None:
            # A None summary embedding is replaced by the placeholder embedding;
            # every other None becomes an empty string.
            meta[key] = get_embedding(client, "No summary generated") if key == "summary_embedding" else ""

In [None]:
#docs[5].metadata['last_updated']

In [None]:

credential = AzureKeyCredential(search_key)

header_options = [
    "Header 1",
    "Header 2",
    "Header 3"
]

search_client = SearchClient(
    endpoint=endpoint, index_name=index_name, credential=credential
)

documents = []       # every payload sent, kept for inspection
failed_uploads = []  # ids of documents the service did not accept

for chunk in tqdm(docs):
    # The id is the MD5 of the chunk text, so two chunks with identical text
    # share one id and the later upload replaces the earlier one.
    doc_id = hashlib.md5(chunk.page_content.encode()).hexdigest()

    # The normalisation cell back-fills missing headers with "", so keep only
    # the headers that actually have a value.
    headers = [
        chunk.metadata[header]
        for header in header_options
        if chunk.metadata.get(header)
    ]

    input_data = {
        "id": str(doc_id),
        "title": str(chunk.metadata["title"]),
        "page_content": str(chunk.page_content),
        "source": str(chunk.metadata["source"]),
        "last_updated": str(chunk.metadata["last_updated"]),
        "contentVector": chunk.metadata["embedding"],
        "contentSummary": chunk.metadata["summary_embedding"],
        "summary": str(chunk.metadata["summary"]),
        "headers": str(headers),
    }

    documents.append(input_data)

    # upload_documents returns one indexing result per document; record any
    # document that was not accepted instead of ignoring the outcome.
    results = search_client.upload_documents(documents=[input_data])
    if not all(result.succeeded for result in results):
        failed_uploads.append(doc_id)

if failed_uploads:
    print(f"{len(failed_uploads)} document(s) failed to upload: {failed_uploads}")