### Chunking
#### Data insights
Chunking is crucial for a RAG system. Depending on the data source, there is no gold standard for choosing chunking methods and sizes.
Therefore, we first try to gain insights about our data.

In [21]:
import json

file_path = "../2_datasets/expert-dataset.json"
with open(file_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Word count of every annotated context piece, across all questions.
context_lengths = [
    len(ctx["page_content"].split())
    for js in data
    for ctx in js["context"]
]

words_sum = sum(context_lengths)  # total number of words in all contexts
context_pieces = len(context_lengths)  # number of context pieces in expert data

# Take the extremes from the data itself rather than from a fixed start value,
# so the minimum is also correct when every piece is longer than 1000 words.
context_max_length = max(context_lengths, default=0)  # maximum length of context pieces
context_min_length = min(context_lengths, default=0)  # minimum length of context pieces

# maximum number of context pieces needed for a question
max_num_context_for_question = max((len(js["context"]) for js in data), default=0)

# Report 0 instead of dividing by zero when the dataset has no entries.
avg_ctx_pieces = context_pieces / len(data) if data else 0
avg_ctx_length = round(words_sum / context_pieces) if context_pieces else 0

print("Anotated context has a total of", words_sum, "words.")
print("Thats an average of rounded", avg_ctx_length, "words in contexts.")
print("The longest context piece is", context_max_length, "words long.")
print("The shortest context piece is", context_min_length, "words short.")
print("On average, a question can be answered with", avg_ctx_pieces, "context pieces.")
print("The max number of context pieces to answer a question is", max_num_context_for_question, ".")

Anotated context has a total of 18027 words.
Thats an average of rounded 237 words in contexts.
The longest context piece is 848 words long.
The shortest context piece is 30 words short.
On average, a question can be answered with 1.4339622641509433 context pieces.
The max number of context pieces to answer a question is 6 .


### HTML to Markdown
We load our HTML data, remove irrelevant tags, and replace elements such as images and icons with Markdown placeholders. Using the langchain **Html2TextTransformer**, we then convert our HTML to Markdown. Results can be found in the 'markdown' directory.

In [None]:
from bs4 import BeautifulSoup
import os
from langchain.docstore.document import Document
from langchain_community.document_transformers import Html2TextTransformer

def processHTMLContents(content, *, icons_path="", images_path=""):
    """Clean a parsed HTML fragment in place before Markdown conversion.

    Images and icons are replaced by Markdown-style placeholders that keep
    their source path, check-mark icons become a plain ``X``, and header and
    button elements are removed.

    Args:
        content: Parsed element offering ``find_all`` (e.g. a BeautifulSoup tag).
        icons_path: Prefix of ``src`` values that identify icons.
        images_path: Prefix of ``src`` values that identify images.

    Returns:
        None. ``content`` is modified in place.
    """
    for img in content.find_all('img'):
        # An <img> without a src has nothing to link to; leave it untouched.
        path = img.get('src')
        if path is None:
            continue

        # Replacing images and icons with custom tags linked to source.
        # The branches are exclusive: an element can only be replaced once,
        # and with empty prefixes (the default) both checks would match.
        if path.startswith(icons_path):
            img.replace_with('![Icon](' + path + ')')
        elif path.startswith(images_path):
            img.replace_with('![Image](' + path + ')')

    # Replace check-mark icons (as used in checkbox tables) with a plain "X"
    for tag in content.find_all('i', class_='fal fa-check fa-2x'):
        tag.replace_with('X')

    # Remove noise like header and button elements
    for header in content.find_all('header'):
        header.decompose()

    for btn in content.find_all('button'):
        btn.decompose()

def parse(file_path, directory="./html"):
    """Convert one HTML file into Markdown documents.

    Args:
        file_path: Name of the HTML file, relative to ``directory``. It is also
            stored as the ``source`` metadata of the created document.
        directory: Folder that contains the HTML files.

    Returns:
        The documents returned by ``Html2TextTransformer.transform_documents``
        for the cleaned page.

    Raises:
        ValueError: If the page has no element with the class
            ``TopicViewer_container``.
    """
    file_path_ = os.path.join(directory, file_path)
    with open(file_path_, encoding="utf8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "lxml")
    content = soup.find(class_="TopicViewer_container")  # Content of HTML
    if content is None:
        raise ValueError(
            f"No element with class 'TopicViewer_container' found in {file_path_}"
        )

    # Cleans the element in place, so the changes are part of `soup` as well.
    processHTMLContents(content)

    # str.replace returns a new string, so the result has to be kept: strip
    # non-breaking spaces from the text that actually goes into the document.
    # NOTE(review): the document is built from the whole page (`soup`), not
    # only from `content` - confirm that this is intended.
    page_html = str(soup).replace('\xa0', '')

    # Add document source as metadata
    doc = Document(page_content=page_html, metadata={"source": file_path})

    # Transform to markdown
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents([doc])

    return docs_transformed

In [10]:
import os

directory = "./html"
files = os.listdir(directory)
output_directory = "./markdown/"

# Make sure the target folder exists before any file is written into it.
os.makedirs(output_directory, exist_ok=True)

# Filter HTML files
html_files = [file for file in files if file.endswith('.html')]

# One entry per HTML file: the documents returned by parse() for that file.
document_chunks = []

for file in html_files:
    chunks = parse(file)
    document_chunks.append(chunks)
    # Swap only the extension; str.replace would also rewrite any ".html"
    # that occurs elsewhere in the file name.
    text_file_name = os.path.splitext(file)[0] + '.md'
    output_path = os.path.join(output_directory, text_file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(chunks[0].page_content)

### Splitting markdown files by headers
With the langchain **MarkdownHeaderTextSplitter** we can split our documents without losing relevant context in chunks.
We append header text information to our metadata.

In [None]:
import os
from langchain.docstore.document import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown_dir = './markdown/'
# Only read Markdown files; other directory entries (for example notebook
# checkpoint folders) cannot be opened as text files.
md_files = [name for name in os.listdir(markdown_dir) if name.endswith('.md')]

chunks = []

# Header markers to split on and the metadata key each header text is stored under.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4")
]

# The configuration is the same for every file, so one splitter is reused.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

for md in md_files:
    md_path = markdown_dir + md
    with open(md_path, encoding="utf8") as f:
        md_txt = f.read()

    for chunk in markdown_splitter.split_text(md_txt):
        # Keep the header metadata and remember which file the chunk came from.
        meta = chunk.metadata
        meta["source-document"] = md
        chunks.append(Document(page_content=chunk.page_content, metadata=meta))

In [None]:
# Report the total number of chunks collected from all markdown files.
chunk_total = len(chunks)
print(f"Splitting HTML files into {chunk_total} markdown chunks.")

Splitting HTML files into 521 markdown chunks.


In [None]:
import statistics

# Number of whitespace-separated words in each chunk.
# Dedicated names are used so that neither the `chunks` list nor the
# built-ins `min`/`max` are overwritten for the cells that follow.
c_list = [len(item.page_content.split()) for item in chunks]

if c_list:
    print("Min:", min(c_list))
    print("Max:", max(c_list))
    # Mean number of words per chunk (total words / number of chunks).
    print("Average:", sum(c_list) / len(c_list))
    print("Median:", statistics.median(c_list))
else:
    # Without chunks there is nothing to average or take a median of.
    print("No chunks available.")

Min: 4
Max: 1874
Average: 222.43186180422265
Median: 134


While a chunk with a minimum length of only *4* words might not contain relevant information, we see that it is a good decision not to split our data with a static chunk size.
However, we see that the mean and the median are pretty close in size.

In [18]:
from  langchain.schema import Document
import json
from typing import Iterable

#https://github.com/langchain-ai/langchain/issues/3016

def save_docs_to_jsonl(array: "Iterable[Document]", file_path: str) -> None:
    """Write documents to a JSON Lines file, one serialized document per line.

    Args:
        array: Documents to store; each one is serialized with its ``json()``
            method.
        file_path: Path of the file to write. An existing file is overwritten.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding and consistent with the other files written in this notebook.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')

We save our processed chunks to *chunks.jsonl*

In [None]:
# Write all chunks to a JSON Lines file, one document per line.
chunks_path = "./chunks.jsonl"
save_docs_to_jsonl(array=chunks, file_path=chunks_path)