# Chunks to `.txt` files

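This notebook evaluates different chunking methods with different parameter settings. For every source document and parameter combination, the resulting chunks are written to a `.txt` file so that the outputs can be compared.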

In [1]:
import os
import pathlib
from dotenv import load_dotenv, find_dotenv
from tqdm import tqdm
from pathlib import Path

# Load variables from the .env file located by find_dotenv() before reading them.
load_dotenv(find_dotenv())

# Root directory of the data, taken from the environment.
DATA_PATH = os.getenv("DATA_PATH")
if DATA_PATH is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; define it in the environment or in a .env file."
    )

# Sub data set whose documents are chunked in this notebook.
SUB_DATA_SET_PATH = os.path.join(DATA_PATH, "aktive_leistungen", "ark")

In [2]:
# Recursively collect everything below the sub data set whose name contains a dot.
all_file_paths_generator = Path(SUB_DATA_SET_PATH).rglob("*.*")

# rglob("*.*") also yields directories with a dot in their name, so keep regular
# files only. Sorting makes the processing order independent of the filesystem.
ALL_FILE_PATHS = sorted(str(f) for f in all_file_paths_generator if f.is_file())

### 1. RecursiveCharacterTextSplitter

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

In [4]:
# Output directory and parameter grid for the RecursiveCharacterTextSplitter runs.
SAVING_PATH = "Data/Chunks/RecursiveCharacterTextSplitter/"
CHUNK_SIZES = [1000, 1500, 2000]
CHUNK_OVERLAPS = [200, 350, 500]

# Regex separators:
#   1. two or more consecutive newlines,
#   2. a newline, either preceded by sentence-ending punctuation plus optional
#      whitespace or followed by optional whitespace,
#   3. a single sentence-ending punctuation mark.
# "\n" is deliberately left as a literal newline character and only "\s" is
# escaped: the resulting strings are identical to the previous ones, but the
# invalid "\s" escape sequence (a warning on recent Python versions) is gone.
separators = ["\n{2,}", "(?<=[.?!])\\s*\n|\n\\s*", "[.!?]"]

# zip() silently truncates to the shorter list, so fail loudly on a mismatch.
assert len(CHUNK_SIZES) == len(CHUNK_OVERLAPS), "CHUNK_SIZES and CHUNK_OVERLAPS must have the same length"

# One (chunk_size, chunk_overlap, separators) tuple per splitter configuration.
rcts_parameters = [
    (chunk_size, chunk_overlap, separators)
    for chunk_size, chunk_overlap in zip(CHUNK_SIZES, CHUNK_OVERLAPS)
]
rcts_parameters

[(1000, 200, ['\n{2,}', '(?<=[.?!])\\s*\n|\n\\s*', '[.!?]']),
 (1500, 350, ['\n{2,}', '(?<=[.?!])\\s*\n|\n\\s*', '[.!?]']),
 (2000, 500, ['\n{2,}', '(?<=[.?!])\\s*\n|\n\\s*', '[.!?]'])]

In [5]:
# Chunk every supported document with each RecursiveCharacterTextSplitter
# configuration and write one report file per (document, chunk_size) to
# <SAVING_PATH>/<chunk_size>/<chunk_size>_<file name without extension>.txt

# The splitters do not depend on the document, so build them once up front.
rcts_splitters = [
    (
        chunk_size,
        chunk_overlap,
        rcts_separators,
        RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=rcts_separators,
            is_separator_regex=True,
            strip_whitespace=True,
        ),
    )
    for chunk_size, chunk_overlap, rcts_separators in rcts_parameters
]

with tqdm(ALL_FILE_PATHS) as iterator:
    for file_path in iterator:
        source_path = Path(file_path)
        # File name without extension. Path.stem uses the platform's path
        # separator (splitting on "\\" only worked on Windows) and strips only
        # the last suffix, so "a.b.pdf" is no longer truncated to "a".
        file_name = source_path.stem
        iterator.set_postfix_str(f"Processing {file_name}")

        # Loading the document
        suffix = source_path.suffix.lower()
        if suffix == ".pdf":
            docs = PyPDFLoader(file_path).load()
        elif suffix == ".docx":
            docs = Docx2txtLoader(file_path).load()
        else:
            # No loader for this file type: skip it instead of handing None
            # to split_documents(), which would raise.
            continue

        for chunk_size, chunk_overlap, rcts_separators, splitter in rcts_splitters:
            chunks = splitter.split_documents(docs)

            # Saving the chunks (the target directory is created on demand)
            chunk_dir = os.path.join(SAVING_PATH, str(chunk_size))
            os.makedirs(chunk_dir, exist_ok=True)
            chunk_path = os.path.join(chunk_dir, str(chunk_size) + "_" + file_name + ".txt")

            with open(chunk_path, "w", encoding="utf-8") as f:
                # Header block describing the run
                f.write("=" * 150)
                f.write("\n")
                f.write(f"file:\t\t\t{file_path}\n")
                f.write(f"chunk_size:\t\t{chunk_size}\n")
                f.write(f"chunk_overlap:\t{chunk_overlap}\n")
                f.write(f"separators:\t\t{rcts_separators}\n")
                f.write(f"n_chunks:\t\t{len(chunks)}\n")
                f.write("=" * 150)
                f.write("\n\n\n\n")
                # One section per chunk; no trailing spacer after the last one
                for i, chunk in enumerate(chunks, 1):
                    f.write(f"Chunk #{i}\n")
                    f.write("-" * 150)
                    f.write("\n")
                    f.write(chunk.page_content)
                    if i != len(chunks):
                        f.write("\n\n\n\n")


  0%|          | 0/17 [00:00<?, ?it/s, Processing aktive_leistungen_bei_darlehensweiser_passiver_leistungsgewaehrung]

100%|██████████| 17/17 [00:03<00:00,  5.50it/s, Processing zahlungsmodalitaeten]                                     


## Unstructured

In [25]:
from dotenv import load_dotenv, find_dotenv
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.docx import partition_docx
from unstructured.partition.doc import partition_doc
from unstructured.documents.elements import Image


# Load variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Locations of the external tools, read from the environment.
# Either value is None when the corresponding variable is not set.
POPPLER_PATH = os.environ.get("POPPLER_PATH")
TESSERACT_PATH = os.environ.get("TESSERACT_PATH")

In [23]:
# Output root and parameter grid for the unstructured-based chunking runs.
SAVING_PATH = "Data/Chunks/"
CHUNK_SIZES = [1000, 1500, 2000, 3000]
CHUNKING_STRATEGY = ["basic", "by_title"]
MODES = ["naive", "advanced"]

# Only needed in the 2nd round: a hard limit of chunk size + 1000 characters,
# and a combine threshold of two thirds of the chunk size (truncated to int).
MAX_CHARACTERS = [size + 1000 for size in CHUNK_SIZES]
COMBINE_TEXT_UNDER_N_CHARS = [int(size * 2 / 3) for size in CHUNK_SIZES]

# Settings passed to the partition_* calls.
STRATEGY = "hi_res"
LANGUAGES = ["deu"]

In [14]:
def part_pdf(file_path):
    """Partition the PDF at `file_path` and return the result of `partition_pdf`.

    The module-level STRATEGY and LANGUAGES settings are forwarded unchanged.
    """
    pdf_options = {
        "filename": file_path,
        "strategy": STRATEGY,
        "languages": LANGUAGES,
    }
    return partition_pdf(**pdf_options)

def part_docx(file_path):
    """Partition the .docx file at `file_path` and return the result of `partition_docx`.

    The module-level STRATEGY and LANGUAGES settings are forwarded unchanged.
    """
    docx_options = {
        "filename": file_path,
        "strategy": STRATEGY,
        "languages": LANGUAGES,
    }
    return partition_docx(**docx_options)

def part_doc(file_path):
    """Partition the .doc file at `file_path` and return the result of `partition_doc`.

    The module-level STRATEGY and LANGUAGES settings are forwarded unchanged.
    """
    doc_options = {
        "filename": file_path,
        "strategy": STRATEGY,
        "languages": LANGUAGES,
    }
    return partition_doc(**doc_options)

### 2. Basic Chunking

In [8]:
# Overlap per chunk size: one fifth of the chunk size, truncated to an int.
OVERLAP = [int(size / 5) for size in CHUNK_SIZES]

In [29]:
# naive: basic chunking, only `max_characters` and the overlap are set
chunking_strategy = CHUNKING_STRATEGY[0].title()
mode = MODES[0].title()

# Partition function per (lower-cased) file extension.
partitioners = {".pdf": part_pdf, ".docx": part_docx, ".doc": part_doc}

with tqdm(ALL_FILE_PATHS) as iterator:
    for file_path in iterator:
        # basename() uses the platform's path separator; splitting on "\\"
        # only worked on Windows. The extension is kept in the name.
        file_name = os.path.basename(file_path)
        iterator.set_postfix_str(f"Processing {file_name}")

        partition = partitioners.get(os.path.splitext(file_name)[1].lower())
        if partition is None:
            # Unsupported file type: skip it instead of iterating over None.
            continue
        elements = partition(file_path)
        # Image elements are dropped before chunking.
        reduced_elements = [element for element in elements if not isinstance(element, Image)]

        for chunk_size, overlap in zip(CHUNK_SIZES, OVERLAP):
            iterator.set_postfix_str(f"Processing {file_name} - {chunk_size}")

            chunks = chunk_elements(
                elements=reduced_elements,
                max_characters=chunk_size,
                overlap=overlap,
                overlap_all=True,
            )

            # Saving the chunks to <SAVING_PATH>/<strategy>/<mode>/<chunk_size>/
            chunk_path = os.path.join(SAVING_PATH, chunking_strategy, mode, str(chunk_size))
            os.makedirs(chunk_path, exist_ok=True)

            with open(os.path.join(chunk_path, str(chunk_size) + "_" + file_name + ".txt"), "w", encoding="utf-8") as f:
                # Header block describing the run
                f.write("=" * 150)
                f.write("\n")
                f.write(f"file:\t\t\t\t{file_path}\n")
                f.write(f"chunk_size:\t\t\t{chunk_size}\n")
                f.write(f"overlap:\t\t\t{overlap}\n")
                f.write(f"loading_strategy:\t{STRATEGY}\n")
                f.write(f"chunking_strategy:\t{chunking_strategy}\n")
                f.write(f"mode:\t\t\t\t{mode}\n")
                f.write(f"n_chunks:\t\t\t{len(chunks)}\n")
                f.write("=" * 150)
                f.write("\n\n\n\n")
                # One section per chunk; no trailing spacer after the last one
                for i, chunk in enumerate(chunks, 1):
                    f.write(f"Chunk #{i}\n")
                    f.write("-" * 150)
                    f.write("\n")
                    f.write(chunk.text)
                    if i != len(chunks):
                        f.write("\n\n\n\n")

100%|██████████| 17/17 [04:26<00:00, 15.66s/it, Processing zahlungsmodalitaeten.pdf - 3000]                                      


In [33]:
# advanced: basic chunking with a hard limit (`max_characters`) above the
# aimed chunk size (`new_after_n_chars`)
chunking_strategy = CHUNKING_STRATEGY[0].title()
mode = MODES[1].title()

# Partition function per (lower-cased) file extension.
partitioners = {".pdf": part_pdf, ".docx": part_docx, ".doc": part_doc}

with tqdm(ALL_FILE_PATHS) as iterator:
    for file_path in iterator:
        # basename() uses the platform's path separator; splitting on "\\"
        # only worked on Windows. The extension is kept in the name.
        file_name = os.path.basename(file_path)
        iterator.set_postfix_str(f"Processing {file_name}")

        partition = partitioners.get(os.path.splitext(file_name)[1].lower())
        if partition is None:
            # Unsupported file type: skip it instead of iterating over None.
            continue
        elements = partition(file_path)
        # Image elements are dropped before chunking.
        reduced_elements = [element for element in elements if not isinstance(element, Image)]

        for max_characters, chunk_size, overlap in zip(MAX_CHARACTERS, CHUNK_SIZES, OVERLAP):
            iterator.set_postfix_str(f"Processing {file_name} - {chunk_size}")

            chunks = chunk_elements(
                elements=reduced_elements,
                max_characters=max_characters,
                new_after_n_chars=chunk_size,
                overlap=overlap,
                overlap_all=True,
            )

            # Saving the chunks to <SAVING_PATH>/<strategy>/<mode>/<chunk_size>/
            chunk_path = os.path.join(SAVING_PATH, chunking_strategy, mode, str(chunk_size))
            os.makedirs(chunk_path, exist_ok=True)

            with open(os.path.join(chunk_path, str(chunk_size) + "_" + file_name + ".txt"), "w", encoding="utf-8") as f:
                # Header block describing the run
                f.write("=" * 150)
                f.write("\n")
                f.write(f"file:\t\t\t\t{file_path}\n")
                f.write(f"max_characters:\t\t{max_characters}\n")
                f.write(f"aimed_chunk_size:\t{chunk_size}\n")
                f.write(f"overlap:\t\t\t{overlap}\n")
                f.write(f"loading_strategy:\t{STRATEGY}\n")
                f.write(f"chunking_strategy:\t{chunking_strategy}\n")
                f.write(f"mode:\t\t\t\t{mode}\n")
                f.write(f"n_chunks:\t\t\t{len(chunks)}\n")
                f.write("=" * 150)
                f.write("\n\n\n\n")
                # One section per chunk; no trailing spacer after the last one
                for i, chunk in enumerate(chunks, 1):
                    f.write(f"Chunk #{i}\n")
                    f.write("-" * 150)
                    f.write("\n")
                    f.write(chunk.text)
                    if i != len(chunks):
                        f.write("\n\n\n\n")

100%|██████████| 17/17 [05:27<00:00, 19.27s/it, Processing zahlungsmodalitaeten.pdf - 3000]                                      


### 3. Chunking by Title

In [34]:
# naive: chunking by title, only `max_characters` is set
chunking_strategy = CHUNKING_STRATEGY[1].title()
mode = MODES[0].title()

# Partition function per (lower-cased) file extension.
partitioners = {".pdf": part_pdf, ".docx": part_docx, ".doc": part_doc}

with tqdm(ALL_FILE_PATHS) as iterator:
    for file_path in iterator:
        # basename() uses the platform's path separator; splitting on "\\"
        # only worked on Windows. The extension is kept in the name.
        file_name = os.path.basename(file_path)
        iterator.set_postfix_str(f"Processing {file_name}")

        partition = partitioners.get(os.path.splitext(file_name)[1].lower())
        if partition is None:
            # Unsupported file type: skip it instead of iterating over None.
            continue
        elements = partition(file_path)
        # Image elements are dropped before chunking.
        reduced_elements = [element for element in elements if not isinstance(element, Image)]

        for chunk_size in CHUNK_SIZES:
            iterator.set_postfix_str(f"Processing {file_name} - {chunk_size}")

            chunks = chunk_by_title(
                elements=reduced_elements,
                max_characters=chunk_size,
            )

            # Saving the chunks to <SAVING_PATH>/<strategy>/<mode>/<chunk_size>/
            chunk_path = os.path.join(SAVING_PATH, chunking_strategy, mode, str(chunk_size))
            os.makedirs(chunk_path, exist_ok=True)

            with open(os.path.join(chunk_path, str(chunk_size) + "_" + file_name + ".txt"), "w", encoding="utf-8") as f:
                # Header block describing the run
                f.write("=" * 150)
                f.write("\n")
                f.write(f"file:\t\t\t\t{file_path}\n")
                f.write(f"chunk_size:\t\t\t{chunk_size}\n")
                f.write(f"loading_strategy:\t{STRATEGY}\n")
                f.write(f"chunking_strategy:\t{chunking_strategy}\n")
                f.write(f"mode:\t\t\t\t{mode}\n")
                f.write(f"n_chunks:\t\t\t{len(chunks)}\n")
                f.write("=" * 150)
                f.write("\n\n\n\n")
                # One section per chunk; no trailing spacer after the last one
                for i, chunk in enumerate(chunks, 1):
                    f.write(f"Chunk #{i}\n")
                    f.write("-" * 150)
                    f.write("\n")
                    f.write(chunk.text)
                    if i != len(chunks):
                        f.write("\n\n\n\n")

100%|██████████| 17/17 [06:43<00:00, 23.73s/it, Processing zahlungsmodalitaeten.pdf - 3000]                                              


In [37]:
# advanced: chunking by title with a hard limit (`max_characters`) above the
# aimed chunk size (`new_after_n_chars`) and a `combine_text_under_n_chars`
# threshold (this cell was previously mislabelled "naive")
chunking_strategy = CHUNKING_STRATEGY[1].title()
mode = MODES[1].title()

# Partition function per (lower-cased) file extension.
partitioners = {".pdf": part_pdf, ".docx": part_docx, ".doc": part_doc}

with tqdm(ALL_FILE_PATHS) as iterator:
    for file_path in iterator:
        # basename() uses the platform's path separator; splitting on "\\"
        # only worked on Windows. The extension is kept in the name.
        file_name = os.path.basename(file_path)
        iterator.set_postfix_str(f"Processing {file_name}")

        partition = partitioners.get(os.path.splitext(file_name)[1].lower())
        if partition is None:
            # Unsupported file type: skip it instead of iterating over None.
            continue
        elements = partition(file_path)
        # Image elements are dropped before chunking.
        reduced_elements = [element for element in elements if not isinstance(element, Image)]

        for max_characters, chunk_size, combine_under in zip(MAX_CHARACTERS, CHUNK_SIZES, COMBINE_TEXT_UNDER_N_CHARS):
            iterator.set_postfix_str(f"Processing {file_name} - {chunk_size}")

            chunks = chunk_by_title(
                elements=reduced_elements,
                max_characters=max_characters,
                new_after_n_chars=chunk_size,
                combine_text_under_n_chars=combine_under,
            )

            # Saving the chunks to <SAVING_PATH>/<strategy>/<mode>/<chunk_size>/
            chunk_path = os.path.join(SAVING_PATH, chunking_strategy, mode, str(chunk_size))
            os.makedirs(chunk_path, exist_ok=True)

            with open(os.path.join(chunk_path, str(chunk_size) + "_" + file_name + ".txt"), "w", encoding="utf-8") as f:
                # Header block describing the run
                f.write("=" * 150)
                f.write("\n")
                f.write(f"file:\t\t\t\t{file_path}\n")
                f.write(f"max_characters:\t\t{max_characters}\n")
                f.write(f"aimed_chunk_size:\t{chunk_size}\n")
                f.write(f"combine_under:\t\t{combine_under}\n")
                f.write(f"loading_strategy:\t{STRATEGY}\n")
                f.write(f"chunking_strategy:\t{chunking_strategy}\n")
                f.write(f"mode:\t\t\t\t{mode}\n")
                f.write(f"n_chunks:\t\t\t{len(chunks)}\n")
                f.write("=" * 150)
                f.write("\n\n\n\n")
                # One section per chunk; no trailing spacer after the last one
                for i, chunk in enumerate(chunks, 1):
                    f.write(f"Chunk #{i}\n")
                    f.write("-" * 150)
                    f.write("\n")
                    f.write(chunk.text)
                    if i != len(chunks):
                        f.write("\n\n\n\n")

100%|██████████| 17/17 [06:36<00:00, 23.35s/it, Processing zahlungsmodalitaeten.pdf - 3000]                                      
