# Evaluation

## Imports

In [4]:
import os
import datetime
import math
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from tqdm import tqdm
from collections import defaultdict
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from unstructured.partition.docx import partition_docx
from unstructured.partition.doc import partition_doc
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Header, Footer, Image, Table, Title

# Load variables from the nearest .env file (if one is found) into the process environment.
load_dotenv(find_dotenv())

# Tool locations read from the environment; os.getenv returns None when a variable is unset.
POPPLER_PATH = os.getenv("POPPLER_PATH")
TESSERACT_PATH = os.getenv("TESSERACT_PATH")

# Root folder of the documents. It is required below, so fail with a clear message
# instead of the opaque TypeError that os.path.join(None, ...) would raise.
DATA_PATH = os.getenv("DATA_PATH")
if DATA_PATH is None:
    raise RuntimeError("DATA_PATH is not set. Define it in the environment or in a .env file.")
SUB_DATA_SET_PATH = os.path.join(DATA_PATH, "aktive_leistungen", "ark")

- `ALL_FILE_PATHS`: is a list of all **absolute** file paths of the files as **strings**
- `ALL_FILE_NAMES`: is a list of all **file names** of the files as **strings** (without the path, just **name + extension**)

- `STRATEGY`: the strategy that *unstructured.io* uses to extract the text from the files
- `LANGUAGES`: a list of the languages that the OCR model should detect
- `REMOVABLE_IMAGES`: a tuple of unstructured element types that are removed from the extracted elements

In [5]:
# Every file below DATA_PATH whose name contains a dot. Directories that match the
# pattern are dropped, and the result is sorted so the order is the same on every run.
all_file_paths_generator = Path(DATA_PATH).rglob("*.*")
ALL_FILE_PATHS = sorted(str(f) for f in all_file_paths_generator if f.is_file())

# Unique file names (name + extension). Sorted instead of list(set(...)) so that
# ALL_FILE_NAMES[0] refers to the same file after a kernel restart.
# NOTE: the name is taken after the last backslash, i.e. this assumes Windows-style
# paths. The partitioning cell derives its dictionary keys the same way, so both
# places must be changed together.
ALL_FILE_NAMES = sorted({file_path.split("\\")[-1] for file_path in ALL_FILE_PATHS})

STRATEGY = "hi_res"
LANGUAGES = ["deu"]
# Trailing comma makes this a real one-element tuple; `(Image)` is just the class itself.
REMOVABLE_IMAGES = (Image,)

## Partitioning

We are extracting the text from the files with the following strategy:
- `STRATEGY = "hi_res"`: this strategy will identify the layout of the document using `detectron2_onnx`. The advantage of `hi_res` is that it uses the document layout to gain additional information about document elements.
- `LANGUAGES = ["deu"]`: the OCR model is able to detect the following languages: `["deu"]`  

The elements are saved in a dictionary `elements_dict` with the `file_name` as **key** and the `elements` as **value**. The `elements` are a list of different unstructured-types like `Title`, `NarrativeText`, ... .

```json
{
    "file_name": [
        <unstructured.documents.elements.Title at 0x1933c5398d0>,
        <unstructured.documents.elements.NarrativeText at 0x1933c539908>,
        ...
    ],
    ...
}
```

In [6]:
# file name -> list of unstructured elements extracted from that file
elements_dict = defaultdict(list)

# Supported (lower-case) file extensions and the partition function used for each.
PARTITIONERS = {
    ".docx": partition_docx,
    ".doc": partition_doc,
    ".pdf": partition_pdf,
}

with tqdm(ALL_FILE_PATHS) as iterator:
    for file_path in iterator:
        file_name = file_path.split("\\")[-1]   # file_name -> <file name>.<file extension>
        iterator.set_postfix_str(f"Processing... {file_name}")

        # Compare case-insensitively so that e.g. "REPORT.PDF" is not silently skipped.
        lowered_path = file_path.lower()
        partition = next(
            (func for ext, func in PARTITIONERS.items() if lowered_path.endswith(ext)),
            None,
        )
        if partition is None:
            continue    # unsupported file type

        elements = partition(
            filename=file_path,
            strategy=STRATEGY,
            languages=LANGUAGES,
        )

        # Drop every element whose type is listed in REMOVABLE_IMAGES.
        reduced_elements = [element for element in elements if not isinstance(element, REMOVABLE_IMAGES)]
        # Files sharing the same name overwrite each other here: the last one processed wins.
        elements_dict[file_name] = reduced_elements

100%|██████████| 111/111 [28:47<00:00, 15.57s/it, Processing... CELEX_32016R0679_DE_TXT.pdf]                                          


In [8]:
# Peek at the first five elements stored for the first file name.
elements_dict[ALL_FILE_NAMES[0]][:5]

[<unstructured.documents.elements.Title at 0x1808bbf7810>,
 <unstructured.documents.elements.NarrativeText at 0x1808bbf4f10>,
 <unstructured.documents.elements.Title at 0x1808b76e210>,
 <unstructured.documents.elements.NarrativeText at 0x1808bb36cd0>,
 <unstructured.documents.elements.NarrativeText at 0x1808c2b5150>]

## Chunking

Unstructured.io supports two types of chunking:
- `basic`: chunks purely by the number of characters
- `by_title`: chunks by title; because the elements are categorized, titles can be used as chunk boundaries

- `ALL_CHUNK_SIZES`: is a list of all chunk sizes
- `CHUNK_SIZES`: is a subset of `ALL_CHUNK_SIZES`, that are used to chunk the text
- `MAX_CHARACTERS`: is the **hard cap** of characters, that a chunk can have
- `OVERLAP`: is the **overlap** of characters, that a chunk can have
- `COMBINE_TEXT_UNDER_N_CHARS`: is a list of integers, that are used to **combine chunks**, if they are shorter than the integer
- `MODES`: a list of modes, to chunk the text

In [9]:
# Chunking parameters: every derived list is aligned index-by-index with CHUNK_SIZES.


def _scale_sizes(sizes, factor):
    """Multiply every size by ``factor`` and truncate the result to an int."""
    return [int(size * factor) for size in sizes]


ALL_CHUNK_SIZES = [500, 1000, 1300, 1500, 1800, 2000, 2300, 2500, 3000]
CHUNK_SIZES = list(ALL_CHUNK_SIZES)     # independent copy, so it can be narrowed to a subset
MAX_CHARACTERS = _scale_sizes(CHUNK_SIZES, 5 / 3)               # hard cap per chunk size
OVERLAP = _scale_sizes(CHUNK_SIZES, 1 / 5)                      # overlap per chunk size
COMBINE_TEXT_UNDER_N_CHARS = _scale_sizes(CHUNK_SIZES, 2 / 3)   # combine threshold per chunk size
MODES = ["naive", "advanced"]

MAX_CHARACTERS, CHUNK_SIZES, OVERLAP, COMBINE_TEXT_UNDER_N_CHARS

([833, 1666, 2166, 2500, 3000, 3333, 3833, 4166, 5000],
 [500, 1000, 1300, 1500, 1800, 2000, 2300, 2500, 3000],
 [100, 200, 260, 300, 360, 400, 460, 500, 600],
 [333, 666, 866, 1000, 1200, 1333, 1533, 1666, 2000])

### Basic Chunking

In [10]:
# Filled by the next two cells as: mode -> file name -> chunk size -> chunks
basic_chunks = {}
# Filled by the next two cells as: mode -> file name -> chunk size -> {"average", "lengths"}
basic_len = {}

In [11]:
# Naive basic chunking: each chunk size is passed directly as the hard cap (max_characters).

mode = MODES[0]
basic_chunks[mode] = defaultdict(dict)
basic_len[mode] = defaultdict(dict)

with tqdm(ALL_FILE_NAMES) as iterator:
    for file_name in iterator:
        elements = elements_dict[file_name]

        for chunk_size, overlap in zip(CHUNK_SIZES, OVERLAP):
            iterator.set_postfix_str(f"Processing... {file_name} - {chunk_size}")
            chunks = chunk_elements(
                elements=elements,
                max_characters=chunk_size,
                overlap=overlap,
                overlap_all=True,
                include_orig_elements=True,
            )

            lengths = [len(chunk.text) for chunk in chunks]
            # If no chunks were produced, np.mean([]) is NaN and int(NaN) raises
            # ValueError, so report an average length of 0 in that case.
            avg_chunk_length = int(np.mean(lengths)) if lengths else 0

            basic_len[mode][file_name][chunk_size] = {
                "average": avg_chunk_length,
                "lengths": lengths
            }
            basic_chunks[mode][file_name][chunk_size] = chunks

100%|██████████| 108/108 [00:02<00:00, 44.84it/s, Processing... FFP2_Formulierungshilfe_20210517.docx - 3000]                                 


In [12]:
# Advanced basic chunking: the chunk size is the soft target (new_after_n_chars),
# while the matching MAX_CHARACTERS entry is the hard cap.
mode = MODES[1]

basic_chunks[mode] = defaultdict(dict)
basic_len[mode] = defaultdict(dict)

with tqdm(ALL_FILE_NAMES) as iterator:
    for file_name in iterator:
        elements = elements_dict[file_name]

        for max_characters, chunk_size, overlap in zip(MAX_CHARACTERS, CHUNK_SIZES, OVERLAP):
            iterator.set_postfix_str(f"Processing... {file_name} - {chunk_size}")

            chunks = chunk_elements(
                elements=elements,
                max_characters=max_characters,
                new_after_n_chars=chunk_size,
                overlap=overlap,
                overlap_all=True,
                include_orig_elements=True,
            )

            lengths = [len(chunk.text) for chunk in chunks]
            # If no chunks were produced, np.mean([]) is NaN and int(NaN) raises
            # ValueError, so report an average length of 0 in that case.
            avg_chunk_length = int(np.mean(lengths)) if lengths else 0

            # Results are keyed by the aimed chunk size, not by the hard cap.
            basic_len[mode][file_name][chunk_size] = {
                "average": avg_chunk_length,
                "lengths": lengths
            }
            basic_chunks[mode][file_name][chunk_size] = chunks

100%|██████████| 108/108 [00:01<00:00, 56.85it/s, Processing... FFP2_Formulierungshilfe_20210517.docx - 3000]                                 


### Title Chunking

In [13]:
# Filled by the next two cells as: mode -> file name -> chunk size -> chunks
title_chunks = {}
# Filled by the next two cells as: mode -> file name -> chunk size -> {"average", "lengths"}
title_len = {}

In [14]:
# Naive title chunking: each chunk size is passed directly as the hard cap (max_characters).

mode = MODES[0]
title_chunks[mode] = defaultdict(dict)
title_len[mode] = defaultdict(dict)

with tqdm(ALL_FILE_NAMES) as iterator:
    for file_name in iterator:
        elements = elements_dict[file_name]

        for chunk_size in CHUNK_SIZES:
            iterator.set_postfix_str(f"Processing... {file_name} - {chunk_size}")

            chunks = chunk_by_title(
                elements=elements,
                max_characters=chunk_size,
                include_orig_elements=True,
            )

            lengths = [len(chunk.text) for chunk in chunks]
            # If no chunks were produced, np.mean([]) is NaN and int(NaN) raises
            # ValueError, so report an average length of 0 in that case.
            avg_chunk_length = int(np.mean(lengths)) if lengths else 0

            title_len[mode][file_name][chunk_size] = {
                "average": avg_chunk_length,
                "lengths": lengths
            }
            title_chunks[mode][file_name][chunk_size] = chunks

100%|██████████| 108/108 [00:02<00:00, 39.54it/s, Processing... FFP2_Formulierungshilfe_20210517.docx - 3000]                                 


In [15]:
# Advanced title chunking: the chunk size is the soft target (new_after_n_chars), the
# matching MAX_CHARACTERS entry is the hard cap, and the matching
# COMBINE_TEXT_UNDER_N_CHARS entry is passed as combine_text_under_n_chars.

mode = MODES[1]
title_chunks[mode] = defaultdict(dict)
title_len[mode] = defaultdict(dict)

with tqdm(ALL_FILE_NAMES) as iterator:
    for file_name in iterator:
        elements = elements_dict[file_name]

        for max_characters, chunk_size, combine_under in zip(MAX_CHARACTERS, CHUNK_SIZES, COMBINE_TEXT_UNDER_N_CHARS):
            iterator.set_postfix_str(f"Processing... {file_name} - {chunk_size}")

            chunks = chunk_by_title(
                elements=elements,
                max_characters=max_characters,
                new_after_n_chars=chunk_size,
                combine_text_under_n_chars=combine_under,
                include_orig_elements=True,
            )

            lengths = [len(chunk.text) for chunk in chunks]
            # If no chunks were produced, np.mean([]) is NaN and int(NaN) raises
            # ValueError, so report an average length of 0 in that case.
            avg_chunk_length = int(np.mean(lengths)) if lengths else 0

            # Results are keyed by the aimed chunk size, not by the hard cap.
            title_len[mode][file_name][chunk_size] = {
                "average": avg_chunk_length,
                "lengths": lengths
            }
            title_chunks[mode][file_name][chunk_size] = chunks

100%|██████████| 108/108 [00:01<00:00, 54.83it/s, Processing... FFP2_Formulierungshilfe_20210517.docx - 3000]                                 


## Categorization

Let's find out which chunks are **Good**, **Okay** or **Bad**. To decide this, we look at
what each chunk starts and ends with. We can use the **element categories**: every element is categorized as `Title`, `NarrativeText`, ... .   
If we assume:
- **Good** chunk features:
    - **Start**: `Title` or `Header`
    - **End**: Other element
- **Okay** chunk features:
    - **Start**: Other element
    - **End**: Other element
- **Bad** chunk features:
    - **Start**: Any element (including `Title` or `Header`)
    - **End**: `Title` or `Header`

Depending on how well the elements are categorized, these rules can be more or less accurate. It is possible that a text is categorized as `NarrativeText` although it is actually a `Title`, or the other way around, so that an actually good chunk is counted as a bad chunk. The results should therefore not be interpreted as 100% accurate. You can check the bad chunks later on.

In [16]:
def categorize(chunks: dict, bad_end_types=None) -> dict:
    """ Categorize chunks as good, okay or bad and sum the counts up.

    A chunk is classified by the first and last of its original elements:
    - bad:  the last element is an instance of ``bad_end_types``
    - good: otherwise, if the first element is a ``Title`` or ``Header``
    - okay: everything else

    Args:
        chunks (dict): Nested mapping ``mode -> file name -> chunk size -> chunks``.
            Every chunk must expose ``metadata.orig_elements``.
        bad_end_types (tuple, optional): Element types that make a chunk "bad" when
            it ends with one of them. Defaults to ``(Title,)``, which is what this
            function has always checked. The rules written above this cell also
            list ``Header``; pass ``(Title, Header)`` to apply them literally.

    Returns:
        dict: ``mode -> file name -> chunk size -> np.array([good, okay, bad])``,
        plus a ``"total"`` entry per mode holding the sum over all files.

    Example:
    >>> t = categorize(title_chunks)
    >>> t["naive"]["total"]
    {1000: array([ 25, 512, 107]),
     1300: array([ 28, 335,  88]),
     1500: array([ 20, 267,  80])},
    """
    if bad_end_types is None:
        bad_end_types = (Title,)
    good_start_types = (Title, Header)

    categories_dict = {}

    for mode in MODES:
        categories_dict[mode] = defaultdict(dict)

        for file_name in ALL_FILE_NAMES:
            for chunk_size in CHUNK_SIZES:
                categories = np.array([0, 0, 0])    # good, okay, bad

                for c_element in chunks[mode][file_name][chunk_size]:
                    orig_elements = c_element.metadata.orig_elements
                    if not orig_elements:
                        # Nothing to inspect for this chunk, so it is not counted.
                        continue
                    first_element, last_element = orig_elements[0], orig_elements[-1]

                    # The end of the chunk decides first: a chunk with a bad end is
                    # bad regardless of how it starts.
                    if isinstance(last_element, bad_end_types):
                        categories[2] += 1
                    elif isinstance(first_element, good_start_types):
                        categories[0] += 1
                    else:
                        categories[1] += 1

                categories_dict[mode][file_name][chunk_size] = categories
                # The "total" entry is created on first access by the defaultdict.
                totals = categories_dict[mode]["total"]
                totals[chunk_size] = totals.get(chunk_size, np.array([0, 0, 0])) + categories

    return categories_dict


### Basic Categorization

In [17]:
# Good/okay/bad counts for every mode, file and chunk size of the basic chunks.
basic_categories = categorize(basic_chunks)

### Title Categorization

In [18]:
# Good/okay/bad counts for every mode, file and chunk size of the title chunks.
title_categories = categorize(title_chunks)

## Visualization

In [19]:
# Series colors for the Good / Okay / Bad charts (the misspelled name is referenced by the chart functions).
COLORS_CATEGROIES = ["#2ecc71", "#f39c12", "#e74c3c"]
# Series colors used when a chart is drawn with labels other than CATEGORIES_STR.
COLORS_CHUNK_LENGTH = ["#3498db", "#9b59b6", "#e74c3c"]
# Category labels; the order matches the [good, okay, bad] counters built by categorize().
CATEGORIES_STR = ["Good", "Okay", "Bad"]

In [20]:
def save_chart(fig, path, file_name, ext, width=None, height=None, scale=None):
    """Write a figure to ``<path>/<file_name>.<ext>``, creating ``path`` if needed.

    Args:
        fig: Figure object providing ``write_html`` and ``write_image``.
        path (str): Target directory.
        file_name (str): File name without extension.
        ext (str): Output format: "html", "png" or "svg".
        width (int, optional): Width in pixels.
        height (int, optional): Height in pixels.
        scale (float, optional): Scale factor; only used for image export.

    Raises:
        ValueError: If ``ext`` is not a supported format.
    """
    if ext not in ("html", "png", "svg"):
        raise ValueError(f"Unsupported chart format {ext!r}. Use 'html', 'png' or 'svg'.")

    full_path = os.path.join(path, f"{file_name}.{ext}")
    os.makedirs(path, exist_ok=True)

    if ext == "html":
        # write_html does not accept width/height/scale; the size is given through
        # default_width/default_height, which default to "100%".
        fig.write_html(
            full_path,
            default_width="100%" if width is None else width,
            default_height="100%" if height is None else height,
        )
    else:
        fig.write_image(full_path, width=width, height=height, scale=scale)

### Visualize a File

In [52]:
# File shown in the single-file tables and charts below.
file_name = ALL_FILE_NAMES[0]

In [53]:
def create_dataframe_one_file(categories_dict, mode, file_name):
    """Tabulate the category values of one file.

    The mapping stored under ``categories_dict[mode][file_name]`` becomes the
    columns (one per key) and the entries of ``CATEGORIES_STR`` become the row labels.
    """
    per_chunk_size = categories_dict[mode][file_name]
    return pd.DataFrame(per_chunk_size, index=CATEGORIES_STR)


def create_bar_chart_one_file(categories_dict, mode, file_name, method, categories_str=CATEGORIES_STR, x_axis_title=None, y_axis_title=None, save=False):
    """Draw a grouped bar chart for a single file, with one group per chunk size.

    Args:
        categories_dict (dict): Nested mapping ``mode -> file name -> chunk size -> values``,
            where ``values`` holds one number per entry of ``categories_str``.
        mode (str): Chunking mode to plot.
        file_name (str): File to plot.
        method (str): Chunking method label; used in the title and the saved file name.
        categories_str (list): Series names, one bar per name in every group.
        x_axis_title (str, optional): X-axis title; defaults to "Chunk Size".
        y_axis_title (str, optional): Y-axis title; defaults to "Number of Chunks".
        save (str | bool): Falsy to skip saving, otherwise one of 'svg', 'png', 'html'.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If ``save`` is truthy but not one of the supported extensions.
    """
    categories = categories_dict[mode][file_name]
    # After transposing: one row per series, one column per chunk size.
    categories_val = np.array([categories[chunk_size] for chunk_size in CHUNK_SIZES]).T
    colors = COLORS_CATEGROIES if categories_str == CATEGORIES_STR else COLORS_CHUNK_LENGTH
    fig = go.Figure()

    for i in range(len(categories_str)):
        fig.add_trace(go.Bar(
            x=CHUNK_SIZES,
            y=categories_val[i],
            name=categories_str[i],
            text=categories_val[i],
            textposition="outside",
            marker_color=colors[i],
        ))

    fig.update_layout(
        title=f"{method.title()} - {mode.title()} - {file_name}",
        xaxis_title=x_axis_title if x_axis_title else "Chunk Size",
        yaxis_title=y_axis_title if y_axis_title else "Number of Chunks",
        barmode="group",
        uniformtext_minsize=8,
        uniformtext_mode="hide",
        xaxis=dict(
            tickvals=CHUNK_SIZES,
            type="category",
        ),
    )

    if save:
        valid_extensions = ('svg', 'png', 'html')
        # Exact match required: `save` is used both as the file extension and as the
        # output folder name. This also rejects non-string values such as True.
        if save not in valid_extensions:
            raise ValueError(f"Invalid value for save parameter. Use {valid_extensions}.")
        path = f"Data/{save}/"
        width = min(len(CHUNK_SIZES) * 200 + 400, 1200)
        filename = f"{file_name}_bar_{method}_{mode}"
        save_chart(fig, path, filename, save, width=width)

    return fig


def create_pie_chart_one_file(categories_dict, mode, file_name, method, categories_str=CATEGORIES_STR, save=False):
    """Draw one donut chart per chunk size for a single file, arranged in a grid.

    Args:
        categories_dict (dict): Nested mapping ``mode -> file name -> chunk size -> values``,
            where ``values`` holds one number per entry of ``categories_str``.
        mode (str): Chunking mode to plot.
        file_name (str): File to plot.
        method (str): Chunking method label; used in the title and the saved file name.
        categories_str (list): Slice labels.
        save (str | bool): Falsy to skip saving, otherwise one of 'svg', 'png', 'html'.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If ``save`` is truthy but not one of the supported extensions.
    """
    categories = categories_dict[mode][file_name]
    colors = COLORS_CATEGROIES if categories_str == CATEGORIES_STR else COLORS_CHUNK_LENGTH
    fig = go.Figure()

    # Grid layout: at most `max_cols` pies per row.
    max_cols = 5
    n_cols = min(max_cols, len(CHUNK_SIZES))
    n_rows = math.ceil(len(CHUNK_SIZES) / n_cols)

    for i, chunk_size in enumerate(CHUNK_SIZES):
        categories_val = categories[chunk_size]

        row = i // n_cols
        col = i % n_cols

        fig.add_trace(go.Pie(
            title=f"{chunk_size}",
            labels=categories_str,
            values=categories_val,
            name=f"{chunk_size}",
            textinfo="value+percent",
            hole=0.3,
            domain=dict(row=row, column=col),
            marker_colors=colors
        ))

    fig.update_layout(
        title=f"{method.title()} - {mode.title()} - {file_name}",
        grid=dict(rows=n_rows, columns=n_cols),
        uniformtext_minsize=8,
        uniformtext_mode="hide",
        height=n_rows * 200 + 400,
    )

    if save:
        valid_extensions = ('svg', 'png', 'html')
        # Exact match required: `save` is used both as the file extension and as the
        # output folder name. This also rejects non-string values such as True.
        if save not in valid_extensions:
            raise ValueError(f"Invalid value for save parameter. Use {valid_extensions}.")
        path = f"Data/{save}/"
        width = n_cols * 200 + 400
        # "_pie_" keeps this file from overwriting the bar chart saved for the same file.
        filename = f"{file_name}_pie_{method}_{mode}"
        save_chart(fig, path, filename, save, width=width)

    return fig

#### Basic Visualization

In [54]:
# Label for the basic-chunking results; appears in chart titles and saved file names.
method = "basic"

##### Dataframes

In [31]:
# Category counts per chunk size for the selected file (basic chunking, naive mode).
mode = MODES[0]
df_b_n = create_dataframe_one_file(basic_categories, mode, file_name)
df_b_n.style.set_caption(f"{file_name} - {method.title()} - {mode.title()}")

Unnamed: 0,500,1000,1300,1500,1800,2000,2300,2500,3000
Good,0,0,0,0,1,0,0,1,0
Okay,43,14,11,11,7,8,6,5,3
Bad,6,6,5,2,2,1,2,1,2


In [758]:
# Category counts per chunk size for the selected file (basic chunking, advanced mode).
mode = MODES[1]
df_b_a = create_dataframe_one_file(basic_categories, mode, file_name)
df_b_a.style.set_caption(f"{file_name} - {method.title()} - {mode.title()}")

Unnamed: 0,500,1000,1300,1500,1800,2000,2300,2500,3000
Good,0,0,0,0,0,0,0,0,0
Okay,4,3,2,2,2,1,1,1,1
Bad,0,0,0,0,0,0,0,0,0


##### Diagrams

In [55]:
# Bar and pie charts of the category counts for the selected file (basic chunking, naive mode).
mode = MODES[0]
create_bar_chart_one_file(basic_categories, mode, file_name, method).show()
create_pie_chart_one_file(basic_categories, mode, file_name, method).show()

In [56]:
# Same charts for the advanced mode; there the chunk size is the aimed size, hence the axis title.
mode = MODES[1]
create_bar_chart_one_file(basic_categories, mode, file_name, method, x_axis_title="Aimed Chunk Size").show()
create_pie_chart_one_file(basic_categories, mode, file_name, method).show()

In [57]:
# Naive mode: compare the average chunk length with the chunk size, which is also the hard cap here.
mode = MODES[0]
length_labels = ["Average Chunk Length", "Chunk Size", "Hard Cap"]
per_size_lengths = {
    size: [stats["average"], size, size]
    for size, stats in basic_len[mode][file_name].items()
}
avg_len_dict = {mode: {file_name: per_size_lengths}}
create_bar_chart_one_file(avg_len_dict, mode, file_name, method, categories_str=length_labels, y_axis_title="Chunk Length").show()
create_pie_chart_one_file(avg_len_dict, mode, file_name, method, categories_str=length_labels).show()

In [58]:
# Advanced mode: compare the average chunk length with the aimed chunk size and its hard cap.
mode = MODES[1]
length_labels = ["Average Chunk Length", "Aimed Chunk Size", "Hard Cap"]
per_size_lengths = {
    size: [stats["average"], size, MAX_CHARACTERS[CHUNK_SIZES.index(size)]]
    for size, stats in basic_len[mode][file_name].items()
}
avg_len_dict = {mode: {file_name: per_size_lengths}}
create_bar_chart_one_file(avg_len_dict, mode, file_name, method, categories_str=length_labels, y_axis_title="Chunk Length").show()
create_pie_chart_one_file(avg_len_dict, mode, file_name, method, categories_str=length_labels).show()

#### Title Visualization

In [59]:
# Label for the title-chunking results; appears in chart titles and saved file names.
method = "title"

##### Dataframes

In [25]:
# Category counts per chunk size for the selected file (title chunking, naive mode).
mode = MODES[0]
df_t_n = create_dataframe_one_file(title_categories, mode, file_name)
df_t_n.style.set_caption(f"{file_name} - {method.title()} - {mode.title()}")

Unnamed: 0,500,1000,1300,1500,1800,2000,2300,2500,3000
Good,24,29,33,30,27,28,26,24,24
Okay,190,71,38,32,27,18,15,12,9
Bad,15,8,7,6,5,5,5,5,5


In [765]:
# Category counts per chunk size for the selected file (title chunking, advanced mode).
mode = MODES[1]
df_t_a = create_dataframe_one_file(title_categories, mode, file_name)
df_t_a.style.set_caption(f"{file_name} - {method.title()} - {mode.title()}")

Unnamed: 0,500,1000,1300,1500,1800,2000,2300,2500,3000
Good,0,0,0,0,0,0,0,0,0
Okay,4,2,2,2,2,1,1,1,1
Bad,0,0,0,0,0,0,0,0,0


##### Diagrams

In [60]:
# Bar and pie charts of the category counts for the selected file (title chunking, naive mode).
mode = MODES[0]
create_bar_chart_one_file(title_categories, mode, file_name, method).show()
create_pie_chart_one_file(title_categories, mode, file_name, method).show()

In [61]:
# Same charts for the advanced mode; there the chunk size is the aimed size, hence the axis title.
mode = MODES[1]
create_bar_chart_one_file(title_categories, mode, file_name, method, x_axis_title="Aimed Chunk Size").show()
create_pie_chart_one_file(title_categories, mode, file_name, method).show()

In [62]:
# Naive mode: compare the average chunk length with the chunk size, which is also the hard cap here.
mode = MODES[0]
length_labels = ["Average Chunk Length", "Chunk Size", "Hard Cap"]
per_size_lengths = {
    size: [stats["average"], size, size]
    for size, stats in title_len[mode][file_name].items()
}
avg_len_dict = {mode: {file_name: per_size_lengths}}
create_bar_chart_one_file(avg_len_dict, mode, file_name, method, categories_str=length_labels, y_axis_title="Chunk Length").show()
create_pie_chart_one_file(avg_len_dict, mode, file_name, method, categories_str=length_labels).show()

In [63]:
# Advanced mode: compare the average chunk length with the aimed chunk size and its hard cap.
# The second series is the aimed size (new_after_n_chars), so it is labelled
# "Aimed Chunk Size", as in the corresponding basic-chunking cell.
mode = MODES[1]
length_labels = ["Average Chunk Length", "Aimed Chunk Size", "Hard Cap"]
avg_len_dict = {mode: {file_name: {k: [v["average"], k, MAX_CHARACTERS[CHUNK_SIZES.index(k)]] for k, v in title_len[mode][file_name].items()}}}
create_bar_chart_one_file(avg_len_dict, mode, file_name, method, y_axis_title="Chunk Length", x_axis_title="Aimed Chunk Size", categories_str=length_labels).show()
create_pie_chart_one_file(avg_len_dict, mode, file_name, method, categories_str=length_labels).show()

### Visualize a Chunk Size

In [64]:
# Chunk size shown in the per-chunk-size tables and charts below.
chunk_size = CHUNK_SIZES[4]
chunk_size

1800

In [65]:
def create_dataframe_one_chunk_size(categories_dict, mode, chunk_size):
    """Tabulate the category values of every file for one chunk size.

    Returns a DataFrame with one row per entry of ``ALL_FILE_NAMES`` and one
    column per entry of ``CATEGORIES_STR``.
    """
    per_file = {}
    for name in ALL_FILE_NAMES:
        per_file[name] = categories_dict[mode][name][chunk_size]

    # Built with the categories as rows, then flipped so that the files become the rows.
    by_category = pd.DataFrame(per_file, index=CATEGORIES_STR)
    return by_category.T


def create_bar_chart_one_chunk_size(categories_dict, mode, chunk_size, method, categories_str=CATEGORIES_STR, x_axis_title=None, y_axis_title=None, save=False):
    """Draw a grouped bar chart for a single chunk size, with one group per file.

    Files are shown on the x-axis by their 1-based position in ``ALL_FILE_NAMES``;
    the file name itself is available as hover text.

    Args:
        categories_dict (dict): Nested mapping ``mode -> file name -> chunk size -> values``,
            where ``values`` holds one number per entry of ``categories_str``.
        mode (str): Chunking mode to plot.
        chunk_size (int): Chunk size to plot.
        method (str): Chunking method label; used in the title and the saved file name.
        categories_str (list): Series names, one bar per name in every group.
        x_axis_title (str, optional): X-axis title; defaults to "File Number".
        y_axis_title (str, optional): Y-axis title; defaults to "Number of Chunks".
        save (str | bool): Falsy to skip saving, otherwise one of 'svg', 'png', 'html'.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If ``save`` is truthy but not one of the supported extensions.
    """
    # After transposing: one row per series, one column per file.
    categories_val = np.array([categories_dict[mode][file_name][chunk_size] for file_name in ALL_FILE_NAMES]).T
    colors = COLORS_CATEGROIES if categories_str == CATEGORIES_STR else COLORS_CHUNK_LENGTH
    fig = go.Figure()

    file_positions = list(range(len(ALL_FILE_NAMES)))

    for i in range(len(categories_str)):
        fig.add_trace(go.Bar(
            x=file_positions,
            y=categories_val[i],
            name=categories_str[i],
            text=categories_val[i],
            textposition="outside",
            marker_color=colors[i],
            hovertext=ALL_FILE_NAMES
        ))

    fig.update_layout(
        title=f"{method.title()} - {mode.title()} - {chunk_size}",
        # The x-axis enumerates files here, not chunk sizes.
        xaxis_title=x_axis_title if x_axis_title else "File Number",
        yaxis_title=y_axis_title if y_axis_title else "Number of Chunks",
        barmode="group",
        uniformtext_minsize=8,
        uniformtext_mode="hide",
        xaxis=dict(
            tickvals=file_positions,
            ticktext=[position + 1 for position in file_positions],
            type="category",
        ),
    )

    if save:
        valid_extensions = ('svg', 'png', 'html')
        # Exact match required: `save` is used both as the file extension and as the
        # output folder name. This also rejects non-string values such as True.
        if save not in valid_extensions:
            raise ValueError(f"Invalid value for save parameter. Use {valid_extensions}.")
        path = f"Data/{save}/"
        width = min(len(ALL_FILE_NAMES) * 200 + 400, 1200)
        filename = f"{chunk_size}_bar_{method}_{mode}_{len(ALL_FILE_NAMES)}_files"
        save_chart(fig, path, filename, save, width=width)

    return fig


def create_pie_chart_one_chunk_size(categories_dict, mode, chunk_size, method, categories_str=CATEGORIES_STR, save=False):
    """Draw one donut chart per file for a single chunk size, arranged in a grid.

    Each pie is titled with the file's 1-based position in ``ALL_FILE_NAMES`` and
    carries the file name as its trace name.

    Args:
        categories_dict (dict): Nested mapping ``mode -> file name -> chunk size -> values``,
            where ``values`` holds one number per entry of ``categories_str``.
        mode (str): Chunking mode to plot.
        chunk_size (int): Chunk size to plot.
        method (str): Chunking method label; used in the title and the saved file name.
        categories_str (list): Slice labels.
        save (str | bool): Falsy to skip saving, otherwise one of 'svg', 'png', 'html'.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If ``save`` is truthy but not one of the supported extensions.
    """
    colors = COLORS_CATEGROIES if categories_str == CATEGORIES_STR else COLORS_CHUNK_LENGTH
    fig = go.Figure()

    # Grid layout: at most `max_cols` pies per row.
    max_cols = 5
    n_cols = min(max_cols, len(ALL_FILE_NAMES))
    n_rows = math.ceil(len(ALL_FILE_NAMES) / n_cols)

    for i, file_name in enumerate(ALL_FILE_NAMES):
        categories_val = categories_dict[mode][file_name][chunk_size]

        row = i // n_cols
        col = i % n_cols

        fig.add_trace(go.Pie(
            title=f"{i+1}",
            labels=categories_str,
            values=categories_val,
            name=f"{file_name}",
            hole=0.3,
            textinfo="value+percent",
            domain=dict(row=row, column=col),
            marker_colors=colors
        ))

    fig.update_layout(
        title=f"{method.title()} - {mode.title()} - {chunk_size}",
        grid=dict(rows=n_rows, columns=n_cols),
        uniformtext_minsize=8,
        uniformtext_mode="hide",
        height=n_rows * 200 + 400,
    )

    if save:
        valid_extensions = ('svg', 'png', 'html')
        # Exact match required: `save` is used both as the file extension and as the
        # output folder name. This also rejects non-string values such as True.
        if save not in valid_extensions:
            raise ValueError(f"Invalid value for save parameter. Use {valid_extensions}.")
        path = f"Data/{save}/"
        width = n_cols * 200 + 400
        # "_pie_" keeps this file from overwriting the bar chart saved for the same chunk size.
        filename = f"{chunk_size}_pie_{method}_{mode}_{len(ALL_FILE_NAMES)}_files"
        save_chart(fig, path, filename, save, width=width)

    return fig

#### Basic Visualization

In [772]:
# Label for the basic-chunking results (the cells of this section are currently commented out).
method = "basic"

##### Dataframes

In [804]:
# mode = MODES[0]
# df_b_n = create_dataframe_one_chunk_size(basic_categories, mode, chunk_size)
# df_b_n.style.set_caption(f"{chunk_size} - {method.title()} - {mode.title()}")

In [805]:
# mode = MODES[1]
# df_b_n = create_dataframe_one_chunk_size(basic_categories, mode, chunk_size)
# df_b_n.style.set_caption(f"{chunk_size} - {method.title()} - {mode.title()}")

##### Diagrams

In [803]:
# mode = MODES[0]
# create_bar_chart_one_chunk_size(basic_categories, mode, chunk_size, method).show()
# create_pie_chart_one_chunk_size(basic_categories, mode, chunk_size, method).show()

In [802]:
# mode = MODES[1]
# create_bar_chart_one_chunk_size(basic_categories, mode, chunk_size, method).show()
# create_pie_chart_one_chunk_size(basic_categories, mode, chunk_size, method).show()

In [801]:
# mode = MODES[0]
# avg_len_dict = defaultdict(dict)
# for file_name in ALL_FILE_NAMES:
#     avg_len_dict[mode][file_name] = {k: [v["average"], chunk_size, chunk_size] for k, v in basic_len[mode][file_name].items()}

# create_bar_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Chunk Size", "Hard Cap"], y_axis_title="Chunk Length").show()
# create_pie_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Chunk Size", "Hard Cap"]).show()

In [800]:
# mode = MODES[1]
# avg_len_dict = defaultdict(dict)
# for file_name in ALL_FILE_NAMES:
#     avg_len_dict[mode][file_name] = {k: [v["average"], chunk_size, MAX_CHARACTERS[CHUNK_SIZES.index(chunk_size)]] for k, v in basic_len[mode][file_name].items()}

# create_bar_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Aimed Chunk Size", "Hard Cap"], y_axis_title="Chunk Length").show()
# create_pie_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Aimed Chunk Size", "Hard Cap"]).show()

#### Title Visualization

In [779]:
# Label for the title-chunking results (the cells of this section are currently commented out).
method = "title"

##### Dataframes

In [798]:
# mode = MODES[0]
# df_b_n = create_dataframe_one_chunk_size(title_categories, mode, chunk_size)
# df_b_n.style.set_caption(f"{chunk_size} - {method.title()} - {mode.title()}")

In [799]:
# mode = MODES[1]
# df_b_n = create_dataframe_one_chunk_size(title_categories, mode, chunk_size)
# df_b_n.style.set_caption(f"{chunk_size} - {method.title()} - {mode.title()}")

##### Diagrams

In [34]:
# mode = MODES[0]
# create_bar_chart_one_chunk_size(title_categories, mode, chunk_size, method).show()
# create_pie_chart_one_chunk_size(title_categories, mode, chunk_size, method).show()

In [796]:
# mode = MODES[1]
# create_bar_chart_one_chunk_size(title_categories, mode, chunk_size, method).show()
# create_pie_chart_one_chunk_size(title_categories, mode, chunk_size, method).show()

In [36]:
# mode = MODES[0]
# avg_len_dict = defaultdict(dict)
# for file_name in ALL_FILE_NAMES:
#     avg_len_dict[mode][file_name] = {k: [v["average"], chunk_size, chunk_size] for k, v in title_len[mode][file_name].items()}

# create_bar_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Chunk Size", "Hard Cap"], y_axis_title="Chunk Length").show()
# create_pie_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Chunk Size", "Hard Cap"]).show()

In [38]:
# mode = MODES[1]
# avg_len_dict = defaultdict(dict)
# for file_name in ALL_FILE_NAMES:
#     avg_len_dict[mode][file_name] = {k: [v["average"], chunk_size, MAX_CHARACTERS[CHUNK_SIZES.index(chunk_size)]] for k, v in title_len[mode][file_name].items()}

# create_bar_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Aimed Chunk Size", "Hard Cap"], y_axis_title="Chunk Length").show()
# create_pie_chart_one_chunk_size(avg_len_dict, mode, chunk_size, method, categories_str=["Average Chunk Length", "Aimed Chunk Size", "Hard Cap"]).show()

### All in One

In [21]:
# Export format for the summary charts; it is also used as the sub-folder name below "Data/".
save = "svg"

In [22]:
def create_bar_chart_all_files_all_chunk_sizes(categories_dict, mode, method, categories_str=CATEGORIES_STR, x_axis_title=None, y_axis_title=None, save=False):
    """Draw a grouped bar chart of the totals over all files, with one group per chunk size.

    Args:
        categories_dict (dict): Nested mapping whose ``categories_dict[mode]["total"]``
            entry maps every chunk size to one number per entry of ``categories_str``.
        mode (str): Chunking mode to plot.
        method (str): Chunking method label; used in the title and the saved file name.
        categories_str (list): Series names, one bar per name in every group.
        x_axis_title (str, optional): X-axis title; defaults to "Chunk Size".
        y_axis_title (str, optional): Y-axis title; defaults to "Number of Chunks".
        save (str | bool): Falsy to skip saving, otherwise one of 'svg', 'png', 'html'.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If ``save`` is truthy but not one of the supported extensions.
    """
    # After transposing: one row per series, one column per chunk size.
    categories_val = np.array([categories_dict[mode]["total"][chunk_size] for chunk_size in CHUNK_SIZES]).T
    colors = COLORS_CATEGROIES if categories_str == CATEGORIES_STR else COLORS_CHUNK_LENGTH
    fig = go.Figure()

    for i in range(len(categories_str)):
        fig.add_trace(go.Bar(
            x=CHUNK_SIZES,
            y=categories_val[i],
            name=categories_str[i],
            text=categories_val[i],
            textposition="outside",
            marker_color=colors[i],
        ))

    fig.update_layout(
        title=f"{len(ALL_FILE_NAMES)} Files - {method.title()} - {mode.title()}",
        xaxis_title=x_axis_title if x_axis_title else "Chunk Size",
        yaxis_title=y_axis_title if y_axis_title else "Number of Chunks",
        barmode="group",
        uniformtext_minsize=8,
        uniformtext_mode="hide",
        xaxis=dict(
            tickvals=CHUNK_SIZES,
            type="category",
        ),
    )

    if save:
        valid_extensions = ('svg', 'png', 'html')
        # Exact match required: `save` is used both as the file extension and as the
        # output folder name. This also rejects non-string values such as True.
        if save not in valid_extensions:
            raise ValueError(f"Invalid value for save parameter. Use {valid_extensions}.")
        path = f"Data/{save}/"
        width = min(len(CHUNK_SIZES) * 200 + 400, 1200)
        filename = f"total_bar_{method}_{mode}_{len(ALL_FILE_NAMES)}_files"
        save_chart(fig, path, filename, save, width=width)

    return fig


def create_pie_chart_all_files_all_chunk_sizes(categories_dict, mode, method, categories_str=CATEGORIES_STR, save=False):
    """Draw one donut chart per chunk size from the totals over all files, arranged in a grid.

    Args:
        categories_dict (dict): Nested mapping whose ``categories_dict[mode]["total"]``
            entry maps every chunk size to one number per entry of ``categories_str``.
        mode (str): Chunking mode to plot.
        method (str): Chunking method label; used in the title and the saved file name.
        categories_str (list): Slice labels.
        save (str | bool): Falsy to skip saving, otherwise one of 'svg', 'png', 'html'.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If ``save`` is truthy but not one of the supported extensions.
    """
    colors = COLORS_CATEGROIES if categories_str == CATEGORIES_STR else COLORS_CHUNK_LENGTH
    fig = go.Figure()

    # Grid layout: at most `max_cols` pies per row. There is one pie per chunk size,
    # so the column count depends on the number of chunk sizes, not on the number of files.
    max_cols = 5
    n_cols = min(max_cols, len(CHUNK_SIZES))
    n_rows = math.ceil(len(CHUNK_SIZES) / n_cols)

    for i, chunk_size in enumerate(CHUNK_SIZES):
        categories_val = categories_dict[mode]["total"][chunk_size]

        row = i // n_cols
        col = i % n_cols

        fig.add_trace(go.Pie(
            title=f"{chunk_size}",
            labels=categories_str,
            values=categories_val,
            name=f"{chunk_size}",
            textinfo="value+percent",
            hole=0.3,
            domain=dict(row=row, column=col),
            marker_colors=colors
        ))

    fig.update_layout(
        title=f"{len(ALL_FILE_NAMES)} Files - {method.title()} - {mode.title()}",
        grid=dict(rows=n_rows, columns=n_cols),
        uniformtext_minsize=8,
        uniformtext_mode="hide",
        height=n_rows * 200 + 400,
    )

    if save:
        valid_extensions = ('svg', 'png', 'html')
        # Exact match required: `save` is used both as the file extension and as the
        # output folder name. This also rejects non-string values such as True.
        if save not in valid_extensions:
            raise ValueError(f"Invalid value for save parameter. Use {valid_extensions}.")
        path = f"Data/{save}/"
        width = n_cols * 200 + 400
        filename = f"total_pie_{method}_{mode}_{len(ALL_FILE_NAMES)}_files"
        save_chart(fig, path, filename, save, width=width)

    return fig

#### Basic Visualization

In [23]:
# Label for the basic-chunking results; appears in chart titles and saved file names.
method = "basic"

In [24]:
# Totals over all files for basic chunking in naive mode; the charts are shown and saved.
mode = MODES[0]
create_bar_chart_all_files_all_chunk_sizes(basic_categories, mode, method, save=save).show()
create_pie_chart_all_files_all_chunk_sizes(basic_categories, mode, method, save=save).show()

In [25]:
# Totals over all files for basic chunking in advanced mode; the charts are shown and saved.
# In advanced mode the chunk size is only the aimed size, so the x-axis is labelled
# accordingly, as in the other advanced-mode cells.
mode = MODES[1]
create_bar_chart_all_files_all_chunk_sizes(basic_categories, mode, method, x_axis_title="Aimed Chunk Size", save=save).show()
create_pie_chart_all_files_all_chunk_sizes(basic_categories, mode, method, save=save).show()

#### Title Visualization

In [26]:
# Label for the title-chunking results; appears in chart titles and saved file names.
method = "title"

In [27]:
# Totals over all files for title chunking in naive mode; the charts are shown and saved.
mode = MODES[0]
create_bar_chart_all_files_all_chunk_sizes(title_categories, mode, method, save=save).show()
create_pie_chart_all_files_all_chunk_sizes(title_categories, mode, method, save=save).show()

In [28]:
# Totals over all files for title chunking in advanced mode; the charts are shown and saved.
mode = MODES[1]
create_bar_chart_all_files_all_chunk_sizes(title_categories, mode, method, x_axis_title="Aimed Chunk Size", save=save).show()
create_pie_chart_all_files_all_chunk_sizes(title_categories, mode, method, save=save).show()