In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


# Put the parent of the current working directory first on the import path so
# modules located there take precedence when imported from this notebook.
# NOTE: '..' is resolved against the working directory at execution time.
sys.path.insert(0, os.path.abspath('..'))
# Patch asyncio so event loops can be re-entered; the notebook kernel already
# runs a loop, which otherwise prevents nested `run_until_complete` calls.
nest_asyncio.apply()

In [2]:
import logging


# Configure the root logger once for the whole notebook: emit INFO and above,
# formatted as "<timestamp> - <level> - <message>".
# NOTE: basicConfig is a no-op if the root logger already has handlers, so
# re-running this cell in a live kernel does not change an existing setup.
logging.basicConfig(
    level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s'
)

In [None]:
def chunk(
    text: str, positions: list[int], lengths: list[int], max_window_size: int = 200
) -> list[str]:
    """Cut ``text`` into entity-aligned windows of at most ``max_window_size`` chars.

    Entity ``i`` occupies ``text[positions[i] : positions[i] + lengths[i]]``.
    The first chunk greedily covers as many leading entities as fit in one
    window; after that, one chunk is emitted per remaining entity, each ending
    at that entity's end and reaching back over as many preceding entities as
    still fit in the window.

    The first window starts at offset 0 of ``text`` when the first entity ends
    within ``max_window_size`` characters of the text start; otherwise it
    starts at the first entity itself, so that the entity is always covered.

    Args:
        text: The source string to slice.
        positions: Start offset of each entity in ``text``. Assumed to be
            sorted in ascending order (not verified here).
        lengths: Length of each entity, parallel to ``positions``.
        max_window_size: Maximum number of characters per chunk.

    Returns:
        The list of chunk strings, or an empty list when there are no entities.

    Raises:
        ValueError: If ``positions`` and ``lengths`` differ in size, if any
            single entity is longer than ``max_window_size``, or if the gap
            between two consecutive entities (outside the first window)
            exceeds ``max_window_size``.
    """
    if len(positions) != len(lengths):
        raise ValueError(
            'positions and lengths must have the same number of items, '
            f'got {len(positions)} and {len(lengths)}'
        )

    n = len(positions)

    # Nothing to align windows on: no entities means no chunks.
    if n == 0:
        return []

    # An entity that is larger than the window can never be placed in a chunk,
    # wherever it occurs, so validate all of them up front.
    for position, length in zip(positions, lengths):
        if length > max_window_size:
            raise ValueError(
                f'Entity at position {position} has length {length}, '
                f'which exceeds max_window_size={max_window_size}'
            )

    # --- 1) Build the first window ---
    # Prefer anchoring at the start of the text so leading context is kept.
    # If the first entity would not fit that way, anchor on the entity itself;
    # this guarantees the first window always contains at least entity 0.
    first_entity_end = positions[0] + lengths[0]
    anchor = 0 if first_entity_end <= max_window_size else positions[0]

    # Greedily expand until adding the next entity would overflow.
    end_index = 1
    while (
        end_index < n
        and positions[end_index] + lengths[end_index] - anchor <= max_window_size
    ):
        end_index += 1

    # Emit the first chunk (end_index >= 1, so this never wraps around).
    last_in_first = end_index - 1
    chunks: list[str] = [
        text[anchor : positions[last_in_first] + lengths[last_in_first]]
    ]

    # --- 2) Slide the window over each subsequent entity ---
    start_index = 0

    for current_index in range(end_index, n):
        new_start = positions[current_index]
        last_end = positions[current_index - 1] + lengths[current_index - 1]

        # Gap check
        gap = new_start - last_end

        if gap > max_window_size:
            raise ValueError(
                f'Gap of {gap} chars between entity ending at {last_end} '
                f'and next entity at {new_start} exceeds max_window_size={max_window_size}'
            )

        candidate_end = new_start + lengths[current_index]

        # Slide off old entities until the new one fits. The window start for
        # entity 0 is the anchor chosen above; for any other entity it is the
        # entity's own position.
        while start_index < current_index:
            window_start = positions[start_index] if start_index > 0 else anchor

            if candidate_end - window_start <= max_window_size:
                break
            start_index += 1

        window_start = positions[start_index] if start_index > 0 else anchor
        chunks.append(text[window_start:candidate_end])

    return chunks