From b3a64097f798e8048c345cfc316750ca1f0a00e3 Mon Sep 17 00:00:00 2001 From: "nathaphat t." <76417777+putt-t@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:19:08 -0700 Subject: [PATCH] chunk fixes - based on chonkie's new documentation --- examples/chunking/chunking.ipynb | 144 +++++++++---------------------- helix/chunk.py | 84 +++++------------- pyproject.toml | 2 +- 3 files changed, 63 insertions(+), 167 deletions(-) diff --git a/examples/chunking/chunking.ipynb b/examples/chunking/chunking.ipynb index a3e2da4..4e281b1 100644 --- a/examples/chunking/chunking.ipynb +++ b/examples/chunking/chunking.ipynb @@ -30,7 +30,16 @@ "execution_count": 1, "id": "2b89d611", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/putt/Documents/Github/helix-py/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import helix" ] @@ -168,7 +177,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 2/2 batches chunked [00:00<00:00, 19599.55batch/s] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 2/2 batches chunked [00:00<00:00, 19784.45batch/s] 🌱\n" ] }, { @@ -228,7 +237,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 27.60doc/s] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 29.97doc/s] 🌱\n" ] }, { @@ -288,7 +297,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 25.25doc/s] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 30.42doc/s] 🌱\n" ] }, { @@ -348,7 +357,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 7717.21doc/s] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 9998.34doc/s] 🌱\n" ] }, { @@ -386,20 +395,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/putt/Documents/Github/helix-py/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "/Users/putt/Documents/Github/helix-py/.venv/lib/python3.11/site-packages/chonkie/embeddings/model2vec.py:63: RuntimeWarning: invalid value encountered in divide\n", + " return np.divide(\n" ] }, { "data": { "text/plain": [ - "['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.',\n", - " '\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. 
For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text.',\n", + "['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization.',\n", + " ' Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text.',\n", " ' This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers.',\n", - " ' Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.',\n", - " '\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications.',\n", - " ' The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing.',\n", - " ' By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']" + " ' Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications. The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing. By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']" ] }, "execution_count": 13, @@ -422,7 +428,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 2894.62doc/s] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 6432.98doc/s] 🌱\n" ] }, { @@ -442,72 +448,6 @@ "batch_chunks" ] }, - { - "cell_type": "markdown", - "id": "4363d271", - "metadata": {}, - "source": [ - "### SDPM Chunker" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d2e2b2b8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. 
When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.',\n", - " '\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text.',\n", - " ' This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers.',\n", - " ' Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.',\n", - " '\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications.',\n", - " ' The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing.',\n", - " ' By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chunks = helix.Chunk.sdp_chunk(massive_text_blob)\n", - "chunks" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "6a49d6f3", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 3889.02doc/s] 🌱\n" - ] - }, - { - "data": { - "text/plain": [ - "['First document to chunk with some content for testing.',\n", - " 'Second document with different content for batch processing.']" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "batch_chunks = helix.Chunk.sdp_chunk(texts)\n", - "batch_chunks" - ] - }, { "cell_type": "markdown", "id": "f74c8cab", @@ -518,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "id": "ad30713e", "metadata": {}, "outputs": [ @@ -535,7 +475,7 @@ "['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text. This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers. 
Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications. The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing. By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']" ] }, - "execution_count": 17, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -547,7 +487,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "id": "065c65e7", "metadata": {}, "outputs": [ @@ -555,7 +495,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 22.21doc/s] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 25.55doc/s] 🌱\n" ] }, { @@ -565,7 +505,7 @@ " 'Second document with different content for batch processing.']" ] }, - "execution_count": 18, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -585,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "id": "76033b2d", "metadata": {}, "outputs": [ @@ -609,7 +549,7 @@ " '. By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -621,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "id": "bacacfac", "metadata": {}, "outputs": [ @@ -630,7 +570,7 @@ "output_type": "stream", "text": [ "Device set to use cpu\n", - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 13.79doc/s] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 16.61doc/s] 🌱\n" ] }, { @@ -640,7 +580,7 @@ " 'Second document with different content for batch processing.']" ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -668,7 +608,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "id": "36fdf7a8", "metadata": {}, "outputs": [ @@ -678,7 +618,7 @@ "True" ] }, - "execution_count": 21, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -690,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "3f3fdc9e", "metadata": {}, "outputs": [ @@ -698,18 +638,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 36/36 splits processed [00:53<00:00, 1.49s/split] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 36/36 splits processed [00:24<00:00, 1.46split/s] 🌱\n" ] }, { "data": { "text/plain": [ - "['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. 
Itcontainsmultiplesentencesandparagraphsthatneedtobedividedappropriatelytomaintaincontextwhilefittingwithintokenlimits.When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. ',\n", - " 'For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text. This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers. Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n',\n", + "['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. Itcontainsmultiplesentencesandparagraphsthatneedtobedividedappropriatelytomaintaincontextwhilefittingwithintokenlimits.When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text. This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers. ',\n", + " 'Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n',\n", " '\\nThisexampledemonstrateshowthetokenchunkerworkswitharealistictextsamplethatwouldbecommonindocumentprocessingandRAG(Retrieval-Augmented Generation) applications. Thechunkswillbecreatedwithspecifiedtokenlimitsandoverlapsettingstooptimizeforbothcomprehensionandprocessingefficiency.Each chunk will contain metadata about its position in the original text and token count for further processing. 
By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']" ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -721,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "6cf1fdff", "metadata": {}, "outputs": [ @@ -729,9 +669,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.80s/split] 🌱\n", - "🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.17s/split] 🌱\n", - "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:10<00:00, 5.49s/doc] 🌱\n" + "🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.79s/split] 🌱\n", + "🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.71s/split] 🌱\n", + "🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:11<00:00, 5.75s/doc] 🌱\n" ] }, { @@ -741,7 +681,7 @@ " 'Second document with different content for batch processing.']" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } diff --git a/helix/chunk.py b/helix/chunk.py index 7489b83..85b5396 100644 --- a/helix/chunk.py +++ b/helix/chunk.py @@ -1,4 +1,4 @@ -from chonkie import TokenChunker, SentenceChunker, RecursiveChunker, RecursiveRules, CodeChunker, SemanticChunker, SDPMChunker, LateChunker, NeuralChunker, SlumberChunker +from chonkie import TokenChunker, SentenceChunker, RecursiveChunker, RecursiveRules, CodeChunker, SemanticChunker, LateChunker, NeuralChunker, SlumberChunker from chonkie.genie import GeminiGenie from typing import List, Optional, Union, Any from tokenizers import Tokenizer @@ -142,28 +142,30 @@ def code_chunk(text: Union[str, List[str]], language: str, tokenizer: str = "cha # this is for chonkie semantic chunker @staticmethod def semantic_chunk(text: Union[str, List[str]], embedding_model: str = "minishlab/potion-base-8M", - threshold: Union[float, int, str] = "auto", chunk_size: int = 2048, - mode: str = "window", min_sentences: int = 1, similarity_window: int = 1, - min_chunk_size: int = 2, min_characters_per_sentence: int = 12, - threshold_step: float = 0.01, + threshold: float = 0.8, chunk_size: int = 2048, + similarity_window: int = 3, min_sentences_per_chunk: int = 1, + min_characters_per_sentence: int = 24, skip_window: int = 0, + filter_window: int = 5, filter_polyorder: int = 3, filter_tolerance: float = 0.2, delim: Union[str, List[str]] = ['.', '!', '?', '\n'], - include_delim: Optional[str] = "prev") -> Union[List[str], List[List[str]]]: + include_delim: Optional[str] = "prev", **embedding_kwargs) -> Union[List[str], List[List[str]]]: """ Chunk text based on semantic similarity between sentences. Args: text (Union[str, List[str]]): Text to chunk (single string or list of strings). embedding_model (str, optional): Model to use for embeddings. Defaults to "minishlab/potion-base-8M". - threshold (Union[float, int, str], optional): Similarity threshold for chunking. Defaults to "auto". + threshold (float, optional): Similarity threshold for chunking (0-1). Defaults to 0.8. chunk_size (int, optional): Maximum size of each chunk in tokens. Defaults to 2048. - mode (str, optional): Chunking mode ("window" or "cluster"). Defaults to "window". - min_sentences (int, optional): Minimum sentences per chunk. Defaults to 1. 
- similarity_window (int, optional): Window size for similarity calculation. Defaults to 1. - min_chunk_size (int, optional): Minimum number of sentences per chunk. Defaults to 2. - min_characters_per_sentence (int, optional): Minimum characters per sentence. Defaults to 12. - threshold_step (float, optional): Step size for threshold adjustment. Defaults to 0.01. + similarity_window (int, optional): Window size for similarity calculation. Defaults to 3. + min_sentences_per_chunk (int, optional): Minimum sentences per chunk. Defaults to 1. + min_characters_per_sentence (int, optional): Minimum characters per sentence. Defaults to 24. + skip_window (int, optional): Number of groups to skip when merging similar content. Defaults to 0. + filter_window (int, optional): Window length for Savitzky-Golay filter. Defaults to 5. + filter_polyorder (int, optional): Polynomial order for Savitzky-Golay filter. Defaults to 3. + filter_tolerance (float, optional): Tolerance for filter boundary detection. Defaults to 0.2. delim (Union[str, List[str]], optional): Sentence delimiters. Defaults to ['.', '!', '?', '\n']. include_delim (Optional[str], optional): How to include delimiters ("prev", "next", None). Defaults to "prev". + **embedding_kwargs: Additional keyword arguments for the embedding model. Returns: Union[List[str], List[List[str]]]: List of semantically coherent text chunks. @@ -172,65 +174,19 @@ def semantic_chunk(text: Union[str, List[str]], embedding_model: str = "minishla embedding_model=embedding_model, threshold=threshold, chunk_size=chunk_size, - mode=mode, - min_sentences=min_sentences, similarity_window=similarity_window, - min_chunk_size=min_chunk_size, + min_sentences_per_chunk=min_sentences_per_chunk, min_characters_per_sentence=min_characters_per_sentence, - threshold_step=threshold_step, + skip_window=skip_window, + filter_window=filter_window, + filter_polyorder=filter_polyorder, + filter_tolerance=filter_tolerance, delim=delim, include_delim=include_delim ) return Chunk._process_chunks(chunker, text) - # this is for chonkie SDPM chunker - @staticmethod - def sdp_chunk(text: Union[str, List[str]], embedding_model: str = "minishlab/potion-base-8M", - threshold: Union[float, int, str] = "auto", chunk_size: int = 2048, - mode: str = "window", min_sentences: int = 1, similarity_window: int = 1, - min_chunk_size: int = 2, min_characters_per_sentence: int = 12, - threshold_step: float = 0.01, - delim: Union[str, List[str]] = ['.', '!', '?', '\n'], - include_delim: Optional[str] = "prev", - skip_window: int = 1) -> Union[List[str], List[List[str]]]: - """ - Chunk text using the Skip-Distance Proximity Method (SDPM) for enhanced semantic chunking. - - Args: - text (Union[str, List[str]]): Text to chunk (single string or list of strings). - embedding_model (str, optional): Model to use for embeddings. Defaults to "minishlab/potion-base-8M". - threshold (Union[float, int, str], optional): Similarity threshold for chunking. Defaults to "auto". - chunk_size (int, optional): Maximum size of each chunk in tokens. Defaults to 2048. - mode (str, optional): Chunking mode ("window" or "cluster"). Defaults to "window". - min_sentences (int, optional): Minimum sentences per chunk. Defaults to 1. - similarity_window (int, optional): Window size for similarity calculation. Defaults to 1. - min_chunk_size (int, optional): Minimum number of sentences per chunk. Defaults to 2. - min_characters_per_sentence (int, optional): Minimum characters per sentence. Defaults to 12. 
- threshold_step (float, optional): Step size for threshold adjustment. Defaults to 0.01. - delim (Union[str, List[str]], optional): Sentence delimiters. Defaults to ['.', '!', '?', '\n']. - include_delim (Optional[str], optional): How to include delimiters ("prev", "next", None). Defaults to "prev". - skip_window (int, optional): Number of sentences to skip when calculating similarity. Defaults to 1. - - Returns: - Union[List[str], List[List[str]]]: List of text chunks with improved semantic coherence. - """ - chunker = SDPMChunker( - embedding_model=embedding_model, - threshold=threshold, - chunk_size=chunk_size, - mode=mode, - min_sentences=min_sentences, - similarity_window=similarity_window, - min_chunk_size=min_chunk_size, - min_characters_per_sentence=min_characters_per_sentence, - threshold_step=threshold_step, - delim=delim, - include_delim=include_delim, - skip_window=skip_window - ) - - return Chunk._process_chunks(chunker, text) # this is for chonkie late chunker @staticmethod diff --git a/pyproject.toml b/pyproject.toml index 2b31d46..4da158a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ keywords = [ ] dependencies = [ "anthropic>=0.64.0", - "chonkie>=1.1.1", + "chonkie[all]>=1.2.1", "fastmcp>=2.10.6", "google-genai>=1.31.0", "numpy >= 2.0.2",
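
--

Usage note (not part of the applied patch): a minimal sketch of the reworked
`Chunk.semantic_chunk` API as changed above. It assumes only the new signature
shown in this diff; the parameter values are illustrative, not recommendations.

    import helix

    text = "Long document text to split into semantically coherent pieces. " \
           "Each sentence is embedded and grouped by similarity."

    # Per this patch: threshold is now a plain float in [0, 1] (previously "auto"),
    # similarity_window defaults to 3 (previously 1), min_sentences_per_chunk
    # replaces min_sentences, and extra keyword arguments are forwarded to the
    # embedding model via **embedding_kwargs.
    chunks = helix.Chunk.semantic_chunk(
        text,
        threshold=0.8,
        chunk_size=2048,
        similarity_window=3,
    )
    print(chunks)  # chunk texts, per the Union[List[str], List[List[str]]] annotation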
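Migration note (an assumption, not verified against chonkie 1.2.1): this patch
removes `Chunk.sdp_chunk` and its `SDPMChunker` import because chonkie folded
skip-window merging into `SemanticChunker`; the new `skip_window` parameter on
`semantic_chunk` appears to cover the old behavior. A sketch of the likely
replacement for existing callers:

    import helix

    massive_text_blob = "Sample text previously passed to sdp_chunk..."  # illustrative input

    # before this patch: chunks = helix.Chunk.sdp_chunk(massive_text_blob)
    # after (skip_window=1 mirrors sdp_chunk's former default; equivalence assumed):
    chunks = helix.Chunk.semantic_chunk(massive_text_blob, skip_window=1)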