Merged
144 changes: 42 additions & 102 deletions examples/chunking/chunking.ipynb
@@ -30,7 +30,16 @@
"execution_count": 1,
"id": "2b89d611",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/putt/Documents/Github/helix-py/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import helix"
]
@@ -168,7 +177,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 2/2 batches chunked [00:00<00:00, 19599.55batch/s] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 2/2 batches chunked [00:00<00:00, 19784.45batch/s] 🌱\n"
]
},
{
@@ -228,7 +237,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 27.60doc/s] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 29.97doc/s] 🌱\n"
]
},
{
@@ -288,7 +297,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 25.25doc/s] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 30.42doc/s] 🌱\n"
]
},
{
@@ -348,7 +357,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 7717.21doc/s] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 9998.34doc/s] 🌱\n"
]
},
{
@@ -386,20 +395,17 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/putt/Documents/Github/helix-py/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
"/Users/putt/Documents/Github/helix-py/.venv/lib/python3.11/site-packages/chonkie/embeddings/model2vec.py:63: RuntimeWarning: invalid value encountered in divide\n",
" return np.divide(\n"
]
},
{
"data": {
"text/plain": [
"['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.',\n",
" '\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text.',\n",
"['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization.',\n",
" ' Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text.',\n",
" ' This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers.',\n",
" ' Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.',\n",
" '\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications.',\n",
" ' The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing.',\n",
" ' By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']"
" ' Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications. The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing. By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']"
]
},
"execution_count": 13,
@@ -422,7 +428,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 2894.62doc/s] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 6432.98doc/s] 🌱\n"
]
},
{
@@ -442,72 +448,6 @@
"batch_chunks"
]
},
{
"cell_type": "markdown",
"id": "4363d271",
"metadata": {},
"source": [
"### SDPM Chunker"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "d2e2b2b8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.',\n",
" '\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text.',\n",
" ' This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers.',\n",
" ' Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.',\n",
" '\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications.',\n",
" ' The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing.',\n",
" ' By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chunks = helix.Chunk.sdp_chunk(massive_text_blob)\n",
"chunks"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "6a49d6f3",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 3889.02doc/s] 🌱\n"
]
},
{
"data": {
"text/plain": [
"['First document to chunk with some content for testing.',\n",
" 'Second document with different content for batch processing.']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"batch_chunks = helix.Chunk.sdp_chunk(texts)\n",
"batch_chunks"
]
},
{
"cell_type": "markdown",
"id": "f74c8cab",
@@ -518,7 +458,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 15,
"id": "ad30713e",
"metadata": {},
"outputs": [
@@ -535,7 +475,7 @@
"['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. It contains multiple sentences and paragraphs that need to be divided appropriately to maintain context while fitting within token limits. When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text. This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers. Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n\\nThis example demonstrates how the token chunker works with a realistic text sample that would be common in document processing and RAG (Retrieval-Augmented Generation) applications. The chunks will be created with specified token limits and overlap settings to optimize for both comprehension and processing efficiency. Each chunk will contain metadata about its position in the original text and token count for further processing. By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']"
]
},
"execution_count": 17,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -547,15 +487,15 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 16,
"id": "065c65e7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 22.21doc/s] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 25.55doc/s] 🌱\n"
]
},
{
Expand All @@ -565,7 +505,7 @@
" 'Second document with different content for batch processing.']"
]
},
"execution_count": 18,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -585,7 +525,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 17,
"id": "76033b2d",
"metadata": {},
"outputs": [
@@ -609,7 +549,7 @@
" '. By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']"
]
},
"execution_count": 19,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -621,7 +561,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 18,
"id": "bacacfac",
"metadata": {},
"outputs": [
@@ -630,7 +570,7 @@
"output_type": "stream",
"text": [
"Device set to use cpu\n",
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 13.79doc/s] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:00<00:00, 16.61doc/s] 🌱\n"
]
},
{
Expand All @@ -640,7 +580,7 @@
" 'Second document with different content for batch processing.']"
]
},
"execution_count": 20,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -668,7 +608,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 19,
"id": "36fdf7a8",
"metadata": {},
"outputs": [
@@ -678,7 +618,7 @@
"True"
]
},
"execution_count": 21,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -690,26 +630,26 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 20,
"id": "3f3fdc9e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 36/36 splits processed [00:53<00:00, 1.49s/split] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 36/36 splits processed [00:24<00:00, 1.46split/s] 🌱\n"
]
},
{
"data": {
"text/plain": [
"['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. Itcontainsmultiplesentencesandparagraphsthatneedtobedividedappropriatelytomaintaincontextwhilefittingwithintokenlimits.When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. ',\n",
" 'For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text. This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers. Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n',\n",
"['\\nThis is a massive text blob that we want to chunk into smaller pieces for processing. Itcontainsmultiplesentencesandparagraphsthatneedtobedividedappropriatelytomaintaincontextwhilefittingwithintokenlimits.When working with large documents, it is important to ensure that each chunk maintains enough context for downstream tasks, such as retrieval or summarization. Chunking strategies can vary depending on the use case, but the goal is always to balance context preservation with processing efficiency.\\n\\nThe chunker should handle overlaps properly to ensure no important information is lost at chunk boundaries. For example, if a sentence is split between two chunks, the overlap ensures that both chunks retain the full meaning of the text. This is especially important in applications like document question answering, where missing a single sentence could lead to incorrect answers. ',\n",
" 'Additionally, chunkers may need to account for different languages, code blocks, or special formatting, which can add complexity to the chunking process.\\n',\n",
" '\\nThisexampledemonstrateshowthetokenchunkerworkswitharealistictextsamplethatwouldbecommonindocumentprocessingandRAG(Retrieval-Augmented Generation) applications. Thechunkswillbecreatedwithspecifiedtokenlimitsandoverlapsettingstooptimizeforbothcomprehensionandprocessingefficiency.Each chunk will contain metadata about its position in the original text and token count for further processing. By using a robust chunking strategy, we can ensure that downstream models receive high-quality, context-rich input, improving the overall performance of NLP pipelines and applications.\\n']"
]
},
"execution_count": 22,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -721,17 +661,17 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 21,
"id": "6cf1fdff",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.80s/split] 🌱\n",
"🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.17s/split] 🌱\n",
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:10<00:00, 5.49s/doc] 🌱\n"
"🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.79s/split] 🌱\n",
"🦛 choooooooooooooooooooonk 100% • 1/1 splits processed [00:05<00:00, 5.71s/split] 🌱\n",
"🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:11<00:00, 5.75s/doc] 🌱\n"
]
},
{
@@ -741,7 +681,7 @@
" 'Second document with different content for batch processing.']"
]
},
"execution_count": 23,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
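The progress bars throughout this diff come from helix chunking documents in batches with a configurable token limit and overlap. As a rough illustration of that token-chunking-with-overlap behaviour, here is a minimal self-contained sketch; it is not the helix/chonkie implementation, and `chunk_tokens`, the whitespace "tokenizer", and the parameter defaults are all hypothetical.

```python
def chunk_tokens(text: str, chunk_size: int = 64, overlap: int = 8) -> list[str]:
    """Split whitespace tokens into overlapping chunks.

    Hypothetical helper for illustration only; real chunkers use a proper
    tokenizer and carry positional metadata with each chunk.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    tokens = text.split()
    step = chunk_size - overlap  # each new chunk re-reads `overlap` tokens
    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(" ".join(window))
        if start + chunk_size >= len(tokens):
            break  # last window already covers the tail
    return chunks


blob = " ".join(f"word{i}" for i in range(150))
chunks = chunk_tokens(blob, chunk_size=64, overlap=8)
print(len(chunks), [len(c.split()) for c in chunks])  # → 3 [64, 64, 38]
```

The overlap means the last 8 tokens of one chunk reappear at the start of the next, which is how sentence fragments split at a boundary stay intact in at least one chunk, the property the notebook's prose outputs describe.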