In [3]:
import re

from datasets import load_dataset
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Separators handed to the recursive splitter built later in this cell,
# listed from coarsest (headings) to finest (individual characters).
#
# NOTE(review): several entries are written like regular expressions
# ("#{1,6}", "\\*\\*\\*+", "---+"), but the splitter below is created without
# `is_separator_regex=True`, so -- assuming LangChain's default -- they are
# matched as literal text. The sample printed by this cell shows the corpus
# text literally starting with "\n#{1,6} Q:", and the plain `str.split` on the
# first entry finds split points, so the literal form does match this corpus.
# Confirm this is intended before switching the splitter to regex matching.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # heading marker (appears verbatim in the sampled text)
    "```\n",  # fenced code block delimiter followed by a newline
    "\n\\*\\*\\*+\n",  # horizontal rule made of asterisks (regex-escaped form)
    "\n---+\n",  # horizontal rule made of dashes
    "\n___+\n",  # horizontal rule made of underscores
    "\n\n",  # blank line / paragraph break
    "\n",  # single line break
    " ",  # single space
    "",  # empty separator: the finest-grained fallback
]

# Load the training split exactly once, shuffle it with a fixed seed so the
# sample is reproducible, and keep at most MAX_DOCS rows for quick experiments.
# Shuffling does not change the row count, so the length of the freshly loaded
# split is the right bound for `select`.
MAX_DOCS = 1000
corpus = load_dataset("igzi/pile-stem-corpus-small", split="train")
ds = corpus.shuffle(seed=42).select(range(min(MAX_DOCS, len(corpus))))

# Tokenizer whose token counts define the chunk budget used by the splitter.
tokenizer = AutoTokenizer.from_pretrained("igzi/MNLP_M2_document_encoder")

# Chunk budget in tokens; consecutive chunks overlap by a tenth of it.
chunk_tokens = 512
splitter_options = dict(
    separators=MARKDOWN_SEPARATORS,
    chunk_size=chunk_tokens,
    chunk_overlap=chunk_tokens // 10,
    strip_whitespace=True,
    add_start_index=True,
)
splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    **splitter_options,
)
# Split the first few shuffled documents and keep every document's chunks.
# The bound is clamped to the dataset size so a short dataset cannot raise
# IndexError, mirroring the `min(...)` guard used when the dataset is loaded.
N_PREVIEW_DOCS = 10
n_preview = min(N_PREVIEW_DOCS, len(ds))
assert n_preview > 0, "dataset is empty - nothing to split"

chunks_by_doc = []
for i in range(n_preview):
    # 1. Take the raw text of the i-th document
    sample_text = ds[i]["text"]

    # 2. Wrap the text in a LangChain Document
    doc = Document(page_content=sample_text)

    # 3. Perform the splitting and remember the result for this document
    chunks = splitter.split_documents([doc])
    chunks_by_doc.append(chunks)

# After the loop `sample_text`, `doc` and `chunks` refer to the LAST previewed
# document; the later inspection cells rely on those names.
print(len(sample_text))  # characters in the last document
print(len(chunks))  # chunks produced for the last document
print(sample_text[:10])

# Plain (non-regex) split on the first separator, to count how many literal
# heading markers the document contains.
print(len(sample_text.split(MARKDOWN_SEPARATORS[0])))

394988
375

#{1,6} Q:
501


In [6]:
# Spot-check a single chunk of the last split document by position.
# The index is hand-picked; it must be smaller than len(chunks).
chunk_index = 283
print(chunks[chunk_index])

page_content='#{1,6} Q:

Proving function has simple pole and residue Suppose $f$ is analytic and not constant on the domain $D \subseteq \mathbb{C}$.
  
  If $z_0 \in D$ is a zero of $f$ of order $k$, show that the
  function $\frac{f'(z)}{f(z)}$ has a simple pole at $z_0$ with residue
  $k$.


I am not entirely sure how to manipulate the definition of a residue of orders greater than 1 in order to show this.

A:

Hint: $f(z) = g(z) (z - z_0)^k$ where $g(z_0) \ne 0$ is analytic on $D$.  What does this say about $f'(z)/f(z)$?
#{1,6} Q:

What does it mean: $a\in X$ is open in $X \subset \mathbb{R^n}$ I'm in a course of multivariable real analisys and I have to prove this:


  $a\in X$ is open in $X\subset\mathbb{R}^n$ (in the related topology  to $X$) if and only if   $a$ is a isolated point.


I don't understand what does  the first part means, after this I can do the proof, please unclear my doubts.

A:

Here is a more accurate statement: "For each subset $X \subset \mathbb{R}$ and ea

In [29]:
# Look for the first chunk whose token count (special tokens excluded)
# exceeds the 512-token budget; print it if one exists, otherwise stay silent.
oversized = next(
    (
        candidate
        for candidate in chunks
        if len(tokenizer.encode(candidate.page_content, add_special_tokens=False)) > 512
    ),
    None,
)
if oversized is not None:
    print(oversized)

In [12]:
pattern = r"\n#{1,6} "

# Regex split of the sample text on the heading pattern.
# NOTE: this rebinds `chunks` from the splitter's Document objects to plain
# strings, so cells that read `.page_content` must not run after this one.
chunks = re.compile(pattern).split(sample_text)

# Token count of every piece, without special tokens.
chunk_sizes = []
for piece in chunks:
    chunk_sizes.append(len(tokenizer.encode(piece, add_special_tokens=False)))

# Per-piece report (1-based numbering).
for position, n_tokens in enumerate(chunk_sizes, start=1):
    print(f"Chunk {position}: {n_tokens} tokens")

# Summary statistics over all pieces.
n_pieces = len(chunk_sizes)
print(f"\nTotal chunks: {n_pieces}")
print(f"Average size: {sum(chunk_sizes)/n_pieces:.2f} tokens")
print(f"Max size: {max(chunk_sizes)} tokens")
print(f"Min size: {min(chunk_sizes)} tokens")

Chunk 1: 67 tokens
Chunk 2: 76 tokens
Chunk 3: 68 tokens
Chunk 4: 119 tokens
Chunk 5: 158 tokens
Chunk 6: 97 tokens
Chunk 7: 201 tokens
Chunk 8: 170 tokens
Chunk 9: 142 tokens
Chunk 10: 65 tokens
Chunk 11: 170 tokens
Chunk 12: 110 tokens
Chunk 13: 82 tokens
Chunk 14: 398 tokens
Chunk 15: 128 tokens
Chunk 16: 241 tokens
Chunk 17: 179 tokens
Chunk 18: 77 tokens
Chunk 19: 63 tokens
Chunk 20: 256 tokens
Chunk 21: 267 tokens
Chunk 22: 115 tokens
Chunk 23: 228 tokens
Chunk 24: 385 tokens
Chunk 25: 117 tokens
Chunk 26: 262 tokens
Chunk 27: 260 tokens
Chunk 28: 85 tokens
Chunk 29: 193 tokens
Chunk 30: 161 tokens
Chunk 31: 137 tokens
Chunk 32: 69 tokens
Chunk 33: 117 tokens
Chunk 34: 329 tokens
Chunk 35: 360 tokens
Chunk 36: 169 tokens
Chunk 37: 54 tokens
Chunk 38: 140 tokens
Chunk 39: 122 tokens
Chunk 40: 133 tokens
Chunk 41: 288 tokens
Chunk 42: 91 tokens
Chunk 43: 142 tokens
Chunk 44: 62 tokens
Chunk 45: 301 tokens
Chunk 46: 98 tokens
Chunk 47: 202 tokens
Chunk 48: 147 tokens
Chunk 49: 51 to