Part 2: Compare Three LlamaIndex Chunking Techniques (Retrieval-Only RAG)

Step 1. Environment Setup

%pip install -U "llama-index==0.14.2" \
  llama-index-embeddings-huggingface sentence-transformers faiss-cpu numpy pandas


In [2]:
import sys
print("Python:", sys.executable)

# verify package is in THIS kernel
from importlib.metadata import version
print("llama-index:", version("llama-index"))

# new-style imports for 0.14+
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
print("Imports OK ✅")


Python: /usr/local/bin/python3.11
llama-index: 0.14.2


  from .autonotebook import tqdm as notebook_tqdm


Imports OK ✅


In [3]:
import os, urllib.request, ssl, certifi, textwrap, time, math
import numpy as np, pandas as pd

TINY_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
DATA_PATH = "tinyshakespeare.txt"

def fetch_with_certifi(url, path):
    if os.path.exists(path):
        return
    ctx = ssl.create_default_context(cafile=certifi.where())
    with urllib.request.urlopen(url, context=ctx, timeout=60) as r, open(path, "wb") as f:
        f.write(r.read())

fetch_with_certifi(TINY_URL, DATA_PATH)
raw_text = open(DATA_PATH, "r", encoding="utf-8").read()
print("Chars:", len(raw_text))
print(raw_text[:500])


Chars: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


Step 2 — embeddings (HF MiniLM)

In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed  # global default for LlamaIndex

qe = embed.get_text_embedding("hello world")
print("Embedding dim:", len(qe), "| first 8:", [round(v,4) for v in qe[:8]])


Embedding dim: 384 | first 8: [-0.0345, 0.031, 0.0067, 0.0261, -0.0394, -0.1603, 0.0669, -0.0064]


Step 3 - wrap text as a Document

In [5]:
from llama_index.core import Document
doc = Document(text=raw_text)


Step 4 - TOKEN chunking → nodes + index

In [6]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core import VectorStoreIndex
import numpy as np

tok_splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=50)
token_nodes = tok_splitter.get_nodes_from_documents([doc])

token_index = VectorStoreIndex(token_nodes)  # in-memory

def stats(nodes):
    lens = [len(n.get_content()) for n in nodes]
    return dict(n=len(lens), avg_len=round(float(np.mean(lens)),1))

print("TOKEN stats:", stats(token_nodes))


TOKEN stats: {'n': 1480, 'avg_len': 935.4}


Step 5 - retrieval helper (table + metrics)

In [7]:
import time, textwrap, pandas as pd
from numpy.linalg import norm
import numpy as np

def cosine(a, b):
    a = np.array(a, dtype=np.float32); b = np.array(b, dtype=np.float32)
    return float(np.dot(a,b) / (norm(a)*norm(b) + 1e-12))

def run_retrieval(index, nodes, query, k=5, label="TOKEN"):
    qv = embed.get_text_embedding(query)
    print(f"\n---- {label}----")
    print("query vec shape:", np.shape(qv), "| first 8:", [round(v,4) for v in qv[:8]])

    retriever = index.as_retriever(similarity_top_k=k)
    t0 = time.perf_counter()
    results = retriever.retrieve(query)
    latency_ms = (time.perf_counter() - t0) * 1000

    rows, doc_vecs = [], []
    for rank, r in enumerate(results, 1):
        node = getattr(r, "node", r)
        store_score = getattr(r, "score", None)
        text = node.get_content()
        dv = embed.get_text_embedding(text)
        doc_vecs.append(dv)
        rows.append({
            "rank": rank,
            "store_score": None if store_score is None else round(float(store_score),4),
            "cosine_sim": round(cosine(qv, dv), 4),
            "chunk_len": len(text),
            "preview": textwrap.shorten(text.replace("\n"," "), width=160, placeholder="…"),
        })
    doc_mat = np.array(doc_vecs)
    print("doc matrix shape:", doc_mat.shape)

    df = pd.DataFrame(rows, columns=["rank","store_score","cosine_sim","chunk_len","preview"])
    display(df)

    top1 = max(r["cosine_sim"] for r in rows) if rows else float("nan")
    mean_k = round(float(np.mean([r["cosine_sim"] for r in rows])),4) if rows else float("nan")
    return {
        "technique": label,
        "top1_cosine": top1,
        "mean@k": mean_k,
        "k": k,
        "num_chunks": len(nodes),
        "avg_chunk_len": stats(nodes)["avg_len"],
        "latency_ms": round(latency_ms, 2),
    }


Step 6 - Run TOKEN retrieval on the assignment query

In [8]:
QUERY = "Who are the two feuding houses?"
metrics_token = run_retrieval(token_index, token_nodes, QUERY, k=5, label="TOKEN")
metrics_token



---- TOKEN----
query vec shape: (384,) | first 8: [-0.0041, 0.0067, -0.0163, 0.0026, -0.0501, 0.0021, -0.0269, -0.0752]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.3252,0.3252,1004,"together; ay, and 'twere pity To sunder them t..."
1,2,0.3175,0.3175,1014,him worthy whose offence subdues him And curse...
2,3,0.3045,0.3045,921,house shall move me to stand: I will take the ...
3,4,0.3039,0.3039,934,possible. Messenger: The nobles in great earne...
4,5,0.3013,0.3013,1045,"the Capitol; who's like to rise, Who thrives a..."


{'technique': 'TOKEN',
 'top1_cosine': 0.3252,
 'mean@k': 0.3105,
 'k': 5,
 'num_chunks': 1480,
 'avg_chunk_len': 935.4,
 'latency_ms': 47.75}

Step 7 - SEMANTIC chunking → index + retrieval

In [9]:
from llama_index.core.node_parser import SemanticSplitterNodeParser

sem_parser = SemanticSplitterNodeParser(
    buffer_size=100,
    breakpoint_percentile_threshold=95,
    embed_model=embed
)
semantic_nodes = sem_parser.get_nodes_from_documents([doc])
semantic_index = VectorStoreIndex(semantic_nodes)

print("SEMANTIC stats:", stats(semantic_nodes))
metrics_semantic = run_retrieval(semantic_index, semantic_nodes, QUERY, k=5, label="SEMANTIC")
metrics_semantic


SEMANTIC stats: {'n': 624, 'avg_len': 1787.5}

---- SEMANTIC----
query vec shape: (384,) | first 8: [-0.0041, 0.0067, -0.0163, 0.0026, -0.0501, 0.0021, -0.0269, -0.0752]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.3535,0.3535,252,NORTHUMBERLAND: Reproach and dissolution hange...
1,2,0.3119,0.3119,176,"WARWICK: And mine, fair lady Bona, joins with ..."
2,3,0.299,0.299,536,SICINIUS: This is most likely! BRUTUS: Raised ...
3,4,0.2951,0.2951,1155,"But, tell me, is young George Stanley living? ..."
4,5,0.2907,0.2907,163,LORD ROSS: And living too; for now his son is ...


{'technique': 'SEMANTIC',
 'top1_cosine': 0.3535,
 'mean@k': 0.31,
 'k': 5,
 'num_chunks': 624,
 'avg_chunk_len': 1787.5,
 'latency_ms': 27.58}

Step 8 - SENTENCE-WINDOW chunking → index + retrieval

In [10]:
from llama_index.core.node_parser import SentenceWindowNodeParser

sentwin_parser = SentenceWindowNodeParser(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
sentence_nodes = sentwin_parser.get_nodes_from_documents([doc])
sentence_index = VectorStoreIndex(sentence_nodes)

print("SENTENCE-WINDOW stats:", stats(sentence_nodes))
metrics_sentence = run_retrieval(sentence_index, sentence_nodes, QUERY, k=5, label="SENTENCE-WINDOW")
metrics_sentence


SENTENCE-WINDOW stats: {'n': 12453, 'avg_len': 89.6}

---- SENTENCE-WINDOW----
query vec shape: (384,) | first 8: [-0.0041, 0.0067, -0.0163, 0.0026, -0.0501, 0.0021, -0.0269, -0.0752]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.5126,0.5126,47,here comes two of the house of the Montagues.
1,2,0.4763,0.4763,35,WARWICK: And I the house of York.
2,3,0.4599,0.4599,41,"As I remember, this should be the house."
3,4,0.4565,0.4565,21,ROMEO: Whose house?
4,5,0.4253,0.4253,148,"GLOUCESTER: Two of thy name, both Dukes of Som..."


{'technique': 'SENTENCE-WINDOW',
 'top1_cosine': 0.5126,
 'mean@k': 0.4661,
 'k': 5,
 'num_chunks': 12453,
 'avg_chunk_len': 89.6,
 'latency_ms': 355.13}

Step 9 - compare metrics

In [11]:
cmp_df = pd.DataFrame([metrics_token, metrics_semantic, metrics_sentence])
display(cmp_df[["technique","top1_cosine","mean@k","num_chunks","avg_chunk_len","latency_ms"]])


Unnamed: 0,technique,top1_cosine,mean@k,num_chunks,avg_chunk_len,latency_ms
0,TOKEN,0.3252,0.3105,1480,935.4,47.75
1,SEMANTIC,0.3535,0.31,624,1787.5,27.58
2,SENTENCE-WINDOW,0.5126,0.4661,12453,89.6,355.13


Step 10 - queries for stronger comparison

In [12]:
EXTRA_QUERIES = [
    "Who is Romeo in love with?",
    "Which play contains the line 'To be, or not to be'?"
]
extra_rows = []
for q in EXTRA_QUERIES:
    for (idx, nodes, label) in [
        (token_index, token_nodes, "TOKEN"),
        (semantic_index, semantic_nodes, "SEMANTIC"),
        (sentence_index, sentence_nodes, "SENTENCE-WINDOW"),
    ]:
        m = run_retrieval(idx, nodes, q, k=5, label=f"{label} | {q}")
        m["query"] = q
        extra_rows.append(m)
extra_df = pd.DataFrame(extra_rows)
display(extra_df[["query","technique","top1_cosine","mean@k","latency_ms"]])



---- TOKEN | Who is Romeo in love with?----
query vec shape: (384,) | first 8: [-0.0867, 0.01, 0.0607, 0.0203, -0.0117, 0.0436, 0.1227, 0.0559]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.6413,0.6413,915,"he must complain, And she steal love's sweet b..."
1,2,0.6054,0.6054,964,name: if he be married. My grave is like to be...
2,3,0.5999,0.5999,927,"why, no. But sadly tell me who. ROMEO: Bid a s..."
3,4,0.5784,0.5784,850,me oft for loving Rosaline. FRIAR LAURENCE: Fo...
4,5,0.5753,0.5753,863,would as willingly give cure as know. BENVOLIO...



---- SEMANTIC | Who is Romeo in love with?----
query vec shape: (384,) | first 8: [-0.0867, 0.01, 0.0607, 0.0203, -0.0117, 0.0436, 0.1227, 0.0559]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.7522,0.7522,28,ROMEO: What is her mother?
1,2,0.6388,0.6388,610,"ROMEO: One, gentlewoman, that God hath made fo..."
2,3,0.6333,0.6333,465,"ROMEO: Ay, if I know the letters and the langu..."
3,4,0.5749,0.5749,67,"wast thou with Rosaline? ROMEO: With Rosaline,..."
4,5,0.5483,0.5483,237,I wonder at this haste; that I must wed Ere he...



---- SENTENCE-WINDOW | Who is Romeo in love with?----
query vec shape: (384,) | first 8: [-0.0867, 0.01, 0.0607, 0.0203, -0.0117, 0.0436, 0.1227, 0.0559]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.8024,0.8024,17,ROMEO: Whither?
1,2,0.7949,0.7949,47,"ROMEO: Out of her favour, where I am in love."
2,3,0.7853,0.7853,42,"ROMEO: Why, such is love's transgression."
3,4,0.7833,0.7833,21,Where's Romeo's man?
4,5,0.7802,0.7802,22,ROMEO: Is it even so?



---- TOKEN | Which play contains the line 'To be, or not to be'?----
query vec shape: (384,) | first 8: [-0.0099, 0.0522, -0.0423, -0.0462, 0.0166, 0.1221, 0.0848, -0.0462]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.4105,0.4105,870,more by crossing their high will. First Musici...
1,2,0.4059,0.4059,993,woes We cannot without circumstance descry. Se...
2,3,0.4029,0.4029,993,means not to be found. ROMEO: He jests at scar...
3,4,0.4017,0.4017,836,"begone; the sport is at the best. ROMEO: Ay, s..."
4,5,0.396,0.396,890,"the sun exhales, To be to thee this night a to..."



---- SEMANTIC | Which play contains the line 'To be, or not to be'?----
query vec shape: (384,) | first 8: [-0.0099, 0.0522, -0.0423, -0.0462, 0.0166, 0.1221, 0.0848, -0.0462]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.3885,0.3885,977,O rude unthankfulness! Thy fault our law calls...
1,2,0.3764,0.3764,6739,Bear hence this body and attend our will: Merc...
2,3,0.3665,0.3665,47,SICINIUS: Tell not me: I know this cannot be.
3,4,0.366,0.366,2944,"God join'd my heart and Romeo's, thou our hand..."
4,5,0.3595,0.3595,219,"TRANIO: Sir, I shall not be slack: in sign whe..."



---- SENTENCE-WINDOW | Which play contains the line 'To be, or not to be'?----
query vec shape: (384,) | first 8: [-0.0099, 0.0522, -0.0423, -0.0462, 0.0166, 0.1221, 0.0848, -0.0462]
doc matrix shape: (5, 384)


Unnamed: 0,rank,store_score,cosine_sim,chunk_len,preview
0,1,0.5407,0.5407,32,JULIET: What must be shall be.
1,2,0.4852,0.4852,39,JULIET: Speakest thou from thy heart?
2,3,0.4806,0.4806,48,"JULIET: It is, it is: hie hence, be gone, away!"
3,4,0.4783,0.4783,58,PERDITA: I see the play so lies That I must be...
4,5,0.4651,0.4651,53,JULIET: What satisfaction canst thou have to-n...


Unnamed: 0,query,technique,top1_cosine,mean@k,latency_ms
0,Who is Romeo in love with?,TOKEN | Who is Romeo in love with?,0.6413,0.6001,52.69
1,Who is Romeo in love with?,SEMANTIC | Who is Romeo in love with?,0.7522,0.6295,29.04
2,Who is Romeo in love with?,SENTENCE-WINDOW | Who is Romeo in love with?,0.8024,0.7892,333.55
3,"Which play contains the line 'To be, or not to...","TOKEN | Which play contains the line 'To be, o...",0.4105,0.4034,42.11
4,"Which play contains the line 'To be, or not to...",SEMANTIC | Which play contains the line 'To be...,0.3885,0.3714,23.83
5,"Which play contains the line 'To be, or not to...",SENTENCE-WINDOW | Which play contains the line...,0.5407,0.49,253.36
