In [1]:
import re

# Load the book's text
with open('data/TKMBFullBook.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [7]:
chapters = re.split(r'Chapter \d+', text)
if chapters[0].strip() == "":
    chapters.pop(0) # Remove the empty string before the first chapter

In [21]:
WINDOW = 200 # Number of words in each chunk # 80
STRIDE = 40 # Number of words to shift for the next chunk # 40

def chapter_to_chunks(chapter_text):
    # Simple word splitting (preserving original text for transformer tokenizer)
    words = chapter_text.split()
    chunks = []
    
    for start in range(0, len(words), STRIDE):
        end = start + WINDOW
        if end > len(words):
            # For the last chunk, take all remaining words
            chunk_words = words[start:]
        else:
            chunk_words = words[start:end]
        
        # Join the words back into text with single spaces
        chunk_text = " ".join(chunk_words)
        chunks.append(chunk_text)
    
    return chunks

all_chunks = []
for chap_num, chap_text in enumerate(chapters, start=1):
    for i, chunk in enumerate(chapter_to_chunks(chap_text), start=1):
        all_chunks.append({
            "chapter": chap_num,
            "chunk_id": f"{chap_num}_{i}",
            "text": chunk
        })
        
print(f"Total chunks created: {len(all_chunks)}")
print(f"Example chunk: {all_chunks[500]}")

Total chunks created: 2508
Example chunk: {'chapter': 6, 'chunk_id': '6_66', 'text': 'white shirt bobbed over the back fence and slowly grew larger. He came up the back steps, latched the door behind him, and sat on his cot. Wordlessly, he held up his pants. He lay down, and for a while I heard his cot trembling. Soon he was still. I did not hear him stir again.'}


In [22]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

W0804 23:49:31.795000 56288 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [23]:
texts = [chunk['text'] for chunk in all_chunks]
embeddings = model.encode(texts, convert_to_tensor=False, show_progress_bar=True, batch_size=32)

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

In [24]:
embeddings[1]  # Example to show the embedding of the second chunk

array([ 5.39507857e-03,  5.76998964e-02,  3.04610394e-02, -8.37456435e-03,
       -1.27350660e-02,  1.36908144e-02, -2.28492673e-02,  3.03966319e-03,
       -2.67136414e-02,  6.41480014e-02,  1.07765652e-03,  2.55825389e-02,
        2.84903776e-02, -6.11360520e-02, -4.73947078e-02, -1.52217336e-02,
       -3.97432894e-02, -3.06693055e-02, -7.92617574e-02,  5.51033951e-02,
       -6.05724119e-02,  6.33465126e-02,  8.25765952e-02,  6.36205673e-02,
       -4.81975190e-02, -8.79382435e-03,  1.59304570e-02,  1.29816458e-02,
       -1.57130733e-02, -8.80664140e-02, -1.52241974e-03, -7.32340990e-03,
        1.17485570e-02, -4.13204655e-02, -7.11574331e-02,  9.70822666e-03,
        1.02118514e-01,  2.02409048e-02,  2.82099973e-02, -8.54067281e-02,
        3.70315723e-02, -2.14800537e-02,  4.22092304e-02, -2.29711756e-02,
       -1.12135760e-01,  4.83590439e-02,  3.79649177e-02,  5.50652742e-02,
        2.16449481e-02,  3.41635803e-03,  3.63376252e-02,  1.32745923e-02,
        4.11742106e-02, -

In [25]:
embeddings.shape

(2508, 384)

In [None]:
embeddings = []

for chunk in all_chunks:
    embedding = model.encode(chunk['text'], convert_to_tensor=True)
    embeddings.append({
        "chunk_id": chunk['chunk_id'],
        "embedding": embedding
    })