In [48]:
import re

# Load the book's text
with open('data/TKMBFullBook.txt', 'r', encoding='utf-8') as f:
    text = f.read()


chapters = re.split(r'Chapter \d+', text)
if chapters[0].strip() == "":
    chapters.pop(0) # Remove the empty string before the first chapter

In [98]:
from nltk.tokenize import word_tokenize

WINDOW = 80 # Number of tokens in each chunk
STRIDE = 20 # Number of tokens to shift for the next chunk

def chapter_to_chunks(chapter_text):
    tokens = word_tokenize(chapter_text)
    chunks = []
    for start in range(0, len(tokens), STRIDE):
        end = start + WINDOW
        if end > len(tokens):
            break
        chunk_tokens = tokens[start:end]
        chunks.append(" ".join(chunk_tokens))
    return chunks

all_chunks = []
for chap_num, chap_text in enumerate(chapters, start=1):
    for i, chunk in enumerate(chapter_to_chunks(chap_text), start=1):
        all_chunks.append({
            "chapter": chap_num,
            "chunk_id": f"{chap_num}_{i}",
            "text": chunk
        })
print(f"Total chunks created: {len(all_chunks)}")
print(f"Example chunk: {all_chunks[500]}")
#print(chapters[0][:448])  # Print the first 500 characters of the first chapter for context
#print(f"Total chapters: {len(chapters)}")

Total chunks created: 6318
Example chunk: {'chapter': 3, 'chunk_id': '3_63', 'text': 'fault . “ Hush your fussin ‘ , ” she said . Jem and Walter returned to school ahead of me : staying behind to advise Atticus of Calpurnia ’ s iniquities was worth a solitary sprint past the Radley Place . “ She likes Jem better ’ n she likes me , anyway , ” I concluded , and suggested that Atticus lose no time in packing her off . “ Have you ever considered that Jem doesn ’'}


In [None]:
#TODO: Analyze the distribution of chunk lengths

In [88]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> list[str]:
    
      #normalizing curly apostrophes
    text = text.replace("’", "'").replace("‘", "'")
    # Removing multi-dots to a single space (...)
    text = re.sub(r'\.{2,}', ' ', text)
    # Replacing hyphens/slashes with spaces (so “cross-road” → “cross road”)
    text = re.sub(r'[-/]', ' ', text)
    # Removing all punctuation except apostrophes
    # Keeping letters, numbers, whitespace, and apostrophes for contractions/possessives
    text = re.sub(r"[^\w\s']+", '', text)
    # Normalizing whitespace and lowercasing
    text = re.sub(r'\s+', ' ', text).strip().lower()
    
    # Tokenize the text into words
    tokens = word_tokenize(text.lower())  # Convert to lowercase for consistency
    # Lemmatize each word
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Return the list of lemmatized tokens
    return lemmatized_tokens

In [89]:
from rank_bm25 import BM25Okapi

# Create a list of lemmatized texts for BM25
Corpus = [lemmatize_text(chunk['text']) for chunk in all_chunks]

# Initialize BM25 with the corpus for ranking and retrieval
bm25 = BM25Okapi(Corpus)

In [90]:
query = "The summer sun hung low over Maycomb, casting long shadows through the dusty streets and swaying branches of ancient oaks. Jem and Scout Finch sat on the porch steps, their bare feet dangling, watching the world with wide, curious eyes. It was the kind of afternoon where time seemed to slow, yet the air was thick with a tension they couldn’t quite name.\nTommy Harris was new in town, a shy boy with a crooked smile who kept to himself. One afternoon, a window shattered at the church, and fingers quickly pointed toward Tommy. Rumors swirled — some said he was clumsy, others whispered darker things. But Jem and Scout, knowing the boy’s quiet kindness, couldn’t believe he was guilty of such mischief.\nAtticus, their father, sat stiffly at the kitchen table, the weight of the town’s silent judgments pressing down on him. “Sometimes,” he said, “people see what they expect to see, not what is true.” His voice was calm, but Scout caught the flicker of sadness in his eyes.\nThe trial was a schoolyard spectacle. Tommy, nervous and small against the towering accusations, tried to explain, but his innocence was swallowed by suspicion. The town’s fear painted him in shades of guilt, a stark contrast to the gentle boy Jem and Scout knew.\nIn the end, it wasn’t the truth that won, but the loudest voices in the room. Jem and Scout learned that innocence, like a fragile flower, can be mistaken for something else entirely — a shadow cast longer by fear and misunderstanding. And in that small town, where the line between right and wrong blurred with the setting sun, the children held tight to the hope that one day, truth would find its way back to the light."
processed_query = lemmatize_text(query)

doc_scores = bm25.get_scores(processed_query)
doc_scores

array([231.32219414, 226.17348131, 221.04755136, ..., 211.13537439,
       182.32492176, 188.50500111])

In [91]:
doc_scores = bm25.get_scores(processed_query)

# Get the top 2 indices (not the token lists)
top_indices = doc_scores.argsort()[-2:][::-1]  # Get indices of top 2 scores in descending order

print(f"Found top 2 matching chunks:\n")

for rank, idx in enumerate(top_indices):
    chunk = all_chunks[idx]
    bm25_score = doc_scores[idx]
    
    print(f"Rank {rank + 1}:")
    print(f"'chapter': {chunk['chapter']}, 'paragraph_Id': {chunk['chunk_id']}, 'bm25_score': {bm25_score:.4f}")
    print(f"'text': {chunk['text']}")
    print("-" * 80)

Found top 2 matching chunks:

Rank 1:
'chapter': 6, 'paragraph_Id': 6_29, 'bm25_score': 287.5896
'text': then tried his weight by degrees . The step was silent . Jem skipped two steps , put his foot on the porch , heaved himself to it , and teetered a long moment . He regained his balance and dropped to his knees . He crawled to the window , raised his head and looked in . Then I saw the shadow . It was the shadow of a man with a hat on . At first I thought it was a tree , but there was no wind blowing , and tree-trunks never walked . The back porch was bathed in moonlight , and the shadow , crisp as toast , moved across the porch toward Jem . Dill saw it next . He put his hands to his face . When it crossed Jem , Jem saw it . He put his arms over his head and
--------------------------------------------------------------------------------
Rank 2:
'chapter': 26, 'paragraph_Id': 26_13, 'bm25_score': 281.4191
'text': let us know he knew a lot more about something than we thought he knew 

Evalute usinng precision @ K

Addiotional Evalution and add some stats to the data