In [1]:
import re

# Load the book's text
with open('data/TKMBFullBook.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
chapters = re.split(r'Chapter \d+', text)
if chapters[0].strip() == "":
    chapters.pop(0) # Remove the empty string before the first chapter

In [None]:
WINDOW = 384  # Number of words in each chunk 
STRIDE = 60 # Number of words to shift for the next chunk 

def chapter_to_chunks(chapter_text):
    # Simple word splitting (preserving original text for transformer tokenizer)
    words = chapter_text.split()
    chunks = []
    
    for start in range(0, len(words), STRIDE):
        end = start + WINDOW
        if end > len(words):
            # For the last chunk, take all remaining words
            chunk_words = words[start:]
        else:
            chunk_words = words[start:end]
        
        # Join the words back into text with single spaces
        chunk_text = " ".join(chunk_words)
        chunks.append(chunk_text)
    
    return chunks

all_chunks = []
for chap_num, chap_text in enumerate(chapters, start=1):
    for i, chunk in enumerate(chapter_to_chunks(chap_text), start=1):
        all_chunks.append({
            "chapter": chap_num,
            "chunk_id": f"{chap_num}_{i}",
            "text": chunk
        })
        
print(f"Total chunks created: {len(all_chunks)}")
print(f"Example chunk: {all_chunks[50]}")

Total chunks created: 1673
Example chunk: {'chapter': 1, 'chunk_id': '1_51', 'text': 'the Radleys: when Jem would question him Atticus’s only answer was for him to mind his own business and let the Radleys mind theirs, they had a right to; but when it happened Jem said Atticus shook his head and said, “Mm, mm, mm.” So Jem received most of his information from Miss Stephanie Crawford, a neighborhood scold, who said she knew the whole thing. According to Miss Stephanie, Boo was sitting in the livingroom cutting some items from The Maycomb Tribune to paste in his scrapbook. His father entered the room. As Mr. Radley passed by, Boo drove the scissors into his parent’s leg, pulled them out, wiped them on his pants, and resumed his activities. Mrs. Radley ran screaming into the street that Arthur was killing them all, but when the sheriff arrived he found Boo still sitting in the livingroom, cutting up the Tribune. He was thirty-three years old then. Miss Stephanie said old Mr. Radley said n

In [4]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

W0813 12:34:00.765000 19964 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [5]:
texts = [chunk['text'] for chunk in all_chunks]
embeddings = model.encode(texts, convert_to_tensor=False, show_progress_bar=True, batch_size=32)

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [6]:
embeddings[1]  # Example to show the embedding of the second chunk

array([-1.10698612e-02,  2.98842490e-02,  2.10520942e-02, -2.23328304e-02,
       -1.47477062e-02,  2.19944287e-02, -5.79456706e-03, -2.03735996e-02,
       -3.41824368e-02, -3.51069979e-02,  3.03402339e-04,  6.35926723e-02,
        5.87773584e-02, -4.38749380e-02, -4.03507203e-02,  6.90658838e-02,
       -2.00750642e-02,  3.67354229e-02,  4.01172936e-02,  2.88017523e-02,
       -2.92388611e-02,  2.72633303e-02,  1.68018900e-02, -4.06044833e-02,
        1.24957960e-03,  2.80717164e-02,  3.14317527e-03,  3.81991081e-02,
       -1.33653833e-02, -3.70142870e-02, -2.30525732e-02, -1.21566392e-02,
       -5.28830141e-02, -5.90099841e-02,  2.55009900e-06,  1.47656482e-02,
        1.34951491e-02, -3.01998891e-02, -3.48994806e-02,  1.83912721e-02,
        8.74403343e-02,  6.55653775e-02,  7.53659522e-03, -6.02023909e-03,
       -5.96989468e-02, -1.26684252e-02, -4.99622189e-02,  6.23056144e-02,
       -1.01680048e-02,  3.44442390e-02, -5.47501550e-05,  1.39420209e-02,
        7.90802017e-03,  

In [7]:
embeddings.shape

(1673, 768)

Indexing the data with FIASS

In [None]:
import faiss
import numpy as np

# Convert to float32 ( required for FAISS)
embedding_matrix = np.array(embeddings).astype('float32')

# Build index (using cosine)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product = Cosine similarity (after normalization)

# Normalize for cosine similarity
faiss.normalize_L2(embedding_matrix)

# Add vectors
index.add(embedding_matrix)


In [9]:
def search_faiss(query, top_k=5):
    # Encode and normalize query
    query_vec = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_vec)

    # Search index
    scores, indices = index.search(query_vec, top_k)

    # Retrieve the corresponding chunks
    results = []
    for i, score in zip(indices[0], scores[0]):
        result = {
            "chunk_id": all_chunks[i]["chunk_id"],
            "chapter": all_chunks[i]["chapter"],
            "text": all_chunks[i]["text"],
            "score": float(score)
        }
        results.append(result)
    
    return results


LLM to Query

In [None]:
import openai
from openai import OpenAI
import os

#  OpenAI API key

client = OpenAI(api_key = os.environ.get("OPENAI_API_KEY")) #Using environmental variable

#generate text
def generate_text(prompt: str) -> str:
    """Generates text using the OpenAI API based on the provided prompt."""
    
    # Make the API call
    response = client.chat.completions.create(
        model="gpt-4.1-mini",  
        messages=[{"role": "system", "content": "You are a creative short story writer who writes stories in a specified book style and answer questions based on it."},
                  {"role": "user", "content": prompt}],
        temperature=0.7,  # Adjust temperature for creativity
    )
    
    return response.choices[0].message.content.strip()

In [11]:
generated_text = generate_text("In 200 words, write a short story in the style of 'To Kill a Mockingbird' about a summer in manchester, UK.")
print(generated_text)

That summer in Manchester was unlike any other. The rain, usually a constant companion, took a rare holiday, leaving the cobbled streets gleaming under a shy sun. I was ten, full of restless energy and the kind of curiosity that leads children into both trouble and wonder.

Our neighborhood was a patchwork of red-brick terraces, where families gathered on doorsteps as the evenings cooled. Mr. Thompson, the old man who lived at the end of the lane, told stories of the cotton mills that once roared louder than the city itself. His voice carried a mix of pride and sorrow, and we listened as if those tales were the only magic in the world.

One afternoon, my friend Ellie and I found a stray cat hiding behind a bin. She was thin and wary, but Ellie’s gentle hands won her trust. We named her Daisy and promised to care for her through the summer’s end. Daisy became our secret, a tiny rebellion against the grey monotony.

That summer taught me about kindness in unexpected places, the weight of

In [12]:
query = generated_text
top_results = search_faiss(query, top_k=10)

for res in top_results:
    print(f"Chapter {res['chapter']} | Chunk {res['chunk_id']} | Score: {res['score']:.3f}")
    print(res['text'])
    print("---")


Chapter 31 | Chunk 31_12 | Score: 0.725
sad. I turned to go home. Street lights winked down the street all the way to town. I had never seen our neighborhood from this angle. There were Miss Maudie’s, Miss Stephanie’s—there was our house, I could see the porch swing—Miss Rachel’s house was beyond us, plainly visible. I could even see Mrs. Dubose’s. I looked behind me. To the left of the brown door was a long shuttered window. I walked to it, stood in front of it, and turned around. In daylight, I thought, you could see to the postoffice corner. Daylight… in my mind, the night faded. It was daytime and the neighborhood was busy. Miss Stephanie Crawford crossed the street to tell the latest to Miss Rachel. Miss Maudie bent over her azaleas. It was summertime, and two children scampered down the sidewalk toward a man approaching in the distance. The man waved, and the children raced each other to him. It was still summertime, and the children came closer. A boy trudged down the sidewalk d

Cross Encoder Reranking with FAISS

In [13]:
from sentence_transformers import CrossEncoder
_reranker = None

def get_reranker():
    global _reranker
    if _reranker is None:
        _reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return _reranker

In [14]:
# FAISS -> Cross-Encoder re-ranking
# topN: how many candidates to pull from FAISS before re-ranking
# final_topk: how many to return after re-ranking

from collections import defaultdict

def search_faiss_rerank(query, topN=100, final_topk=10, group_by_chapter=True, per_chapter=3, batch_size=32):
    # 1) Retrieve candidates with FAISS
    import faiss
    q = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q)
    scores, idxs = index.search(q, topN)

    candidates = []
    for j, i in enumerate(idxs[0]):
        candidates.append({
            "chunk_id": all_chunks[i]["chunk_id"],
            "chapter": all_chunks[i]["chapter"],
            "text": all_chunks[i]["text"],
            "retrieval_score": float(scores[0][j]),
            "_idx": int(i)
        })

    if not candidates:
        return []
      # 2) Re-rank with cross-encoder
    ce = get_reranker()
    pairs = [(query, c["text"]) for c in candidates]
    ce_scores = ce.predict(pairs, batch_size=batch_size)
    for c, s in zip(candidates, ce_scores):
        c["ce_score"] = float(s)

    # 3) Sort by CE score and slice
    candidates.sort(key=lambda x: x["ce_score"], reverse=True)
    top = candidates[:final_topk]

    if not group_by_chapter:
        return top

    # 4) chapter-aware grouping
    buckets = defaultdict(list)
    
    for c in top:
        buckets[c["chapter"]].append(c)

    cards = []
    for chap, items in buckets.items():
        items.sort(key=lambda x: x["ce_score"], reverse=True)
        chapter_score = max(i["ce_score"] for i in items)
        cards.append({
            "chapter": chap,
            "chapter_score": chapter_score,
            "chunks": items[:per_chapter]
        })

    cards.sort(key=lambda x: x["chapter_score"], reverse=True)
    return cards


In [15]:
Rerank_results = search_faiss_rerank(generated_text, topN=100, final_topk=10, group_by_chapter=True, per_chapter=3, batch_size=32)

In [None]:
#search_faiss_rerank
# Sort the results by chapter score the 
Rerank_results.sort(key=lambda x: x['chapter_score'], reverse=True)

In [19]:
Rerank_results

[{'chapter': 31,
  'chapter_score': -6.439411163330078,
  'chunks': [{'chunk_id': '31_12',
    'chapter': 31,
    'text': 'sad. I turned to go home. Street lights winked down the street all the way to town. I had never seen our neighborhood from this angle. There were Miss Maudie’s, Miss Stephanie’s—there was our house, I could see the porch swing—Miss Rachel’s house was beyond us, plainly visible. I could even see Mrs. Dubose’s. I looked behind me. To the left of the brown door was a long shuttered window. I walked to it, stood in front of it, and turned around. In daylight, I thought, you could see to the postoffice corner. Daylight… in my mind, the night faded. It was daytime and the neighborhood was busy. Miss Stephanie Crawford crossed the street to tell the latest to Miss Rachel. Miss Maudie bent over her azaleas. It was summertime, and two children scampered down the sidewalk toward a man approaching in the distance. The man waved, and the children raced each other to him. It wa

Evaluation

In [None]:
#precision@k
def precision_at_k(results, k):
    """Calculates precision at k for the given results."""
    if not results or k <= 0:
        return 0.0
    
    relevant_count = sum(1 for res in results[:k] if res['retrieval_score'] > 0.5)  # threshold for relevance
    return relevant_count / k

In [None]:
# Flatten the chunks from the grouped results
def precision_at_k_no_rerank(results, k):
	"""Calculates precision at k for the given results (no rerank)."""
	if not results or k <= 0:
		return 0.0
	relevant_count = sum(1 for res in results[:k] if res['score'] > 0.5)  
	return relevant_count / k

pk_no_rerank = precision_at_k_no_rerank(top_results, k=10)
print(f"Precision at k=10: {pk_no_rerank:.2f}")

Precision at k=10: 1.00


In [None]:
#recall@k
def recall_at_k_no_rerank(results, k):
    """Calculates recall at k for the given results."""
    if not results or k <= 0:
        return 0.0
    
    relevant_count = sum(1 for res in results[:k] if res['score'] > 0.5)  # threshold for relevance
    total_relevant = sum(1 for res in results if res['score'] > 0.5)
    
    return relevant_count / total_relevant if total_relevant > 0 else 0.0

recall_no_rerank = recall_at_k_no_rerank(top_results, k=10)
print(f"Recall at k=10: {recall_no_rerank:.2f}")

Recall at k=10: 1.00


1.0