In [32]:
import re

# Load the book's text
with open('data/TKMBFullBook.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [33]:
chapters = re.split(r'Chapter \d+', text)
if chapters[0].strip() == "":
    chapters.pop(0) # Remove the empty string before the first chapter

In [44]:
WINDOW = 256 # Number of words in each chunk # 200
STRIDE = 60 # Number of words to shift for the next chunk # 40

def chapter_to_chunks(chapter_text):
    # Simple word splitting (preserving original text for transformer tokenizer)
    words = chapter_text.split()
    chunks = []
    
    for start in range(0, len(words), STRIDE):
        end = start + WINDOW
        if end > len(words):
            # For the last chunk, take all remaining words
            chunk_words = words[start:]
        else:
            chunk_words = words[start:end]
        
        # Join the words back into text with single spaces
        chunk_text = " ".join(chunk_words)
        chunks.append(chunk_text)
    
    return chunks

all_chunks = []
for chap_num, chap_text in enumerate(chapters, start=1):
    for i, chunk in enumerate(chapter_to_chunks(chap_text), start=1):
        all_chunks.append({
            "chapter": chap_num,
            "chunk_id": f"{chap_num}_{i}",
            "text": chunk
        })
        
print(f"Total chunks created: {len(all_chunks)}")
print(f"Example chunk: {all_chunks[50]}")

Total chunks created: 1673
Example chunk: {'chapter': 1, 'chunk_id': '1_51', 'text': 'the Radleys: when Jem would question him Atticus’s only answer was for him to mind his own business and let the Radleys mind theirs, they had a right to; but when it happened Jem said Atticus shook his head and said, “Mm, mm, mm.” So Jem received most of his information from Miss Stephanie Crawford, a neighborhood scold, who said she knew the whole thing. According to Miss Stephanie, Boo was sitting in the livingroom cutting some items from The Maycomb Tribune to paste in his scrapbook. His father entered the room. As Mr. Radley passed by, Boo drove the scissors into his parent’s leg, pulled them out, wiped them on his pants, and resumed his activities. Mrs. Radley ran screaming into the street that Arthur was killing them all, but when the sheriff arrived he found Boo still sitting in the livingroom, cutting up the Tribune. He was thirty-three years old then. Miss Stephanie said old Mr. Radley said n

In [22]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

W0804 23:49:31.795000 56288 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [45]:
texts = [chunk['text'] for chunk in all_chunks]
embeddings = model.encode(texts, convert_to_tensor=False, show_progress_bar=True, batch_size=32)

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [46]:
embeddings[1]  # Example to show the embedding of the second chunk

array([ 1.29837347e-02,  2.70487983e-02,  1.01330662e-02, -2.32675783e-02,
       -2.85031646e-02, -1.84757262e-03,  3.32422159e-03, -6.01326972e-02,
       -7.65829757e-02,  8.46346691e-02, -3.42197642e-02,  6.32861350e-03,
        1.23412125e-02, -3.87306400e-02, -5.35401441e-02,  2.31998065e-03,
       -2.53848527e-02, -3.59485149e-02, -6.17510676e-02,  6.63554892e-02,
       -2.08959579e-02,  5.19076809e-02,  1.17372595e-01,  4.66499142e-02,
       -5.53927906e-02,  2.19413899e-02,  2.28139944e-02,  3.22447009e-02,
       -2.49795243e-02, -8.12080055e-02, -2.15766812e-03, -3.51343192e-02,
       -1.66762020e-05, -3.63159776e-02, -6.79372251e-02,  5.38275484e-03,
        9.28241611e-02,  2.17067283e-02,  2.98103094e-02, -8.76560062e-02,
        3.24136429e-02,  5.97578194e-03,  2.91927345e-03,  6.26275782e-03,
       -9.53449085e-02,  3.64794512e-03,  2.27572955e-02,  1.36787798e-02,
        3.97166088e-02, -1.55850304e-02,  2.19276790e-02,  4.73944694e-02,
        2.57588085e-02, -

In [47]:
embeddings.shape

(1673, 384)

Indexing the data with FIASS

In [48]:
import faiss
import numpy as np

# Convert to float32 (FAISS requires it)
embedding_matrix = np.array(embeddings).astype('float32')

# Build index (L2 or cosine - we'll use cosine)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product = Cosine similarity (after normalization)

# Normalize for cosine similarity
faiss.normalize_L2(embedding_matrix)

# Add vectors
index.add(embedding_matrix)


In [49]:
def search_faiss(query, top_k=5):
    # Encode and normalize query
    query_vec = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_vec)

    # Search index
    scores, indices = index.search(query_vec, top_k)

    # Retrieve the corresponding chunks
    results = []
    for i, score in zip(indices[0], scores[0]):
        result = {
            "chunk_id": all_chunks[i]["chunk_id"],
            "chapter": all_chunks[i]["chapter"],
            "text": all_chunks[i]["text"],
            "score": float(score)
        }
        results.append(result)
    
    return results


LLM to Query

In [29]:
import openai
from openai import OpenAI
import os

# Set up OpenAI API key
#openai.api_key = os.getenv("OPENAI_API_KEY")  # Ensure you have set this environment variable

client = OpenAI(api_key = os.environ.get("OPENAI_API_KEY"))

#generate text
def generate_text(prompt: str) -> str:
    """Generates text using the OpenAI API based on the provided prompt."""
    
    # Make the API call
    response = client.chat.completions.create(
        model="gpt-4.1-mini",  # A powerful and cost-effective choice
        messages=[{"role": "system", "content": "You are a creative short story writer who writes stories in a specified book style and answer questions based on it."},
                  {"role": "user", "content": prompt}],
        temperature=0.7,  # Adjust temperature for creativity
    )
    
    return response.choices[0].message.content.strip()

In [30]:
generated_text = generate_text("In 200 words, write a short story in the style of 'To Kill a Mockingbird' about a summer in manchester, UK.")
print(generated_text)

That summer in Manchester was unlike any other. The rain, usually relentless, took a brief pause, letting the sun cast a hesitant glow over the red-brick terraces and cobbled streets. I was ten, the age when the world feels vast but the heart still small enough to be scared of shadows.

My brother, Jamie, and I spent afternoons beneath the ancient chestnut tree in our garden, pretending we were explorers charting unknown lands. Mum called it “our little jungle,” and Dad, ever the storyteller, spun tales of cotton mills and workers’ struggles, threading history into our play.

One day, a new boy arrived in the neighborhood — a quiet lad named Sam with eyes like the grey skies above. He carried a battered football and a secret smile. We invited him under the chestnut tree, where stories and laughter blossomed like the spring blooms we’d missed.

But summer in Manchester isn’t just sunshine and friendship. The factories still hummed, the streets still whispered of hardship. Yet, amidst it

In [50]:
query = generated_text
top_results = search_faiss(query, top_k=5)

for res in top_results:
    print(f"Chapter {res['chapter']} | Chunk {res['chunk_id']} | Score: {res['score']:.3f}")
    print(res['text'])
    print("---")


Chapter 31 | Chunk 31_15 | Score: 0.596
his hands on his hips. Summertime, and his children played in the front yard with their friend, enacting a strange little drama of their own invention. It was fall, and his children fought on the sidewalk in front of Mrs. Dubose’s. The boy helped his sister to her feet, and they made their way home. Fall, and his children trotted to and fro around the corner, the day’s woes and triumphs on their faces. They stopped at an oak tree, delighted, puzzled, apprehensive. Winter, and his children shivered at the front gate, silhouetted against a blazing house. Winter, and a man walked into the street, dropped his glasses, and shot a dog. Summer, and he watched his children’s heart break. Autumn again, and Boo’s children needed him. Atticus was right. One time he said you never really know a man until you stand in his shoes and walk around in them. Just standing on the Radley porch was enough. The street lights were fuzzy from the fine rain that was falli

Reranker

In [51]:
from sentence_transformers import CrossEncoder
_reranker = None

def get_reranker():
    global _reranker
    if _reranker is None:
        _reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return _reranker

In [60]:
# FAISS -> Cross-Encoder re-ranking
# topN: how many candidates to pull from FAISS before re-ranking
# final_topk: how many to return after re-ranking

from collections import defaultdict

def search_faiss_rerank(query, topN=100, final_topk=10, group_by_chapter=False, per_chapter=3, batch_size=32):
    # 1) Retrieve candidates with FAISS
    import faiss
    q = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q)
    scores, idxs = index.search(q, topN)

    candidates = []
    for j, i in enumerate(idxs[0]):
        candidates.append({
            "chunk_id": all_chunks[i]["chunk_id"],
            "chapter": all_chunks[i]["chapter"],
            "text": all_chunks[i]["text"],
            "retrieval_score": float(scores[0][j]),
            "_idx": int(i)
        })

    if not candidates:
        return []
      # 2) Re-rank with cross-encoder
    ce = get_reranker()
    pairs = [(query, c["text"]) for c in candidates]
    ce_scores = ce.predict(pairs, batch_size=batch_size)
    for c, s in zip(candidates, ce_scores):
        c["ce_score"] = float(s)

    # 3) Sort by CE score and slice
    candidates.sort(key=lambda x: x["ce_score"], reverse=True)
    top = candidates[:final_topk]

    if not group_by_chapter:
        return top

    # 4) Optional: chapter-aware grouping
    buckets = defaultdict(list)
    for c in top:
        buckets[c["chapter"]].append(c)

    cards = []
    for chap, items in buckets.items():
        items.sort(key=lambda x: x["ce_score"], reverse=True)
        chapter_score = max(i["ce_score"] for i in items)
        cards.append({
            "chapter": chap,
            "chapter_score": chapter_score,
            "chunks": items[:per_chapter]
        })

    cards.sort(key=lambda x: x["chapter_score"], reverse=True)
    return cards


In [61]:
search_faiss_rerank = search_faiss_rerank(generated_text, topN=100, final_topk=10, group_by_chapter=True, per_chapter=3, batch_size=32)

In [62]:
search_faiss_rerank

[{'chapter': 21,
  'chapter_score': -4.84617805480957,
  'chunks': [{'chunk_id': '21_24',
    'chapter': 21,
    'text': 'the idea of asking everyone below to concentrate on setting Tom Robinson free, but thought if they were as tired as I, it wouldn’t work. Dill was sound asleep, his head on Jem’s shoulder, and Jem was quiet. “Ain’t it a long time?” I asked him. “Sure is, Scout,” he said happily. “Well, from the way you put it, it’d just take five minutes.” Jem raised his eyebrows. “There are things you don’t understand,” he said, and I was too weary to argue. But I must have been reasonably awake, or I would not have received the impression that was creeping into me. It was not unlike one I had last winter, and I shivered, though the night was hot. The feeling grew until the atmosphere in the courtroom was exactly the same as a cold February morning, when the mockingbirds were still, and the carpenters had stopped hammering on Miss Maudie’s new house, and every wood door in the neigh

Evaluation