In [1]:
!pip install spacy





In [2]:
import fitz  # PyMuPDF
import re
import nltk
from nltk.tokenize import sent_tokenize


In [3]:
import spacy
!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

def load_pdf(filepath):
    """Extract text from a PDF and tokenize sentences using spaCy."""
    doc = fitz.open(filepath)
    text = " ".join([page.get_text("text") for page in doc])
    
    # Use spaCy for sentence tokenization
    return [sent.text for sent in nlp(text).sents]

book_sentences = load_pdf("ebookebook.pdf")
print(f"Total Sentences Extracted: {len(book_sentences)}")
print("Sample Sentences:", book_sentences[:5])


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.9 MB/s eta 0:00:07
     ---- ----------------------------------- 1.3/12.8 MB 2.5 MB/s eta 0:00:05
     ---- ----------------------------------- 1.6/12.8 MB 2.3 MB/s eta 0:00:05
     ------ --------------------------------- 2.1/12.8 MB 2.1 MB/s eta 0:00:06
     ------- -------------------------------- 2.4/12.8 MB 2.1 MB/s eta 0:00:05
     --------- ------------------------------ 2.9/12.8 MB 2.2 MB/s eta 0:00:05
     --------- ------------------------------ 2.9/12.8 MB 2.2 MB/s eta 0:00:05
     ---------- ----------------------------- 3.4/12.8 

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model

# Convert book sentences into embeddings
sentence_embeddings = model.encode(book_sentences, convert_to_numpy=True)

# Create FAISS Index for fast retrieval
index = faiss.IndexFlatL2(sentence_embeddings.shape[1])
index.add(sentence_embeddings)

# Save the index and data for later use
faiss.write_index(index, "faiss_book_index.bin")
np.save("book_sentences.npy", book_sentences)


In [5]:
def retrieve_relevant_text(question, top_k=3):
    question_embedding = model.encode([question], convert_to_numpy=True)
    _, indices = index.search(question_embedding, top_k)
    return [book_sentences[i] for i in indices[0]]

question = "What is the main theme of the book?"
retrieved_text = retrieve_relevant_text(question)
print("Retrieved Context:", retrieved_text)


Retrieved Context: ['Let the book’s ideas \nand intended readership guide your thematic choices.\n', 'What’s in it for the reader?\n', 'I’ll explain the introduction and conclusion later in the book.']


In [6]:
!pip install sentence-transformers faiss-cpu




In [7]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load a lightweight sentence embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert book sentences into vector embeddings
sentence_embeddings = embedding_model.encode(book_sentences, convert_to_numpy=True)

# Create a FAISS index for fast similarity search
index = faiss.IndexFlatL2(sentence_embeddings.shape[1])
index.add(sentence_embeddings)

# Save the FAISS index and sentences for later use
faiss.write_index(index, "faiss_book_index.bin")
np.save("book_sentences.npy", np.array(book_sentences))

print("FAISS index and sentence embeddings saved!")


FAISS index and sentence embeddings saved!


In [8]:
def retrieve_relevant_text(question, top_k=3):
    """Retrieve the most relevant sentences from the book for a given question."""
    question_embedding = embedding_model.encode([question], convert_to_numpy=True)
    _, indices = index.search(question_embedding, top_k)  # Retrieve top K closest matches
    return [book_sentences[i] for i in indices[0]]

# Test retrieval system
sample_question = "What is the main theme of the book?"
retrieved_text = retrieve_relevant_text(sample_question)
print("Retrieved Context:", retrieved_text)


Retrieved Context: ['Let the book’s ideas \nand intended readership guide your thematic choices.\n', 'What’s in it for the reader?\n', 'I’ll explain the introduction and conclusion later in the book.']


In [9]:
!pip install sentencepiece




In [10]:
!pip install --upgrade transformers sentencepiece




In [11]:
from transformers import T5Tokenizer

qa_tokenizer = T5Tokenizer.from_pretrained("t5-small")
print("T5 Tokenizer loaded successfully!")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5 Tokenizer loaded successfully!


In [12]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load the pre-trained embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the saved FAISS index and sentences
index = faiss.read_index("faiss_book_index.bin")
book_sentences = np.load("book_sentences.npy", allow_pickle=True)

def retrieve_relevant_text(question, top_k=3):
    """Retrieve the most relevant sentences from the book for a given question."""
    question_embedding = embedding_model.encode([question], convert_to_numpy=True)
    _, indices = index.search(question_embedding, top_k)  # Find closest sentences
    return [book_sentences[i] for i in indices[0]]

# Example test
sample_question = "What is the main theme of the book?"
retrieved_text = retrieve_relevant_text(sample_question)
print("Retrieved Context:", retrieved_text)


Retrieved Context: [np.str_('Let the book’s ideas \nand intended readership guide your thematic choices.\n'), np.str_('What’s in it for the reader?\n'), np.str_('I’ll explain the introduction and conclusion later in the book.')]


In [13]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pre-trained T5 model and tokenizer
qa_model = T5ForConditionalGeneration.from_pretrained("t5-small")
qa_tokenizer = T5Tokenizer.from_pretrained("t5-small")

def generate_answer(question, context):
    """Generate an answer using the T5 model based on retrieved context."""
    input_text = f"question: {question} context: {context}"
    input_ids = qa_tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = qa_model.generate(input_ids, max_length=50)
    return qa_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Test the QA model with a sample question
retrieved_context = " ".join(retrieve_relevant_text(sample_question))
answer = generate_answer(sample_question, retrieved_context)
print(f"Q: {sample_question}\nA: {answer}")


Q: What is the main theme of the book?
A: readership


In [14]:
questions = [
    "Who is the main character?",
    "What happens in the first chapter?",
    "What is the moral of the story?",
    "What did the author emphasize?",
]

for q in questions:
    context = " ".join(retrieve_relevant_text(q))
    answer = generate_answer(q, context)
    print(f"Q: {q}\nA: {answer}\n")


Q: Who is the main character?
A: St. Jacques

Q: What happens in the first chapter?
A: offer reinforcing information of value

Q: What is the moral of the story?
A: not one effort

Q: What did the author emphasize?
A: They were able to use the book to start productive conversations.”



In [15]:
qa_model.save_pretrained("qa_model")
qa_tokenizer.save_pretrained("qa_model")
print("Model saved successfully!")


Model saved successfully!
