In [30]:
!pip install faiss-cpu
!pip install pdfplumber
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)




In [31]:
import os
import faiss
import numpy as np
import torch
from transformers import pipeline, AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import pdfplumber


In [32]:
# Step 1: Load and Preprocess the Book
def load_book(filepath):
    """Read a plain-text or PDF file and return its content as one string.

    Args:
        filepath: Path to the book, as a ``str`` or ``os.PathLike``. The file
            type is chosen from the extension, compared case-insensitively:
            ``.txt`` is read as UTF-8 text, ``.pdf`` is extracted page by
            page with pdfplumber.

    Returns:
        The text of the file. For a PDF this is the text of every page, each
        followed by a newline; a page with no extractable text contributes
        only the newline.

    Raises:
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    path = os.fspath(filepath)
    # Lower-case only for the comparison; the file is opened with the
    # caller's original spelling.
    lowered = path.lower()

    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    if lowered.endswith(".pdf"):
        page_texts = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                # Crop to the page's MediaBox before extracting, as the
                # original implementation did.
                page_text = page.crop(page.mediabox).extract_text()
                # extract_text() can yield None (some pdfplumber versions)
                # for a page without text; treat that as an empty page
                # instead of failing on `None + str`.
                page_texts.append((page_text or "") + "\n")
        return "".join(page_texts)

    raise ValueError("Unsupported file format. Use .txt or .pdf")

In [33]:

# Step 2: Split Text into Chunks
def split_text(text, chunk_size=500):
    """Break ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; inside each chunk
    they are re-joined with single spaces. The final chunk may be shorter.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

In [34]:

# Step 3: Convert Text Chunks into Embeddings
# The model is created once at module level; create_embeddings() and the
# retrieval step both reuse this single instance.
embedding_model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
)


def create_embeddings(chunks):
    """Encode ``chunks`` with the module-level ``embedding_model``.

    The result of ``embedding_model.encode`` is requested as a NumPy array
    (``convert_to_numpy=True``) and returned unchanged.
    """
    chunk_vectors = embedding_model.encode(chunks, convert_to_numpy=True)
    return chunk_vectors

In [35]:
# Step 4: Store Embeddings in FAISS Index
def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing ``embeddings``.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dimension)``. It is
            converted to a C-contiguous float32 array before being added,
            since that is the layout FAISS works with.

    Returns:
        A ``faiss.IndexFlatL2`` of the embeddings' dimension holding every
        row of ``embeddings``, in order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or has zero columns (for
            example when no chunks were embedded). Previously this surfaced
            as an opaque ``IndexError`` on ``shape[1]``.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index


In [36]:
# Step 5: Retrieve Relevant Chunks
def retrieve_passage(question, chunks, index, top_k=3):
    """Return the chunks nearest to ``question``, joined into one string.

    Args:
        question: The query text; it is embedded with the module-level
            ``embedding_model``.
        chunks: The list of text chunks, in the same order in which their
            embeddings were added to ``index``.
        index: Object exposing ``search(query_vectors, k)`` and returning
            ``(distances, ids)`` — here a FAISS index.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks joined by single spaces, in the order returned
        by the index. An empty string if there is nothing to retrieve
        (``chunks`` is empty or ``top_k`` is not positive).
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with id -1, and chunks[-1] would silently inject the last chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return ""

    query_vector = embedding_model.encode([question])
    _, indices = index.search(query_vector, k)

    # Defensive filter: drop any padding (-1) or otherwise out-of-range ids.
    return " ".join(chunks[i] for i in indices[0] if 0 <= i < len(chunks))


In [37]:
# Step 6: Load a Question Answering Model
# The pipeline is created once at module level and shared by answer_question().
qa_pipeline = pipeline(task="question-answering", model="deepset/roberta-base-squad2")


def answer_question(question, context):
    """Run ``qa_pipeline`` on the question/context pair and return the
    ``"answer"`` field of its result."""
    prediction = qa_pipeline(question=question, context=context)
    return prediction["answer"]


Device set to use cuda:0


In [38]:
# Step 7: Run the Model
# End-to-end run: load the book, chunk it, embed the chunks, index them, then
# retrieve context for one question and print the extracted answer.
# The names assigned here are notebook globals; `chunks` and `faiss_index`
# are read again by the later cell that saves them to disk.
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute path — consider moving it to a
    # configurable constant so the notebook runs outside this environment.
    book_text = load_book(r"/content/Dr._Kalam-profile.pdf")
    chunks = split_text(book_text)  # uses the default chunk_size
    embeddings = create_embeddings(chunks)
    faiss_index = build_faiss_index(embeddings)

    question = "who is kalam.?"
    # Uses the default top_k of retrieve_passage; the retrieved chunks come
    # back already joined into a single context string.
    retrieved_context = retrieve_passage(question, chunks, faiss_index)
    answer = answer_question(question, retrieved_context)

    print(f"Q: {question}\nA: {answer}")

Q: who is kalam.?
A: one of the most distinguished scientists of India


In [39]:
import faiss
import pickle

# Persist the text chunks: the index only stores vectors, so the chunk list
# is needed later to map search results back to text.
with open("chunks.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)

# Persist the FAISS index built in the run cell.
faiss.write_index(faiss_index, "faiss_index.bin")


In [40]:
# Write the embedding model to the local "embedding_model" directory so it
# can be reloaded alongside the saved index and chunks.
embedding_model.save(path="embedding_model")
