In [12]:
import os
import faiss
import pickle
import pdfplumber
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Ensure FAISS index folder exists
FAISS_PATH = "faiss_index"
# exist_ok=True keeps this cell safe to re-run and removes the window between
# checking for the folder and creating it. If FAISS_PATH already exists as a
# regular file, this raises FileExistsError here instead of failing later on write.
os.makedirs(FAISS_PATH, exist_ok=True)

In [None]:
# Read a PDF and return its text
def load_pdf(pdf_path):
    """Return the text of every page of ``pdf_path`` as a single string.

    Each page is read with ``extract_text()``; pages that yield nothing
    (``None`` or an empty string) are skipped. The remaining page texts are
    separated by a newline, and leading/trailing whitespace is stripped from
    the combined result.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extract while the file is still open; filter and join afterwards.
        extracted = [page.extract_text() for page in document.pages]
    return "\n".join(piece for piece in extracted if piece).strip()

In [None]:

# Chunking / embedding configuration — tune these here rather than inline.
CHUNK_SIZE = 500       # chunk_size passed to the text splitter
CHUNK_OVERLAP = 50     # chunk_overlap passed to the text splitter
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Load embedding model
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Process book PDFs and store in FAISS
def process_pdf(pdf_path, index_name="book_index.faiss", metadata_name="metadata.pkl"):
    """Chunk and embed one PDF, then persist a FAISS index and its chunk list.

    The PDF text is split with ``text_splitter``, whitespace-only chunks are
    dropped, and the remaining chunks are embedded with ``model``. The
    embeddings go into a fresh ``faiss.IndexFlatL2`` which is written to
    ``FAISS_PATH/index_name``; the chunk strings are pickled, in the same
    order as the index rows, to ``FAISS_PATH/metadata_name``.

    Args:
        pdf_path: Path of the PDF to index.
        index_name: File name of the FAISS index inside ``FAISS_PATH``.
        metadata_name: File name of the pickled chunk list inside ``FAISS_PATH``.

    Returns:
        None. If the PDF yields no non-blank chunks, an error line is printed
        and nothing is written.

    Note:
        Existing files with the same names are overwritten. Pass distinct
        ``index_name`` / ``metadata_name`` values to keep several books side
        by side.
    """
    text = load_pdf(pdf_path)
    chunks = text_splitter.split_text(text)

    # Drop whitespace-only chunks so they are not embedded
    valid_chunks = [chunk for chunk in chunks if chunk.strip()]
    if not valid_chunks:
        print(f"ERROR: No valid text found in {pdf_path}")
        return

    # Generate embeddings; cast to float32 before handing them to FAISS
    embeddings = model.encode(valid_chunks).astype(np.float32)

    # Build a flat L2 index sized to the embedding dimension
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Do not rely on another cell having created the output folder.
    os.makedirs(FAISS_PATH, exist_ok=True)

    # Save FAISS index and chunk metadata
    faiss.write_index(index, os.path.join(FAISS_PATH, index_name))

    with open(os.path.join(FAISS_PATH, metadata_name), "wb") as f:
        pickle.dump(valid_chunks, f)

    print(f"✅ Stored {len(valid_chunks)} book embeddings in FAISS.")

In [None]:
# Entry point: build the index for the book list
if __name__ == "__main__":
    # Point pdf_path at your own book file
    process_pdf(pdf_path="booklist.pdf")

✅ Stored 329 book embeddings in FAISS.
