In [None]:
pip install PyMuPDF

In [None]:
#Load the PDF file
import fitz

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output joined with no
        separator. A document without pages yields ``""``.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises; nothing is caught
        here.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the underlying file is never left open.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

# Extract the text of the source PDF once; `text` is what the chunking cell splits.
pdf_path = "ind_rev.pdf"
text = extract_text_from_pdf(pdf_path)
# Show only the first 1000 characters as a quick look at the extracted text.
print(text[:1000])


In [None]:
#Create chunks of text
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters handed to RecursiveCharacterTextSplitter, named so they
# can be tuned in one place.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

chunks = splitter.split_text(text)
print(f"Total chunks: {len(chunks)}")
# With no chunks there is nothing to embed or search; fail here with a clear
# message rather than with a bare IndexError on chunks[0].
if not chunks:
    raise ValueError("No chunks were produced: the extracted PDF text appears to be empty.")
print(chunks[0])

In [None]:
!pip install faiss-cpu

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [None]:
# Sentence-embedding model; the same instance encodes both the chunks and the query.
model = SentenceTransformer("all-MiniLM-L6-v2")  # embedding model

In [None]:
# Encode every chunk. The retrieval cell maps search-result ids back through
# chunks[i], so it relies on row i of `embeddings` belonging to chunks[i].
embeddings = model.encode(chunks)

In [None]:
# `embeddings` is (num_chunks, embedding_dim); embedding_dim is 384 for
# "all-MiniLM-L6-v2".
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 = Euclidean distance
# FAISS expects contiguous float32 data. Cast explicitly, as the query path
# does, instead of relying on the encoder's default output dtype.
index.add(np.ascontiguousarray(embeddings, dtype="float32"))

# Question answering (Q-A)

In [None]:
# Question to answer from the indexed PDF text.
query = "What invention did James Watt create?"

# Encode the question with the same model as the chunks. It is passed as a
# one-element list; the retrieval cell reads row 0 of the search result.
query_embedding = model.encode([query])

In [None]:
top_k = 3  # number of passages to retrieve
# D holds the distances and I the matching chunk indexes, one row per query.
D, I = index.search(np.array(query_embedding).astype("float32"), top_k)

# FAISS fills unused result slots with -1 when the index holds fewer than
# top_k vectors. Skip those: chunks[-1] would silently return the last chunk.
relevant_chunks = [chunks[i] for i in I[0] if i >= 0]

In [None]:
from transformers import pipeline

In [None]:
# Question-answering pipeline. The answer cell calls it with `question` and
# `context` and reads the 'answer' and 'score' entries of the result.
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [None]:
# Run the reader over each retrieved passage and keep (answer, score) pairs,
# in the same order as relevant_chunks.
answers = [
    (prediction['answer'], prediction['score'])
    for prediction in (
        qa(question=query, context=passage) for passage in relevant_chunks
    )
]

In [None]:
# Pick the (answer, score) pair with the highest score. max() returns the
# first maximal item, so ties go to the earliest passage, as the stable
# descending sort did, without sorting the whole list.
best_answer = max(answers, key=lambda pair: pair[1], default=None)
if best_answer is None:
    # No passage was scored, so there is nothing to rank.
    print("Answer: (none found)")
else:
    print("Answer:", best_answer[0])