In [7]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages in page order, joined with no separator.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        # Join once instead of growing a string with repeated ``+=``.
        return "".join(
            document.load_page(page_num).get_text()
            for page_num in range(document.page_count)
        )

# NOTE(review): absolute, machine-specific Windows path - this cell only runs
# on a machine where the file exists at exactly this location.
pdf_path =  r"D:\GitHub\Projetos\Mestrado\EnergyContext\undp-2000.pdf"
# Text of the whole PDF as a single string (result of extract_text_from_pdf).
pdf_text = extract_text_from_pdf(pdf_path)


In [8]:
import re

def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window(s) may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between neighbouring chunks.

    Returns:
        list[str]: The chunks in order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``overlap >= chunk_size``. The window would then never
            move forward and the split could not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    # Slicing past the end of the string is safe: it simply yields a shorter chunk.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Split the extracted PDF text using the function's default chunk_size and overlap.
chunks = split_text_into_chunks(pdf_text)


In [9]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Load a tokenizer and model from Hugging Face
# Both are created from the same 'bert-base-uncased' checkpoint name so the
# token ids produced by the tokenizer match what the model expects.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Convert text chunks to embeddings
def embed_text(text):
    """Embed text as the mean of the model's last-layer token vectors.

    Args:
        text: A string (or list of strings) accepted by the module-level
            ``tokenizer``. Input longer than 512 tokens is truncated.

    Returns:
        numpy.ndarray: One row per input text; the row width is the model's
        hidden size.
    """
    import torch  # local import: the notebook has no top-level torch import

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight each token by its attention mask so padding positions (present
    # when several texts of different length are embedded together) do not
    # dilute the average. For a single text every mask entry is 1, which
    # reduces to a plain mean over the sequence.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
    return embeddings

# Create a FAISS index
# Take the vector width from the loaded model instead of hard-coding it, so
# the index stays in sync if the embedding checkpoint is changed.
dimension = model.config.hidden_size
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
# Row i of the index corresponds to chunks[i]; retrieval relies on this order.
for chunk in chunks:
    # FAISS expects C-contiguous float32 rows.
    embedding = np.ascontiguousarray(embed_text(chunk), dtype=np.float32)
    index.add(embedding)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
from transformers import pipeline

# Question-answering pipeline built from the
# 'distilbert-base-uncased-distilled-squad' checkpoint. This is a separate
# model from the one used to embed the chunks.
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


In [16]:
def retrieve_relevant_chunks(question, k=5):
    """Return the text chunks whose embeddings are closest to the question.

    Args:
        question: The question text; embedded with ``embed_text``.
        k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``k`` entries of the module-level ``chunks`` list,
        in the order returned by the index search. Fewer than ``k`` are
        returned when the index holds fewer than ``k`` vectors.
    """
    question_embedding = embed_text(question)
    _, indices = index.search(question_embedding, k)
    # FAISS fills missing neighbours with label -1; without this filter
    # ``chunks[-1]`` would silently return the last chunk instead.
    relevant_chunks = [chunks[i] for i in indices[0] if i >= 0]
    return relevant_chunks

def answer_question(question):
    """Answer a question from the retrieved chunks.

    The chunks returned by ``retrieve_relevant_chunks`` are joined with single
    spaces into one context string, which is passed together with the question
    to ``qa_model``.

    Args:
        question: The question text.

    Returns:
        The ``'answer'`` entry of the ``qa_model`` result.
    """
    context = " ".join(retrieve_relevant_chunks(question))
    result = qa_model(question=question, context=context)
    return result['answer']

# Example usage
# Runs the full retrieve-then-answer flow for one question and prints the
# answer string returned by answer_question.
question = "What is Agenda 21 ?"
answer = answer_question(question)
print(f"Answer: {answer}")


Answer: The Programme for the Further Implementation
