In [4]:
import re
import networkx as nx
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize models for sentence and word filtering
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
word_model_name = 'bert-base-uncased'  # Choose a smaller BERT model for word filtering
word_tokenizer = AutoTokenizer.from_pretrained(word_model_name)
word_model = AutoModel.from_pretrained(word_model_name)

def load_book(file_path):
    """
    Load the book from a .txt file and extract text by page.
    """
    book = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        pages = content.split('--- Página ')
        for page in pages:
            if page.strip():
                match = re.match(r'(\d+)', page)
                if match:
                    page_number = int(match.group(1))
                    page_text = page[len(match.group(0)):].strip()
                    book[page_number] = page_text
    return book

def sentence_level_filtering(sentences, question):
    """
    Filters sentences based on their relevance to the question using PageRank.
    """
    sentence_embeddings = sentence_model.encode(sentences, convert_to_tensor=True)
    question_embedding = sentence_model.encode(question, convert_to_tensor=True)
    
    # Compute pairwise similarity between sentences and the question
    similarities = util.pytorch_cos_sim(sentence_embeddings, question_embedding).squeeze(1).tolist()
    
    # Construct a graph where nodes are sentences and edges are similarities
    G = nx.Graph()
    for i, sim in enumerate(similarities):
        G.add_node(i, weight=sim)
        for j in range(i + 1, len(sentences)):
            sim_ij = util.pytorch_cos_sim(sentence_embeddings[i], sentence_embeddings[j]).item()
            if sim_ij > 0.3:
                G.add_edge(i, j, weight=sim_ij)
    
    # Apply PageRank
    pagerank_scores = nx.pagerank(G, weight='weight')
    
    # Rank sentences by PageRank scores
    ranked_sentences = sorted([(i, pagerank_scores[i]) for i in pagerank_scores], key=lambda x: x[1], reverse=True)
    return [sentences[i] for i, _ in ranked_sentences]

def word_level_filtering(sentence, question):
    """
    Filters words within a sentence based on their relevance to the question using attention and PageRank.
    """
    inputs = word_tokenizer(sentence, question, return_tensors='pt', truncation=True)
    outputs = word_model(**inputs, output_attentions=True)
    
    # Extract attention weights
    attention = torch.mean(torch.stack(outputs.attentions), dim=1)  # Average across layers
    
    # Compute PageRank on word graph
    num_words = attention.size(-1)
    adj_matrix = attention[0, :, :, :].mean(dim=0).detach().numpy()
    G = nx.DiGraph()
    for i in range(num_words):
        for j in range(num_words):
            if adj_matrix[i, j] > 0.01:
                G.add_edge(i, j, weight=adj_matrix[i, j])
    pagerank_scores = nx.pagerank(G, weight='weight')
    
    token_ids = inputs['input_ids'][0].numpy()
    tokens = word_tokenizer.convert_ids_to_tokens(token_ids)
    ranked_words = sorted([(tokens[i], pagerank_scores[i]) for i in range(len(tokens))], key=lambda x: x[1], reverse=True)
    return [word for word, score in ranked_words if word not in ['[CLS]', '[SEP]']]

def find_best_pages(question, book, top_n=3):
    """
    Matches the question to the most relevant book pages using sentence and word filtering.
    """
    page_scores = []
    
    for page_number, page_text in book.items():
        # Step 1: Split page into sentences
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', page_text)
        
        # Step 2: Apply sentence-level filtering
        relevant_sentences = sentence_level_filtering(sentences, question)
        
        # Step 3: Apply word-level filtering
        total_score = 0
        for sentence in relevant_sentences[:3]:  # Limit to top 3 sentences per page
            filtered_words = word_level_filtering(sentence, question)
            total_score += sum([len(word) for word in filtered_words])  # Simple scoring
        
        page_scores.append((page_number, total_score))
    
    # Rank pages by their total score
    ranked_pages = sorted(page_scores, key=lambda x: x[1], reverse=True)
    return ranked_pages[:top_n]

# Example Usage
file_path = 'transcricao_livro_ajustada.txt'
book = load_book(file_path)

question = "What are the principles of the Liberal State and how does it differ from absolutism?"
top_pages = find_best_pages(question, book, top_n=5)

print("Most relevant pages for the question:")
for page_number, score in top_pages:
    print(f"Page {page_number}: Score {score}")


Most relevant pages for the question:
Page 169: Score 1991
Page 120: Score 1855
Page 172: Score 1723
Page 82: Score 1634
Page 165: Score 1632
