In [1]:
import re
import networkx as nx
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
import torch
from nltk.corpus import stopwords, wordnet


pt_stp_words = stopwords.words('portuguese')

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in pt_stp_words])

# Initialize models for sentence and word filtering
sentence_model = SentenceTransformer('all-MiniLM-L6-v2') # paraphrase-multilingual-MiniLM-L12-v2
word_model_name = 'bert-base-uncased'  # Choose a smaller BERT model for word filtering
word_tokenizer = AutoTokenizer.from_pretrained(word_model_name)
word_model = AutoModel.from_pretrained(word_model_name)

def load_book(file_path):
    """
    Load the book from a .txt file and extract text by page.
    """
    book = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        pages = content.split('--- Página ')
        for page in pages:
            if page.strip():
                match = re.match(r'(\d+)', page)
                if match:
                    page_number = int(match.group(1))
                    page_text = page[len(match.group(0)):].strip()
                    book[page_number] = page_text
    return book

def extract_keywords(question):
    """
    Extracts main keywords from the question using synonyms and related terms.
    """
    question_keywords = set(question.lower().split())
    synonyms = set()
    for word in question_keywords:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().lower())
    return question_keywords.union(synonyms)

def sentence_level_filtering(sentences, question):
    """
    Filters sentences based on their relevance to the question using cosine similarity.
    """
    sentence_embeddings = sentence_model.encode(sentences, convert_to_tensor=True)
    question_embedding = sentence_model.encode(question, convert_to_tensor=True)
    
    # Compute relevance scores using cosine similarity
    relevance_scores = util.pytorch_cos_sim(sentence_embeddings, question_embedding).squeeze(1).tolist()
    
    # Dynamic thresholding based on variability in similarity scores
    threshold = max(0.3, 0.5 * (max(relevance_scores) - min(relevance_scores)))
    
    # Rank sentences by relevance
    ranked_sentences = sorted(
        [(sentences[i], relevance_scores[i]) for i in range(len(sentences)) if relevance_scores[i] >= threshold],
        key=lambda x: x[1],
        reverse=True
    )
    return ranked_sentences

def word_level_filtering(sentence, question):
    """
    Filters words within a sentence based on their relevance to the question using attention and PageRank.
    """
    inputs = word_tokenizer(sentence, question, return_tensors='pt', truncation=True)
    outputs = word_model(**inputs, output_attentions=True)
    
    # Extract attention weights
    attention = torch.mean(torch.stack(outputs.attentions), dim=1)  # Average across layers
    
    # Compute PageRank on word graph
    num_words = attention.size(-1)
    adj_matrix = attention[0, :, :, :].mean(dim=0).detach().numpy()
    G = nx.DiGraph()
    for i in range(num_words):
        for j in range(num_words):
            if adj_matrix[i, j] > 0.01:
                G.add_edge(i, j, weight=adj_matrix[i, j])
    pagerank_scores = nx.pagerank(G, weight='weight')
    
    token_ids = inputs['input_ids'][0].numpy()
    tokens = word_tokenizer.convert_ids_to_tokens(token_ids)
    ranked_words = sorted(
        [(tokens[i], pagerank_scores[i]) for i in range(len(tokens)) if tokens[i] not in ['[CLS]', '[SEP]']],
        key=lambda x: x[1],
        reverse=True
    )
    return ranked_words  # Returns tuples of (word, score)

def find_best_pages(question, book, top_n=3):
    """
    Matches the question to the most relevant book pages using enhanced filtering and scoring.
    """
    question_keywords = extract_keywords(question)  # Extract main keywords
    page_scores = []

    for page_number, page_text in book.items():
        # Step 1: Split page into sentences
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', page_text)
        
        # Step 2: Apply sentence-level filtering
        ranked_sentences = sentence_level_filtering(sentences, question)
        top_sentences = ranked_sentences[:3]  # Limit to top 3 sentences for relevance
        
        # Step 3: Apply word-level filtering
        total_score = 0
        for sentence, sent_score in top_sentences:
            ranked_words = word_level_filtering(sentence, question)
            keyword_overlap = sum(1 for word, _ in ranked_words if word in question_keywords)
            word_score = sum(score for _, score in ranked_words[:5]) + keyword_overlap  # Boost with keyword overlap
            total_score += sent_score * word_score  # Combine sentence and word scores
        
        # Normalize by the number of sentences to prevent bias
        normalized_score = total_score / len(sentences) if sentences else 0
        page_scores.append((page_number, normalized_score))
    
    # Rank pages by their total score
    ranked_pages = sorted(page_scores, key=lambda x: x[1], reverse=True)
    return ranked_pages[:top_n]



# Example Usage
file_path = 'transcricao_livro_ajustada.txt'
book = load_book(file_path)

question = 'A palavra “feudalismo” carrega consigo vários sentidos. Dentre eles, podem-se apontar aqueles ligados a: sociedades marcadas por dependências mútuas e assimétricas entre senhores e vassalos. relações de parentesco determinadas pelo local de nascimento, sobretudo quando urbano. regimes inteiramente dominados pela fé religiosa, seja ela cristã ou muçulmana. altas concentrações fundiárias e capitalistas. formas de economias de subsistência pré-agrícolas.'
question = remove_stopwords(question)

top_pages = find_best_pages(question, book, top_n=5)

print("Most relevant pages for the question:")
for page_number, score in top_pages:
    print(f"Page {page_number}: Score {score}")




Most relevant pages for the question:
Page 71: Score 0.6033105656189951
Page 120: Score 0.4943276142988639
Page 135: Score 0.4682799600187311
Page 92: Score 0.4641684977996952
Page 145: Score 0.41812834728695436
