In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.corpus import stopwords

# The stop-word lists live in NLTK's 'stopwords' corpus, which is a separate
# download from 'punkt'. Fetch it first so that a fresh environment does not
# fail with LookupError on the lookup below.
nltk.download('stopwords', quiet=True)
pt_stp_words = stopwords.words('portuguese')

def remove_stopwords(text):
    """
    Drop Portuguese stop words from a whitespace-delimited text.

    The text is split on whitespace and each token is lower-cased before it
    is compared with the module-level ``pt_stp_words``. Attached punctuation
    is part of the token being compared, so a stop word followed directly by
    a comma or period is kept. Surviving tokens keep their original casing
    and are re-joined with single spaces.
    """
    # Build a set once per call: hash lookups instead of scanning the whole
    # stop-word list for every token.
    stop_set = set(pt_stp_words)
    return ' '.join(word for word in text.split() if word.lower() not in stop_set)

# Initialize the SentenceTransformer model shared by calculate_similarity.
# NOTE(review): the checkpoint name ends in "dot", which suggests it was tuned
# for dot-product scoring, while this file scores with util.cos_sim - confirm
# that cosine similarity is the intended metric.
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')  # More QA-focused model

# sent_tokenize (used by decompose_query / decompose_page) needs the Punkt data.
nltk.download('punkt')  # Ensure NLTK tokenizers are downloaded

def load_book(file_path):
    """
    Load the book from a .txt file and extract text by page.

    The file is split on the marker ``--- Página `` and each resulting chunk
    is expected to start with the page number; the rest of the chunk is that
    page's text.

    Args:
        file_path: Path to a UTF-8 encoded transcription file.

    Returns:
        dict mapping page number (int) to the stripped page text (str).
        Chunks that do not begin with digits (for example any preamble before
        the first marker) are skipped. If a page number occurs more than
        once, the last occurrence wins.
    """
    # Imported locally: the file's import section does not bring in ``re``,
    # so the call to re.match below would otherwise raise NameError.
    import re

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    book = {}
    for page in content.split('--- Página '):
        match = re.match(r'(\d+)', page)
        if match is None:
            # Empty chunk or text that precedes the first page marker.
            continue
        # Everything after the leading digits is the page body.
        book[int(match.group(1))] = page[match.end():].strip()
    return book

def decompose_query(query, language='portuguese'):
    """
    Decompose a query into subqueries, one per sentence.

    Args:
        query: Query text.
        language: Name of the Punkt model handed to NLTK. Defaults to
            'portuguese' because the queries in this file are Portuguese;
            sent_tokenize on its own falls back to the English model.

    Returns:
        list of sentence strings in their original order.
    """
    return sent_tokenize(query, language=language)

def decompose_page(page_text, language='portuguese'):
    """
    Decompose a page into propositions (sentences) using the NLTK sentence tokenizer.

    Args:
        page_text: Full text of one page.
        language: Name of the Punkt model handed to NLTK. Defaults to
            'portuguese' because the book's page markers and the stop-word
            list in this file are Portuguese; sent_tokenize on its own falls
            back to the English model.

    Returns:
        list of sentence strings in their original order.
    """
    return sent_tokenize(page_text, language=language)

def extract_keywords(text, top_k=5):
    """
    Extract the top keywords from a given text using CountVectorizer.

    Args:
        text: Text to analyse.
        top_k: Maximum number of keywords to keep (passed as max_features,
            i.e. the most frequent terms are retained).

    Returns:
        set of keyword strings; empty when the text yields no usable term
        (for example an empty page or one made only of stop words).
    """
    # The texts handled here are Portuguese, so filter with the Portuguese
    # stop-word list. With stop_words='english', Portuguese function words
    # were not filtered out and could take up the top-k slots.
    vectorizer = CountVectorizer(max_features=top_k, stop_words=list(pt_stp_words))
    try:
        # Only the learned vocabulary is needed, not the count matrix.
        vectorizer.fit([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary ...") when nothing
        # survives tokenisation and stop-word removal.
        return set()
    return set(vectorizer.get_feature_names_out())

def calculate_similarity(query, page_propositions):
    """
    Compute similarity scores for sq-d, sq-p, and ss-p.

    Args:
        query: Query text.
        page_propositions: list of proposition (sentence) strings of one page.

    Returns:
        Tuple ``(sq_d, sq_p, ss_p)`` of floats:
          * sq_d - mean cosine similarity between the whole query and the
            page's propositions;
          * sq_p - highest cosine similarity between the whole query and any
            single proposition;
          * ss_p - for each sentence of the query, the similarity of its best
            matching proposition, averaged over the query's sentences.
        All three are 0.0 when there are no propositions; ss_p is 0.0 when
        the query yields no sentences.
    """
    if len(page_propositions) == 0:
        # Nothing to compare against; max() over an empty tensor would raise.
        return 0.0, 0.0, 0.0

    query_embedding = model.encode(query, convert_to_tensor=True)
    proposition_embeddings = model.encode(page_propositions, convert_to_tensor=True)

    # Query-vs-proposition similarities, computed once and reused for both
    # the mean (sq-d) and the max (sq-p).
    query_sims = util.cos_sim(query_embedding, proposition_embeddings)
    sq_d = query_sims.mean().item()
    sq_p = query_sims.max().item()

    # ss-p: Subquery-to-Proposition Similarity
    subqueries = decompose_query(query)
    if subqueries:
        subquery_embeddings = model.encode(subqueries, convert_to_tensor=True)
        # One row per subquery, one column per proposition: take the best
        # proposition of every subquery, then average across subqueries.
        subquery_sims = util.cos_sim(subquery_embeddings, proposition_embeddings)
        ss_p = subquery_sims.max(dim=1).values.mean().item()
    else:
        # Avoid dividing by zero when the query contains no sentences.
        ss_p = 0.0

    return sq_d, sq_p, ss_p

def reciprocal_rank_fusion(scores, weights=(1, 2, 1)):
    """
    Combine several scores into one value as a weighted sum.

    Each score is multiplied by the weight at the same position and the
    products are added up. Pairing stops at the shorter of the two
    sequences, so surplus scores or weights are ignored. Despite the
    function's name, no ranks or reciprocals are involved.
    """
    total = 0
    for weight, score in zip(weights, scores):
        total = total + weight * score
    return total

def find_best_pages(query, book, top_n=3, verbose=True):
    """
    Match a query to the most relevant book pages using refined multi-granularity approach.

    Args:
        query: Query text.
        book: dict mapping page number to page text.
        top_n: Number of best-scoring pages to return.
        verbose: When True (the default), print the
            intermediate scores of every page.

    Returns:
        list of ``(page_number, fused_score)`` tuples sorted by descending
        score, at most ``top_n`` long. Pages that contain no sentences are
        skipped and never appear in the result.
    """
    def squash(score):
        # Logistic function: maps any real score into (0, 1). For cosine
        # inputs in [-1, 1] the output stays within roughly 0.27-0.73.
        return np.exp(score) / (1 + np.exp(score))

    page_scores = []
    query_keywords = extract_keywords(query)

    for page_number, page_text in book.items():
        # Decompose the page into propositions
        propositions = decompose_page(page_text)
        if not propositions:
            # An empty page has nothing to embed or compare; skip it instead
            # of failing inside the similarity computation.
            continue

        # Calculate similarities
        sq_d, sq_p, ss_p = calculate_similarity(query, propositions)

        # Jaccard overlap between query keywords and page keywords; defined
        # as 0.0 when both keyword sets are empty.
        page_keywords = extract_keywords(page_text)
        all_keywords = query_keywords | page_keywords
        if all_keywords:
            keyword_overlap = len(query_keywords & page_keywords) / len(all_keywords)
        else:
            keyword_overlap = 0.0

        # Squash the three similarity scores into (0, 1)
        sq_d, sq_p, ss_p = squash(sq_d), squash(sq_p), squash(ss_p)

        # Debugging: Print intermediate results
        if verbose:
            print(f"Page {page_number} - sq-d: {sq_d}, sq-p: {sq_p}, ss-p: {ss_p}, keyword_overlap: {keyword_overlap}")

        # Weighted sum of the three scores (sq-p counts double) plus the
        # keyword overlap.
        fused_score = reciprocal_rank_fusion([sq_d, sq_p, ss_p], weights=(1, 2, 1)) + keyword_overlap
        page_scores.append((page_number, fused_score))

    # Rank pages by fused scores, best first
    ranked_pages = sorted(page_scores, key=lambda x: x[1], reverse=True)
    return ranked_pages[:top_n]

# Example Usage
# Load the page-indexed transcription of the book.
file_path = 'transcricao_livro_ajustada.txt'
book = load_book(file_path)

# Question text followed by its answer options, all in one string.
query = 'A palavra “feudalismo” carrega consigo vários sentidos. Dentre eles, podem-se apontar aqueles ligados a: sociedades marcadas por dependências mútuas e assimétricas entre senhores e vassalos. relações de parentesco determinadas pelo local de nascimento, sobretudo quando urbano. regimes inteiramente dominados pela fé religiosa, seja ela cristã ou muçulmana. altas concentrações fundiárias e capitalistas. formas de economias de subsistência pré-agrícolas.'
# Strip Portuguese stop words before the query is scored against the pages.
query = remove_stopwords(query)
# Keep the five highest-scoring pages.
top_pages = find_best_pages(query, book, top_n=5)

# Report page numbers with their fused scores, best first.
print("Most relevant pages for the query:")
for page_number, score in top_pages:
    print(f"Page {page_number}: Score {score}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joaom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Page 12 - sq-d: 0.5925489258934812, sq-p: 0.6125627291925344, ss-p: 0.6374835794655385, keyword_overlap: 0.0
Page 13 - sq-d: 0.597673983113273, sq-p: 0.6283038218893771, ss-p: 0.6541920012232846, keyword_overlap: 0.0
Page 14 - sq-d: 0.6069786706988811, sq-p: 0.6310909807623433, ss-p: 0.6449521980069758, keyword_overlap: 0.0
Page 15 - sq-d: 0.6137585547955834, sq-p: 0.6538719347253266, ss-p: 0.6488541757854587, keyword_overlap: 0.0
Page 16 - sq-d: 0.6059624167598885, sq-p: 0.6467310381929153, ss-p: 0.6437649143993152, keyword_overlap: 0.0
Page 17 - sq-d: 0.6006709835955982, sq-p: 0.6246993652272416, ss-p: 0.6493226194482314, keyword_overlap: 0.0
Page 18 - sq-d: 0.6051396760798324, sq-p: 0.635089248921791, ss-p: 0.6389852942895381, keyword_overlap: 0.0
Page 19 - sq-d: 0.6102132637385004, sq-p: 0.6370031750233175, ss-p: 0.64282773145487, keyword_overlap: 0.0
Page 20 - sq-d: 0.6021573914499254, sq-p: 0.6259306427412362, ss-p: 0.643562522599198, keyword_overlap: 0.0
Page 21 - sq-d: 0.605119

# The Cell Below Is The Same As The One Above But With A Different Question

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.corpus import stopwords

# The stop-word lists live in NLTK's 'stopwords' corpus, which is a separate
# download from 'punkt'. Fetch it first so that a fresh environment does not
# fail with LookupError on the lookup below.
nltk.download('stopwords', quiet=True)
pt_stp_words = stopwords.words('portuguese')

def remove_stopwords(text):
    """
    Drop Portuguese stop words from a whitespace-delimited text.

    The text is split on whitespace and each token is lower-cased before it
    is compared with the module-level ``pt_stp_words``. Attached punctuation
    is part of the token being compared, so a stop word followed directly by
    a comma or period is kept. Surviving tokens keep their original casing
    and are re-joined with single spaces.
    """
    # Build a set once per call: hash lookups instead of scanning the whole
    # stop-word list for every token.
    stop_set = set(pt_stp_words)
    return ' '.join(word for word in text.split() if word.lower() not in stop_set)

# Initialize the SentenceTransformer model shared by calculate_similarity.
# NOTE(review): the checkpoint name ends in "dot", which suggests it was tuned
# for dot-product scoring, while this file scores with util.cos_sim - confirm
# that cosine similarity is the intended metric.
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')  # More QA-focused model

# sent_tokenize (used by decompose_query / decompose_page) needs the Punkt data.
nltk.download('punkt')  # Ensure NLTK tokenizers are downloaded

def load_book(file_path):
    """
    Load the book from a .txt file and extract text by page.

    The file is split on the marker ``--- Página `` and each resulting chunk
    is expected to start with the page number; the rest of the chunk is that
    page's text.

    Args:
        file_path: Path to a UTF-8 encoded transcription file.

    Returns:
        dict mapping page number (int) to the stripped page text (str).
        Chunks that do not begin with digits (for example any preamble before
        the first marker) are skipped. If a page number occurs more than
        once, the last occurrence wins.
    """
    # Imported locally: the file's import section does not bring in ``re``,
    # so the call to re.match below would otherwise raise NameError.
    import re

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    book = {}
    for page in content.split('--- Página '):
        match = re.match(r'(\d+)', page)
        if match is None:
            # Empty chunk or text that precedes the first page marker.
            continue
        # Everything after the leading digits is the page body.
        book[int(match.group(1))] = page[match.end():].strip()
    return book

def decompose_query(query, language='portuguese'):
    """
    Decompose a query into subqueries, one per sentence.

    Args:
        query: Query text.
        language: Name of the Punkt model handed to NLTK. Defaults to
            'portuguese' because the queries in this file are Portuguese;
            sent_tokenize on its own falls back to the English model.

    Returns:
        list of sentence strings in their original order.
    """
    return sent_tokenize(query, language=language)

def decompose_page(page_text, language='portuguese'):
    """
    Decompose a page into propositions (sentences) using the NLTK sentence tokenizer.

    Args:
        page_text: Full text of one page.
        language: Name of the Punkt model handed to NLTK. Defaults to
            'portuguese' because the book's page markers and the stop-word
            list in this file are Portuguese; sent_tokenize on its own falls
            back to the English model.

    Returns:
        list of sentence strings in their original order.
    """
    return sent_tokenize(page_text, language=language)

def extract_keywords(text, top_k=5):
    """
    Extract the top keywords from a given text using CountVectorizer.

    Args:
        text: Text to analyse.
        top_k: Maximum number of keywords to keep (passed as max_features,
            i.e. the most frequent terms are retained).

    Returns:
        set of keyword strings; empty when the text yields no usable term
        (for example an empty page or one made only of stop words).
    """
    # The texts handled here are Portuguese, so filter with the Portuguese
    # stop-word list. With stop_words='english', Portuguese function words
    # were not filtered out and could take up the top-k slots.
    vectorizer = CountVectorizer(max_features=top_k, stop_words=list(pt_stp_words))
    try:
        # Only the learned vocabulary is needed, not the count matrix.
        vectorizer.fit([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary ...") when nothing
        # survives tokenisation and stop-word removal.
        return set()
    return set(vectorizer.get_feature_names_out())

def calculate_similarity(query, page_propositions):
    """
    Compute similarity scores for sq-d, sq-p, and ss-p.

    Args:
        query: Query text.
        page_propositions: list of proposition (sentence) strings of one page.

    Returns:
        Tuple ``(sq_d, sq_p, ss_p)`` of floats:
          * sq_d - mean cosine similarity between the whole query and the
            page's propositions;
          * sq_p - highest cosine similarity between the whole query and any
            single proposition;
          * ss_p - for each sentence of the query, the similarity of its best
            matching proposition, averaged over the query's sentences.
        All three are 0.0 when there are no propositions; ss_p is 0.0 when
        the query yields no sentences.
    """
    if len(page_propositions) == 0:
        # Nothing to compare against; max() over an empty tensor would raise.
        return 0.0, 0.0, 0.0

    query_embedding = model.encode(query, convert_to_tensor=True)
    proposition_embeddings = model.encode(page_propositions, convert_to_tensor=True)

    # Query-vs-proposition similarities, computed once and reused for both
    # the mean (sq-d) and the max (sq-p).
    query_sims = util.cos_sim(query_embedding, proposition_embeddings)
    sq_d = query_sims.mean().item()
    sq_p = query_sims.max().item()

    # ss-p: Subquery-to-Proposition Similarity
    subqueries = decompose_query(query)
    if subqueries:
        subquery_embeddings = model.encode(subqueries, convert_to_tensor=True)
        # One row per subquery, one column per proposition: take the best
        # proposition of every subquery, then average across subqueries.
        subquery_sims = util.cos_sim(subquery_embeddings, proposition_embeddings)
        ss_p = subquery_sims.max(dim=1).values.mean().item()
    else:
        # Avoid dividing by zero when the query contains no sentences.
        ss_p = 0.0

    return sq_d, sq_p, ss_p

def reciprocal_rank_fusion(scores, weights=(1, 2, 1)):
    """
    Combine several scores into one value as a weighted sum.

    Each score is multiplied by the weight at the same position and the
    products are added up. Pairing stops at the shorter of the two
    sequences, so surplus scores or weights are ignored. Despite the
    function's name, no ranks or reciprocals are involved.
    """
    total = 0
    for weight, score in zip(weights, scores):
        total = total + weight * score
    return total

def find_best_pages(query, book, top_n=3, verbose=True):
    """
    Match a query to the most relevant book pages using refined multi-granularity approach.

    Args:
        query: Query text.
        book: dict mapping page number to page text.
        top_n: Number of best-scoring pages to return.
        verbose: When True (the default), print the
            intermediate scores of every page.

    Returns:
        list of ``(page_number, fused_score)`` tuples sorted by descending
        score, at most ``top_n`` long. Pages that contain no sentences are
        skipped and never appear in the result.
    """
    def squash(score):
        # Logistic function: maps any real score into (0, 1). For cosine
        # inputs in [-1, 1] the output stays within roughly 0.27-0.73.
        return np.exp(score) / (1 + np.exp(score))

    page_scores = []
    query_keywords = extract_keywords(query)

    for page_number, page_text in book.items():
        # Decompose the page into propositions
        propositions = decompose_page(page_text)
        if not propositions:
            # An empty page has nothing to embed or compare; skip it instead
            # of failing inside the similarity computation.
            continue

        # Calculate similarities
        sq_d, sq_p, ss_p = calculate_similarity(query, propositions)

        # Jaccard overlap between query keywords and page keywords; defined
        # as 0.0 when both keyword sets are empty.
        page_keywords = extract_keywords(page_text)
        all_keywords = query_keywords | page_keywords
        if all_keywords:
            keyword_overlap = len(query_keywords & page_keywords) / len(all_keywords)
        else:
            keyword_overlap = 0.0

        # Squash the three similarity scores into (0, 1)
        sq_d, sq_p, ss_p = squash(sq_d), squash(sq_p), squash(ss_p)

        # Debugging: Print intermediate results
        if verbose:
            print(f"Page {page_number} - sq-d: {sq_d}, sq-p: {sq_p}, ss-p: {ss_p}, keyword_overlap: {keyword_overlap}")

        # Weighted sum of the three scores (sq-p counts double) plus the
        # keyword overlap.
        fused_score = reciprocal_rank_fusion([sq_d, sq_p, ss_p], weights=(1, 2, 1)) + keyword_overlap
        page_scores.append((page_number, fused_score))

    # Rank pages by fused scores, best first
    ranked_pages = sorted(page_scores, key=lambda x: x[1], reverse=True)
    return ranked_pages[:top_n]

# Example Usage
# Load the page-indexed transcription of the book.
file_path = 'transcricao_livro_ajustada.txt'
book = load_book(file_path)

# Question text followed by its answer options, all in one string.
query = 'Nos séculos XIV-XV, a sociedade feudal experimentou uma grave crise geral, que abalou profundamente as estruturas que sustentavam essa sociedade, abrindo espaços para a criação de relações capitalistas no interior das sociedades européias. Os efeitos da depressão dos séculos XIV-XV sobre a sociedade européia foram os seguintes, EXCETO: a expansão marítima dos séculos XV e XVI, rompendo os estreitos limites do comércio medieval. a centralização do poder nas mãos do rei, em contrapartida ao poder pulverizado dos senhores feudais. o surgimento de uma nova cultura mais urbana e laica, em oposição à rural-religiosa do feudalismo. a busca de urna nova espiritualidade, possibilitando a ruptura da unidade cristã através da Reforma. a ocupação do poder político pela burguesia, sustentada no crescente enriquecimento dessa classe.'
# Strip Portuguese stop words before the query is scored against the pages.
query = remove_stopwords(query)
# Keep the five highest-scoring pages.
top_pages = find_best_pages(query, book, top_n=5)

# Report page numbers with their fused scores, best first.
print("Most relevant pages for the query:")
for page_number, score in top_pages:
    print(f"Page {page_number}: Score {score}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joaom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Page 12 - sq-d: 0.5980846200505161, sq-p: 0.6204083960225842, ss-p: 0.6363611336379156, keyword_overlap: 0.0
Page 13 - sq-d: 0.5965231553216958, sq-p: 0.6197823984636393, ss-p: 0.6463839775065109, keyword_overlap: 0.0
Page 14 - sq-d: 0.6070097601518561, sq-p: 0.6200145087088129, ss-p: 0.6487203557458491, keyword_overlap: 0.0
Page 15 - sq-d: 0.6083061533752605, sq-p: 0.6376766991326837, ss-p: 0.6527171366065594, keyword_overlap: 0.0
Page 16 - sq-d: 0.6057143983116766, sq-p: 0.6257966981382217, ss-p: 0.6478366572267826, keyword_overlap: 0.0
Page 17 - sq-d: 0.5987676612617993, sq-p: 0.6361991556319244, ss-p: 0.6494423299102542, keyword_overlap: 0.0
Page 18 - sq-d: 0.6068863996636434, sq-p: 0.6295941046219222, ss-p: 0.6455549169917117, keyword_overlap: 0.0
Page 19 - sq-d: 0.61086540630061, sq-p: 0.6384815650023327, ss-p: 0.6453843598566036, keyword_overlap: 0.0
Page 20 - sq-d: 0.604603249902359, sq-p: 0.6294538692564109, ss-p: 0.6473391832474081, keyword_overlap: 0.0
Page 21 - sq-d: 0.6056