## PUNTO # 4 GENSIM

Este punto tiene como objetivo replicar las funcionalidades del punto 104
anterior utilizando la librería Gensim, una herramienta especializada 105
en el procesamiento de textos y modelado de tópicos.

Librerias a Utilizar

In [25]:
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation
from gensim import corpora
from gensim import models
from gensim import similarities
import pandas as pd
import os
import re
import gensim
import numpy as np


### Rutas que apuntan a la Data

Las siguientes rutas deberán ser modificadas para acceder a los archivos de datos

In [13]:
path_docs = './data/docs-raw-texts' #path de los documentos
path_queries = './data/queries-raw-texts'
ground_truth_path = './data/relevance-judgments.tsv'  #path de lectura del ground-truth
salidaFile = "GESIM-consultas_resultados.tsv"  #path para exportación de resultados GENSIM

### Extracción del texto
Para extraer los datos hacemos uso de una expresión regular, al igual que hemos hecho en los anteriores puntos. Así traeremos el texto relevante para nuestro corpus.

In [3]:
def extraer_texto_relevante(file_content):

    """
    Funcion que extrae el texto que se utilizara de cada documento
    """

    match = re.search(r'<raw><!\[CDATA\[(.*?)\]\]></raw>', file_content, re.DOTALL)

    if match:
        return match.group(1).lower()
    return ""

### Creación del Corpus

Tokenización del Corpus del documento al igual que el preprocesamiento de texto por defecto de GENSIM.

In [4]:
def tokenizar_documentos(path_dir):

    """
    Funcion que crea un corpus que podamos utlizar con la librerpia GENSIM
    """

    # Lista para almacenar los documentos tokenizados
    textos = []
    
    for doc in os.listdir(path_dir):
        path = os.path.join(path_dir, doc)
        with open(path, encoding='utf8') as f:
            file_content = f.read()

            # Extraer el texto relevante
            texto = extraer_texto_relevante(file_content)
          
            # Tokenizar y agregar a la lista de textos
            if texto:
                textos.append(preprocess_string(texto))
               
                
    
    return textos



In [5]:
corpus = tokenizar_documentos(path_docs)

Creamos el Vocabulario con base en nuestro corpus y gracias a la pfunción de Gensim

In [6]:
def crear_vocabulario(corpus):

    vocabulario = corpora.Dictionary(corpus)
    return vocabulario

In [7]:
vocabulario = crear_vocabulario(corpus)
vocabulario.save('vocabulario_gensim.dict')
vocabulario.token2id

{'accid': 0,
 'acid': 1,
 'activ': 2,
 'affect': 3,
 'ag': 4,
 'alexi': 5,
 'american': 6,
 'armi': 7,
 'basi': 8,
 'beaumont': 9,
 'beaumont’': 10,
 'best': 11,
 'better': 12,
 'bit': 13,
 'book': 14,
 'born': 15,
 'break': 16,
 'broken': 17,
 'canadian': 18,
 'caus': 19,
 'chemic': 20,
 'children': 21,
 'close': 22,
 'compani': 23,
 'complet': 24,
 'connecticut': 25,
 'consid': 26,
 'di': 27,
 'differ': 28,
 'digest': 29,
 'digestion”': 30,
 'discov': 31,
 'earli': 32,
 'emot': 33,
 'examin': 34,
 'exist': 35,
 'expect': 36,
 'experi': 37,
 'famou': 38,
 'find': 39,
 'fistula': 40,
 'follow': 41,
 'food': 42,
 'fort': 43,
 'french': 44,
 'fur': 45,
 'fuse': 46,
 'gain': 47,
 'gastric': 48,
 'gave': 49,
 'heal': 50,
 'hole': 51,
 'human': 52,
 'hydrochlor': 53,
 'imag': 54,
 'import': 55,
 'inform': 56,
 'insert': 57,
 'insight': 58,
 'interest': 59,
 'island': 60,
 'juic': 61,
 'june': 62,
 'knowledg': 63,
 'known': 64,
 'leav': 65,
 'lebanon': 66,
 'lectur': 67,
 'mackinac': 68,
 'm

### Creación Vectorial ponderada ti_idf

Aca crearemos un iterador que convierte documentos de texto a su representación de bolsa de palabras (Bag of Words, BOW).

Al iterar sobre instancias de esta clase, cada documento en el corpus se convierte en un vector BOW utilizando
un vocabulario predefinido. Esta conversión es esencial para modelos que requieren entradas numéricas como TF-IDF.

In [8]:
class Bow_Corpus:
    def __iter__(self):
        for doc in corpus:
            # Vectorización del texto por documento
            yield vocabulario.doc2bow(doc)

bow_corpus = Bow_Corpus()


for doc in bow_corpus:
    print(doc)



# Entrenamiento del modelo TF-IDF
tfidf = models.TfidfModel(bow_corpus, normalize=True) # normalize=True normaliza los valores en el rango [0, 1]
print(tfidf)

# Aplicar el modelo TF-IDF al corpus
tfidf_corpus = tfidf[bow_corpus]


# Imnprimer los vectores TF-IDF
for doc in tfidf_corpus:
    print(doc)

#Guardamos el corpus en un archivo
tfidf.save('simple_corpus_GENSIM.tfidf')

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 3), (7, 2), (8, 1), (9, 11), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 3), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 10), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 2), (44, 1), (45, 3), (46, 1), (47, 1), (48, 5), (49, 1), (50, 1), (51, 2), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 2), (62, 1), (63, 1), (64, 3), (65, 1), (66, 1), (67, 1), (68, 2), (69, 1), (70, 2), (71, 3), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 2), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 2), (95, 1), (96, 1), (97, 1), (98, 2), (99, 2), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1

### Similitud Coseno

Calcula la similitud coseno entre dos documentos en un corpus TF-IDF utilizando Gensim.

In [9]:
def simil_coseno_gensim(doc_index1, doc_index2, tfidf_model, corpus):
    
    # Crea el índice de similitudes si aún no existe
    tfidf_corpus = list(tfidf[bow_corpus])   
    index = similarities.MatrixSimilarity(tfidf_corpus)

    # Obtener la similitud coseno usando el índice
    similitudes = index[tfidf_corpus[doc_index1]]
    coseno = similitudes[doc_index2]
    
    return coseno


#### EJEMPLO
doc_index1 = 0  
doc_index2 = 1  

# Calcula la similitud coseno entre dos documentos específicos
coseno_simil = simil_coseno_gensim(doc_index1, doc_index2, tfidf, tfidf_corpus)
print(f"Similitud coseno entre los documentos {doc_index1} y {doc_index2}: {coseno_simil}")


Similitud coseno entre los documentos 0 y 1: 0.004058283753693104


In [17]:
### Procesamiento de Queries

queries = tokenizar_documentos(path_queries)

vocabulario = corpora.Dictionary(corpus + queries)  # Unificar vocabulario

print(corpus)
print(queries)
print(vocabulario)

# Generación de corpus en formato BOW y TF-IDF
bow_corpus = [vocabulario.doc2bow(doc) for doc in corpus]
tfidf_model = models.TfidfModel(bow_corpus)
tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]

# Índice de similitudes
index = similarities.MatrixSimilarity(tfidf_corpus)


[['william', 'beaumont', 'human', 'digest', 'william', 'beaumont', 'physiolog', 'digest', 'imag', 'sourc', 'novemb', 'american', 'surgeon', 'william', 'beaumont', 'born', 'best', 'known', '“father', 'gastric', 'physiology”', 'follow', 'research', 'human', 'digest', 'william', 'beaumont', 'born', 'lebanon', 'connecticut', 'physician', 'serv', 'surgeon’', 'mate', 'armi', 'war', 'open', 'privat', 'practic', 'plattsburgh', 'new', 'york', 'rejoin', 'armi', 'surgeon', 'beaumont', 'station', 'fort', 'mackinac', 'mackinac', 'island', 'michigan', 'earli', 'exist', 'protect', 'interest', 'american', 'fur', 'compani', 'fort', 'refug', 'wound', 'year', 'old', 'french', 'canadian', 'fur', 'trader', 'name', 'alexi', 'martin', 'shotgun', 'went', 'accid', 'american', 'fur', 'compani', 'store', 'close', 'rang', 'june', 'martin’', 'wound', 'stomach', 'perfor', 'rib', 'broken', 'expect', 'young', 'man', 'surviv', 'skin', 'martin’', 'wound', 'fuse', 'hole', 'stomach', 'leav', 'perman', 'open', 'gastric', 

Se procesa una lista de consultas para determinar y registrar la similitud de cada consulta con documentos en un índice, utilizando un modelo TF-IDF y un vocabulario predefinido.

In [29]:
def procesar_consultas_y_documentos(queries, index, vocabulario, tfidf_model, salida_path):

    data = []  # Lista para almacenar datos para el DataFrame

    with open(salida_path, 'w') as salida_file:
        for i, query in enumerate(queries):

            query_id = f"q{i + 1:02}"
            query_bow = vocabulario.doc2bow(query)
            query_tfidf = tfidf_model[query_bow]
            similitudes = index[query_tfidf]

            # Filtrar y ordenar resultados
            resultados = []
            for idx, sim in enumerate(similitudes):
                if sim > 0:
                    resultados.append(("d{:02}".format(idx + 1), sim))
            
            # Ordenar por similitud
            resultados.sort(key=lambda x: x[1], reverse=True)

            # Formato de salida
            resultados_str = ''
            for doc_id, sim in resultados:
                resultados_str += "{}: {:.4f},".format(doc_id, sim)
            resultados_str = resultados_str.rstrip(',')  # Remover la última coma

            salida_file.write("q{:02} {}\n".format(i + 1, resultados_str))

            # Almacenar en la lista para DataFrame
            data.append({'query_id': query_id, 'docs': resultados_str})

    # Crear DataFrame
    resultados_df = pd.DataFrame(data)
    return resultados_df

resultados_df = procesar_consultas_y_documentos(queries, index, vocabulario, tfidf_model, salidaFile) # Llamada a la función y guardar el DataFrame

### # PROCESAMIENTO INICIAL DEL GROUND-TRUTH

In [30]:

ground_truth_df = pd.read_csv(ground_truth_path, sep='\t', header=None, names=['query_id', 'doc_id_relevance'])
ground_truth_df['doc_ids'] = ground_truth_df['doc_id_relevance'].apply(lambda x: [doc.split(':')[0] for doc in x.split(',')])
ground_truth_df['relevance'] = ground_truth_df['doc_id_relevance'].apply(lambda x: [int(doc.split(':')[1]) for doc in x.split(',')])
ground_truth_df[['query_id', 'doc_ids', 'relevance']]

Unnamed: 0,query_id,doc_ids,relevance
0,q01,"[d186, d254, d016]","[4, 5, 5]"
1,q02,"[d136, d139, d143, d283, d228, d164, d318, d29...","[2, 2, 4, 4, 4, 4, 2, 4, 4, 2, 2]"
2,q03,"[d152, d291, d283, d147, d318, d105]","[3, 4, 4, 3, 2, 2]"
3,q04,"[d275, d010, d286, d019, d049, d330, d270]","[3, 3, 2, 2, 2, 2, 3]"
4,q06,"[d069, d233, d257, d297, d026, d329]","[2, 3, 2, 3, 4, 5]"
5,q07,"[d004, d077, d266, d179]","[3, 3, 2, 3]"
6,q08,"[d205, d005, d110, d108, d117, d081, d292, d25...","[2, 4, 4, 3, 3, 2, 2, 5, 3, 3, 2, 2]"
7,q09,"[d205, d199, d198, d223, d217, d177]","[3, 5, 3, 2, 2, 2]"
8,q10,"[d068, d100, d065, d076, d231, d199, d052, d215]","[2, 2, 3, 3, 4, 4, 2, 2]"
9,q12,"[d239, d277, d258, d250]","[4, 4, 3, 4]"


### Calculo de P@M

In [21]:


def limpiar_identificadores(doc_ids):
    return list(set([doc.replace(')', '').strip() for doc in doc_ids]))

def calcular_precision_p_m(resultados_df, ground_truth_df):
    precision_por_query = {}
    if 'doc_ids' not in resultados_df.columns:
        resultados_df['doc_ids'] = resultados_df['docs'].apply(lambda x: [doc.split(':')[0] for doc in x.split(',')])
    resultados_df['doc_ids'] = resultados_df['doc_ids'].apply(limpiar_identificadores)   
    if 'doc_ids' not in ground_truth_df.columns:
        ground_truth_df['doc_ids'] = ground_truth_df['doc_id_relevance'].apply(lambda x: [doc.split(':')[0] for doc in x.split(',')])
    for _, row in resultados_df.iterrows():
        query_id = row['query_id']
        doc_retrieved = row['doc_ids']   
        M = len(doc_retrieved)     
        ground_truth_row = ground_truth_df[ground_truth_df['query_id'] == query_id]   
        if ground_truth_row.empty:
            precision_por_query[query_id] = 0.0
            continue
        relevancia_docs = ground_truth_row['doc_ids'].values[0]
        relevance_list = [1 if doc in relevancia_docs else 0 for doc in doc_retrieved[:M]]
        precision_p_m = sum(relevance_list) / M if M > 0 else 0.0
        precision_por_query[query_id] = precision_p_m
            
    return precision_por_query

precision_resultados = calcular_precision_p_m(resultados_df, ground_truth_df)
precisionDF = pd.DataFrame(list(precision_resultados.items()), columns=['query_id', 'precision'])
precisionDF

Unnamed: 0,query_id,precision
0,q01,0.032258
1,q02,0.055838
2,q03,1.0
3,q04,0.017621
4,q05,0.0
5,q06,0.0
6,q07,0.006969
7,q08,0.027027
8,q09,0.030769
9,q10,0.0


### Calculo de R@M

In [23]:



def calcular_recall_p_m(resultados_df, ground_truth_df):
    recall_por_query = {}
    
    # Asegurarse de que los identificadores de documentos están disponibles para comparación
    if 'doc_ids' not in resultados_df.columns:
        resultados_df['doc_ids'] = resultados_df['docs'].apply(
            lambda x: [doc.split(':')[0] for doc in x.split(',')]
        )
    if 'doc_ids' not in ground_truth_df.columns:
        ground_truth_df['doc_ids'] = ground_truth_df['doc_id_relevance'].apply(
            lambda x: [doc.split(':')[0] for doc in x.split(',')]
        )
        ground_truth_df['relevancias'] = ground_truth_df['doc_id_relevance'].apply(
            lambda x: [int(doc.split(':')[1]) for doc in x.split(',')]
        )

    # Iterar sobre cada fila en resultados_df para calcular recall
    for _, row in resultados_df.iterrows():
        query_id = row['query_id']
        doc_retrieved = row['doc_ids']
        # Encontrar la fila correspondiente en ground_truth_df
        ground_truth_row = ground_truth_df[ground_truth_df['query_id'] == query_id]

        # Comprobar si hay datos relevantes para esta consulta
        if ground_truth_row.empty or not ground_truth_row['doc_ids'].values[0]:
            recall_por_query[query_id] = 0.0
            continue

        # Lista de documentos relevantes
        relevancia_docs = ground_truth_row['doc_ids'].values[0]
        num_total_relevant_docs = len(relevancia_docs)
        # Crear una lista de relevancia binaria para los documentos recuperados
        relevance_list = [1 if doc in relevancia_docs else 0 for doc in doc_retrieved]
        
        # Calcular recall como la suma de los relevantes recuperados dividida por el total de relevantes
        recall_p_m = sum(relevance_list) / num_total_relevant_docs if num_total_relevant_docs > 0 else 0
        recall_por_query[query_id] = recall_p_m
    
    return recall_por_query

# Suponiendo que resultados_df y ground_truth_df ya están definidos
recall_resultados = calcular_recall_p_m(resultados_df, ground_truth_df)
recallDF = pd.DataFrame(list(recall_resultados.items()), columns=['query_id', 'recall'])
print(recallDF)



   query_id    recall
0       q01  0.666667
1       q02  1.000000
2       q03  1.000000
3       q04  0.571429
4       q05  0.000000
5       q06  0.000000
6       q07  0.500000
7       q08  0.166667
8       q09  1.000000
9       q10  0.000000
10      q11  0.000000
11      q12  0.250000
12      q13  0.400000
13      q14  0.083333
14      q15  0.000000
15      q16  0.000000
16      q17  0.500000
17      q18  0.571429
18      q19  0.000000
19      q20  0.000000
20      q21  0.000000
21      q22  0.142857
22      q23  0.750000
23      q24  0.000000
24      q25  0.000000
25      q26  0.000000
26      q27  0.500000
27      q28  0.000000
28      q29  0.333333
29      q30  0.000000
30      q31  0.000000
31      q32  0.000000
32      q33  0.000000
33      q34  0.000000
34      q35  0.000000


### Calculo del NDCG@M

In [26]:


def calcular_dcg(relevancias, M):
    return sum((relevancia / np.log2(idx + 2)) for idx, relevancia in enumerate(relevancias[:M]))

def calcular_ndcg_p_m(resultados_df, ground_truth_df):
    ndcg_por_query = {}
    if 'doc_ids' not in resultados_df.columns:
        resultados_df['doc_ids'] = resultados_df['docs'].apply(lambda x: [doc.split(':')[0] for doc in x.split(',')])
    resultados_df['doc_ids'] = resultados_df['doc_ids'].apply(limpiar_identificadores)
    if 'doc_ids' not in ground_truth_df.columns:
        ground_truth_df['doc_ids'] = ground_truth_df['doc_id_relevance'].apply(lambda x: [doc.split(':')[0] for doc in x.split(',')]) 
    ground_truth_df['relevancias'] = ground_truth_df['doc_id_relevance'].apply(lambda x: [int(doc.split(':')[1]) for doc in x.split(',')])
    for _, row in resultados_df.iterrows():
        query_id = row['query_id']
        doc_retrieved = row['doc_ids'] 
        ground_truth_row = ground_truth_df[ground_truth_df['query_id'] == query_id]
        if ground_truth_row.empty:
            ndcg_por_query[query_id] = 0.0
            continue       
        relevancia_docs = ground_truth_row['doc_ids'].values[0]
        relevancias_reales = ground_truth_row['relevancias'].values[0]

        M = len(relevancia_docs)      
        relevancias_obtenidas = [relevancias_reales[relevancia_docs.index(doc)] if doc in relevancia_docs else 0 for doc in doc_retrieved[:M]]
        dcg = calcular_dcg(relevancias_obtenidas, M)
        relevancias_ideales = sorted(relevancias_reales, reverse=True)
        idcg = calcular_dcg(relevancias_ideales, M)
        ndcg_p_m = dcg / idcg if idcg > 0 else 0.0
        ndcg_por_query[query_id] = ndcg_p_m
    
    return ndcg_por_query

ndcg_resultados = calcular_ndcg_p_m(resultados_df, ground_truth_df)
ndcgDF = pd.DataFrame(list(ndcg_resultados.items()), columns=['query_id', 'ndcg'])
ndcgDF

Unnamed: 0,query_id,ndcg
0,q01,0.0
1,q02,0.0
2,q03,0.881896
3,q04,0.0
4,q05,0.0
5,q06,0.0
6,q07,0.0
7,q08,0.330419
8,q09,0.0
9,q10,0.0


### Calculo de la metrica de evaluación 𝑁𝐷𝐶𝐺@𝑀

In [28]:
def calcular_dcg(relevancias, M):
    return sum((relevancia / np.log2(idx + 2)) for idx, relevancia in enumerate(relevancias[:M]))

def calcular_ndcg_p_m(resultados_df, ground_truth_df):
    ndcg_por_query = {}
    
    for _, row in resultados_df.iterrows():
        query_id = row['query_id']
        doc_retrieved = row['doc_ids'] 
        ground_truth_row = ground_truth_df[ground_truth_df['query_id'] == query_id]

        if ground_truth_row.empty or 'doc_ids' not in ground_truth_row.columns or 'relevancias' not in ground_truth_row.columns:
            ndcg_por_query[query_id] = 0.0
            continue

        relevancia_docs = ground_truth_row['doc_ids'].values[0]
        relevancias_reales = ground_truth_row['relevancias'].values[0]
        M = min(len(doc_retrieved), len(relevancia_docs))
        
        relevancias_obtenidas = [relevancias_reales[relevancia_docs.index(doc)] if doc in relevancia_docs else 0 for doc in doc_retrieved[:M]]
        dcg = calcular_dcg(relevancias_obtenidas, M)
        relevancias_ideales = sorted(relevancias_reales, reverse=True)
        idcg = calcular_dcg(relevancias_ideales, M)
        ndcg_p_m = dcg / idcg if idcg > 0 else 0.0
        ndcg_por_query[query_id] = ndcg_p_m
    
    return ndcg_por_query

# Asegúrate de que resultados_df y ground_truth_df están correctamente preparados
ndcg_resultados = calcular_ndcg_p_m(resultados_df, ground_truth_df)
ndcgDF = pd.DataFrame(list(ndcg_resultados.items()), columns=['query_id', 'ndcg'])
print(ndcgDF)


   query_id      ndcg
0       q01  0.000000
1       q02  0.000000
2       q03  0.881896
3       q04  0.000000
4       q05  0.000000
5       q06  0.000000
6       q07  0.000000
7       q08  0.330419
8       q09  0.000000
9       q10  0.000000
10      q11  0.000000
11      q12  0.000000
12      q13  0.000000
13      q14  0.000000
14      q15  0.000000
15      q16  0.000000
16      q17  0.000000
17      q18  0.000000
18      q19  0.000000
19      q20  0.000000
20      q21  0.000000
21      q22  0.000000
22      q23  0.000000
23      q24  0.000000
24      q25  0.000000
25      q26  0.000000
26      q27  0.000000
27      q28  0.000000
28      q29  0.000000
29      q30  0.000000
30      q31  0.000000
31      q32  0.000000
32      q33  0.000000
33      q34  0.000000
34      q35  0.000000


# Calculo de la metrica de evaluación 𝑀𝐴𝑃

In [31]:
def calcular_precision_acumulada(relevance_list):
    precisiones = []
    num_relevant = 0
    for i, relevancia in enumerate(relevance_list):
        if relevancia == 1:
            num_relevant += 1
            precisiones.append(num_relevant / (i + 1))  
    if precisiones:
        return sum(precisiones) / len(precisiones)
    else:
        return 0.0

def calcular_map(resultados_df, ground_truth_df):
    average_precisions = []
    if 'doc_ids' not in resultados_df.columns:
        resultados_df['doc_ids'] = resultados_df['docs'].apply(lambda x: [doc.split(':')[0] for doc in x.split(',')]) 
    resultados_df['doc_ids'] = resultados_df['doc_ids'].apply(limpiar_identificadores)
    if 'doc_ids' not in ground_truth_df.columns:
        ground_truth_df['doc_ids'] = ground_truth_df['doc_id_relevance'].apply(lambda x: [doc.split(':')[0] for doc in x.split(',')])
    ground_truth_df['relevancias'] = ground_truth_df['doc_id_relevance'].apply(lambda x: [int(doc.split(':')[1]) for doc in x.split(',')])
    for _, row in resultados_df.iterrows():
        query_id = row['query_id']
        doc_retrieved = row['doc_ids']  
        ground_truth_row = ground_truth_df[ground_truth_df['query_id'] == query_id]
        if ground_truth_row.empty:
            average_precisions.append(0.0)
            continue
        relevancia_docs = ground_truth_row['doc_ids'].values[0]
        relevance_list = [1 if doc in relevancia_docs else 0 for doc in doc_retrieved]
        avg_precision = calcular_precision_acumulada(relevance_list)
        average_precisions.append(avg_precision)
    map_score = sum(average_precisions) / len(average_precisions) if average_precisions else 0.0
    return map_score

# Ejemplo de uso
map_score = calcular_map(resultados_df, ground_truth_df)
print("\nMAP (Mean Average Precision):", map_score)


MAP (Mean Average Precision): 0.058783838384159064
