In [2]:
!pip install ir_datasets --quiet

import numpy as np
import pandas as pd
import re
from collections import defaultdict
import ir_datasets
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import heapq

print("Cargando dataset BEIR / cqadupstack / webmasters ...")

ds = ir_datasets.load("beir/cqadupstack/webmasters")

# Documents: keep the id, body text and title of every corpus entry.
docs = [
    dict(doc_id=doc.doc_id, text=doc.text, title=doc.title)
    for doc in ds.docs_iter()
]
docs_df = pd.DataFrame(docs)

# Queries: id and query text.
queries = [
    dict(query_id=query.query_id, text=query.text)
    for query in ds.queries_iter()
]
queries_df = pd.DataFrame(queries)

# Relevance judgments (qrels): one row per (query, document) pair with its grade.
qrels = [
    dict(query_id=judgment.query_id, doc_id=judgment.doc_id, relevance=judgment.relevance)
    for judgment in ds.qrels_iter()
]
qrels_df = pd.DataFrame(qrels)

print("Total documentos:", len(docs_df))
docs_df.head()


Cargando dataset BEIR / cqadupstack / webmasters ...
Total documentos: 17405


Unnamed: 0,doc_id,text,title
0,35236,I'm making a website for a small hotel in php....,Hotel Reservation Request Booking Paypal PHP
1,35540,A site I just became in charge of uses a reall...,How to backup a dev & QA folder website struct...
2,35230,I want to start doing affiliate marketing on a...,"As an affiliate, how do you know if a sale is ..."
3,35548,I need to remove a URL from _Health -> Fetch a...,"Remove URL from ""Fetch as Google"" - in Google ..."
4,35238,I have a website where a previous developer ha...,How can I redirect all files in a directory th...


## PREPROCESAMIENTO

In [3]:

# Make sure the NLTK stopword corpus is available before it is read below.
nltk.download("stopwords")

# Shared preprocessing resources: the Porter stemmer and the English stopword set
# (a set, so membership checks during tokenization are constant time).
stemmer = PorterStemmer()
stop_words = {*stopwords.words("english")}

def preprocess(text):
    """
    Convert raw text into a list of normalized, stemmed terms.

    Pipeline:
    - lowercase the text
    - replace every character that is neither a letter nor whitespace with a space
    - split on whitespace
    - drop tokens present in ``stop_words``
    - stem the remaining tokens with ``stemmer`` (Porter)

    Args:
        text: Input string. Any non-``str`` value is treated as empty.

    Returns:
        list[str]: Stemmed terms in their original order (duplicates kept);
        an empty list when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    # Normalize: case-fold, then blank out digits, punctuation and other symbols.
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", lowered)

    # Stopwords are checked on the surface form, before stemming.
    terms = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        terms.append(stemmer.stem(word))

    return terms

# Tokenize every document body once and store the token count next to it;
# doc_len counts all tokens, repeats included.
docs_df["tokens"] = docs_df["text"].map(preprocess)
docs_df["doc_len"] = docs_df["tokens"].map(len)

print("Ejemplo de documento procesado:")
display(docs_df.loc[:, ["doc_id", "tokens"]].head())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Ejemplo de documento procesado:


Unnamed: 0,doc_id,tokens
0,35236,"[make, websit, small, hotel, php, hotel, owner..."
1,35540,"[site, becam, charg, use, realli, simpl, two, ..."
2,35230,"[want, start, affili, market, blog, someon, wa..."
3,35548,"[need, remov, url, health, fetch, googl, googl..."
4,35238,"[websit, previou, develop, updat, sever, webpa..."


## ÍNDICE INVERTIDO

In [4]:
# Inverted index: term -> {doc_id: number of occurrences of the term in that document}.
inverted = defaultdict(dict)

# Walk the two needed columns directly. DataFrame.iterrows() would build a full
# Series for every row only to read these two fields back out of it.
for doc_id, tokens in zip(docs_df["doc_id"], docs_df["tokens"]):
    # Term frequencies within this single document.
    term_counts = defaultdict(int)
    for token in tokens:
        term_counts[token] += 1

    for term, freq in term_counts.items():
        inverted[term][doc_id] = freq

# Document frequency: in how many documents each term appears.
df_term = {term: len(postings) for term, postings in inverted.items()}

print("Índice invertido listo. Términos indexados:", len(inverted))


Índice invertido listo. Términos indexados: 25244


## JACCARD

In [5]:
# Corpus-level statistics shared by the ranking functions.
N = docs_df.shape[0]                 # number of documents in the corpus
L_avg = docs_df["doc_len"].mean()    # mean document length, in tokens

def preprocess_query(q):
    """
    Normalize a query string with the same pipeline applied to documents.

    Args:
        q: Raw query text.

    Returns:
        Whatever ``preprocess`` returns for ``q`` (a list of processed terms).
    """
    query_terms = preprocess(q)
    return query_terms

def rank_jaccard(query, k=10):
    """
    Rank documents by Jaccard similarity between query terms and document terms.

    Every document in ``docs_df`` is scored as ``|Q & D| / |Q | D|`` over the sets
    of preprocessed terms. Documents sharing no term with the query are scored 0
    and can still be returned when fewer than ``k`` documents overlap the query.

    Args:
        query: Raw query text; it is normalized with ``preprocess_query``.
        k: Maximum number of results to return.

    Returns:
        list[tuple[doc_id, float]]: Up to ``k`` ``(doc_id, score)`` pairs, best
        first. Documents with equal scores keep their order in ``docs_df``.
    """
    query_terms = set(preprocess_query(query))
    scores = {}

    # Read only the two needed columns; DataFrame.iterrows() would allocate a
    # Series per document on every single query.
    for doc_id, tokens in zip(docs_df["doc_id"], docs_df["tokens"]):
        doc_terms = set(tokens)
        union_size = len(query_terms | doc_terms)
        # union_size is 0 only when both the query and the document are empty.
        scores[doc_id] = len(query_terms & doc_terms) / union_size if union_size else 0

    # Only the k best entries are needed, so select them without sorting the
    # whole corpus (same selection rule the other rankers use).
    return heapq.nlargest(k, scores.items(), key=lambda item: item[1])


## TF-IDF + SIMILITUD COSENO 

In [6]:
# Precompute the TF-IDF vector and its Euclidean norm for every document.
# NOTE: this IDF is log((N - df + 0.5) / (df + 0.5)); it becomes negative for
# terms that occur in more than half of the documents.
idf_cache = {
    term: np.log((N - doc_freq + 0.5) / (doc_freq + 0.5))
    for term, doc_freq in df_term.items()
}

doc_tfidf = {}  # doc_id -> {term: tf * idf}
doc_norm = {}   # doc_id -> Euclidean norm of that vector

# Iterate the needed columns directly instead of DataFrame.iterrows().
for doc_id, tokens in zip(docs_df["doc_id"], docs_df["tokens"]):
    # dict.fromkeys() drops repeated tokens but keeps first-seen order, so each
    # weight is computed once per distinct term rather than once per occurrence.
    weights = {
        term: inverted[term].get(doc_id, 0) * idf_cache.get(term, 0)
        for term in dict.fromkeys(tokens)
    }

    doc_tfidf[doc_id] = weights
    doc_norm[doc_id] = np.sqrt(sum(w * w for w in weights.values()))


def rank_tfidf(query, k=10):
    """
    Rank documents by cosine similarity between TF-IDF vectors.

    The query vector accumulates ``idf_cache[term]`` once per occurrence of each
    query term (0 for terms missing from ``idf_cache``). Dot products are
    computed only for documents found in the postings of the query terms, using
    the precomputed ``doc_tfidf`` weights, and then divided by the product of
    the query norm and the precomputed ``doc_norm``.

    Args:
        query: Raw query text; it is normalized with ``preprocess_query``.
        k: Maximum number of results to return.

    Returns:
        list[tuple[doc_id, float]]: Up to ``k`` ``(doc_id, score)`` pairs, best
        first. Only documents containing at least one query term appear.
    """
    query_weights = defaultdict(float)
    for term in preprocess_query(query):
        query_weights[term] += idf_cache.get(term, 0)

    query_norm = np.sqrt(sum(w * w for w in query_weights.values()))

    # Accumulate dot products term by term through the inverted index.
    scores = defaultdict(float)
    for term, query_weight in query_weights.items():
        if term in inverted:
            for doc_id in inverted[term]:
                scores[doc_id] += query_weight * doc_tfidf[doc_id].get(term, 0)

    # Normalize to a cosine; scores are left untouched when either norm is zero.
    if query_norm > 0:
        for doc_id in list(scores):
            norm = doc_norm[doc_id]
            if norm > 0:
                scores[doc_id] /= (query_norm * norm)

    return heapq.nlargest(k, scores.items(), key=lambda pair: pair[1])


## BM25

In [8]:
def idf(term):
    """
    Inverse document frequency of ``term``: log((N - df + 0.5) / (df + 0.5)).

    ``df`` is read from ``df_term`` and defaults to 0 for unseen terms. The
    value is negative when the term occurs in more than half of the ``N``
    documents.

    Args:
        term: A preprocessed term.

    Returns:
        The IDF weight as returned by ``np.log``.
    """
    doc_freq = df_term.get(term, 0)
    numerator = N - doc_freq + 0.5
    denominator = doc_freq + 0.5
    return np.log(numerator / denominator)

def rank_bm25(query, k=10, k1=1.2, b=0.75):
    """
    Rank documents with the BM25 scoring function.

    For each distinct query term present in the inverted index, every document
    in its postings receives
    ``idf(term) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / L_avg))``,
    where ``tf`` is the term frequency in the document and ``dl`` its length.
    Repeated query terms are counted once.

    Args:
        query: Raw query text; it is normalized with ``preprocess_query``.
        k: Maximum number of results to return.
        k1: Term-frequency saturation parameter.
        b: Document-length normalization strength.

    Returns:
        list[tuple[doc_id, float]]: Up to ``k`` ``(doc_id, score)`` pairs, best
        first. Only documents containing at least one query term appear.
    """
    query_terms = set(preprocess_query(query))
    accumulated = defaultdict(float)

    # doc_id -> document length; rebuilt from docs_df on each call.
    length_by_doc = docs_df.set_index("doc_id")["doc_len"].to_dict()

    for term in query_terms:
        postings = inverted.get(term)
        if postings is None:
            continue  # term never seen in the corpus

        term_idf = idf(term)

        for doc_id, freq in postings.items():
            length_ratio = length_by_doc[doc_id] / L_avg
            normalizer = freq + k1*(1 - b + b*length_ratio)
            accumulated[doc_id] += term_idf * (freq*(k1+1) / normalizer)

    return heapq.nlargest(k, accumulated.items(), key=lambda pair: pair[1])


## INTERFAZ DE CONSULTA

In [9]:
from ipywidgets import widgets
from IPython.display import display, clear_output

# Query input box. continuous_update=False makes the widget publish its value
# only when the user submits it (Enter) or leaves the field; with the default
# (True) every keystroke would fire the observer and re-run all three rankers.
box = widgets.Text(
    description="Consulta:",
    placeholder="Escribe tu consulta aquí...",
    continuous_update=False,
)

# Output area that receives everything the search callback prints.
out = widgets.Output()

def ejecutar(change):
    """
    Widget callback: run the three rankers for the new query and print the results.

    Only "change" notifications for the "value" trait are handled, and queries
    that are empty or whitespace-only are ignored. For every ranker the top five
    ``(doc_id, score)`` pairs are printed together with the first 60 characters
    of the document title, inside the ``out`` output widget.

    Args:
        change: Change dictionary delivered by the widget observer; the keys
            "type", "name" and "new" are read.
    """
    if change["type"] != "change" or change["name"] != "value":
        return

    query = change["new"]
    if query.strip() == "":
        return

    # (section heading, ranking function) in display order.
    sections = (
        ("\n--- Jaccard ---", rank_jaccard),
        ("\n--- TF-IDF Coseno ---", rank_tfidf),
        ("\n--- BM25 ---", rank_bm25),
    )

    with out:
        clear_output()
        print("\n=== Resultados para:", query, "===\n")

        for heading, ranker in sections:
            print(heading)
            for doc_id, score in ranker(query, k=5):
                # Title of the first row whose doc_id matches.
                titles = docs_df.loc[docs_df.doc_id == doc_id, "title"].values
                title = titles[0]
                print(f"{doc_id} | {score:.4f} | {title[:60]}...")


# Run the search callback whenever the text box's "value" trait changes.
box.observe(handler=ejecutar, names="value")

# Render the input box with the results area below it.
display(box, out)


Text(value='', description='Consulta:', placeholder='Escribe tu consulta aquí...')

Output()

## EVALUACIÓN COMPLETA (PRECISION, RECALL, AP, MAP)

In [10]:
def get_relevant(qid):
    """
    Return the set of documents judged relevant for a query.

    Only qrels rows with a positive ``relevance`` grade are kept, so a row that
    explicitly records a document as non-relevant (grade 0) is not counted as a
    relevant document.

    Args:
        qid: Query identifier to look up in ``qrels_df``.

    Returns:
        set: ``doc_id`` values judged relevant for ``qid``; empty when the query
        has no positive judgments.
    """
    is_relevant = (qrels_df["query_id"] == qid) & (qrels_df["relevance"] > 0)
    return set(qrels_df.loc[is_relevant, "doc_id"])

def precision_at_k(ranked, rel, k):
    """
    Precision at cutoff ``k``.

    Args:
        ranked: Sequence of ``(doc_id, score)`` pairs, best first.
        rel: Set of relevant document ids.
        k: Cutoff; the result is always divided by ``k``, even when ``ranked``
            holds fewer than ``k`` entries.

    Returns:
        float: Number of distinct relevant documents in the first ``k`` results
        divided by ``k``.
    """
    top_docs = {doc_id for doc_id, _score in ranked[:k]}
    relevant_found = top_docs & rel
    return len(relevant_found) / k

def recall_at_k(ranked, rel, k):
    """
    Recall at cutoff ``k``.

    Args:
        ranked: Sequence of ``(doc_id, score)`` pairs, best first.
        rel: Set of relevant document ids.
        k: Cutoff applied to ``ranked``.

    Returns:
        Fraction of the relevant documents found in the first ``k`` results;
        ``1`` when there are no relevant documents at all.
    """
    total_relevant = len(rel)
    if total_relevant == 0:
        return 1

    top_docs = {doc_id for doc_id, _score in ranked[:k]}
    return len(top_docs & rel) / total_relevant

def average_precision(ranked, rel):
    """
    Average precision of a ranking.

    Precision is sampled at the rank of every relevant document found in
    ``ranked`` and the sum is divided by the total number of relevant documents,
    so relevant documents missing from ``ranked`` contribute zero.

    Args:
        ranked: Sequence of ``(doc_id, score)`` pairs, best first.
        rel: Set of relevant document ids.

    Returns:
        The average precision, or ``0`` when ``rel`` is empty.
    """
    found = 0
    precision_sum = 0
    for position, (doc_id, _score) in enumerate(ranked, start=1):
        if doc_id in rel:
            found += 1
            precision_sum += found / position

    if not rel:
        return 0
    return precision_sum / len(rel)


# Fixed-seed sample of at most 50 queries, so every model is evaluated on the same subset.
subset_queries = queries_df.sample(n=min(len(queries_df), 50), random_state=42)

def evaluate_with_metrics(model, k=10, depth=1000):
    """
    Evaluate a ranking function over the queries in ``subset_queries``.

    For every query the model is asked for a ranking, and precision@k, recall@k
    and average precision are computed against ``get_relevant(query_id)``.

    Args:
        model: Callable ``model(text, k=...)`` returning ``(doc_id, score)``
            pairs, best first.
        k: Cutoff for precision and recall. It also names the result columns
            (``precision@{k}`` / ``recall@{k}``), so the default yields
            ``precision@10`` and ``recall@10``.
        depth: Number of results requested from the model; average precision is
            computed over this many results. Raised to ``k`` when smaller.

    Returns:
        tuple: ``(df_eval, MAP)`` where ``df_eval`` has one row per query with
        the columns ``query_id``, ``precision@{k}``, ``recall@{k}`` and ``AP``,
        and ``MAP`` is the mean of the ``AP`` column (NaN when there are no
        queries).
    """
    precision_col = f"precision@{k}"
    recall_col = f"recall@{k}"
    # Never retrieve fewer results than the cutoff being measured.
    n_results = max(depth, k)

    rows = []
    for query_id, text in zip(subset_queries["query_id"], subset_queries["text"]):
        ranked = model(text, k=n_results)
        rel = get_relevant(query_id)

        rows.append({
            "query_id": query_id,
            precision_col: precision_at_k(ranked, rel, k),
            recall_col: recall_at_k(ranked, rel, k),
            "AP": average_precision(ranked, rel),
        })

    # Explicit columns keep the frame well-formed even when no query was evaluated.
    df_eval = pd.DataFrame(rows, columns=["query_id", precision_col, recall_col, "AP"])
    mean_ap = df_eval["AP"].mean() if rows else float("nan")

    return df_eval, mean_ap

print("\n======================")
print("  RESULTADOS GENERALES")
print("======================\n")


def _print_model_summary(model_name, map_label, df_eval, map_value, footer):
    """Print the averaged metrics of one evaluated model, then a footer line."""
    print(f"Modelo: {model_name}")
    print("Precision@10 promedio:", df_eval["precision@10"].mean())
    print("Recall@10 promedio   :", df_eval["recall@10"].mean())
    print("Average Precision    :", df_eval["AP"].mean())
    print(map_label, map_value)
    print(footer)


# Each model is evaluated right before its summary is printed, so results
# appear progressively while the remaining models are still running.

# --- Jaccard ---
df_jac, map_j = evaluate_with_metrics(rank_jaccard)
_print_model_summary(
    "Jaccard", "➡ MAP Jaccard        :", df_jac, map_j,
    "\n---------------------------------------\n",
)

# --- TF-IDF cosine ---
df_tf, map_t = evaluate_with_metrics(rank_tfidf)
_print_model_summary(
    "TF-IDF Coseno", "➡ MAP TF-IDF        :", df_tf, map_t,
    "\n---------------------------------------\n",
)

# --- BM25 ---
df_bm, map_b = evaluate_with_metrics(rank_bm25)
_print_model_summary(
    "BM25", "➡ MAP BM25          :", df_bm, map_b,
    "\n=======================================\n",
)


print("✔ Evaluación completa finalizada.")



  RESULTADOS GENERALES

Modelo: Jaccard
Precision@10 promedio: 0.026000000000000006
Recall@10 promedio   : 0.19240000000000002
Average Precision    : 0.1021209898384973
➡ MAP Jaccard        : 0.1021209898384973

---------------------------------------

Modelo: TF-IDF Coseno
Precision@10 promedio: 0.044000000000000004
Recall@10 promedio   : 0.33399999999999996
Average Precision    : 0.16934593018534005
➡ MAP TF-IDF        : 0.16934593018534005

---------------------------------------

Modelo: BM25
Precision@10 promedio: 0.05000000000000001
Recall@10 promedio   : 0.3648
Average Precision    : 0.249565466545183
➡ MAP BM25          : 0.249565466545183


✔ Evaluación completa finalizada.
