In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


# Seccion 1 Preprocesamiento de Datos

In [1]:
!pip install faiss-cpu sentence-transformers nltk

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


2026-01-28 16:43:39.641413: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769618619.845752      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769618619.901665      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769618620.382667      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769618620.382703      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769618620.382706      55 computation_placer.cc:177] computation placer alr

✅ Entorno configurado correctamente (usando FAISS CPU).


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [18]:
import json
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):

    if not text: return ""
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    clean_tokens = [
        ps.stem(token) for token in tokens 
        if token not in stop_words and token not in string.punctuation
    ]
    return " ".join(clean_tokens)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Seccion 2 Representación mediante Embeddings

In [26]:
from sentence_transformers import SentenceTransformer

DATA_PATH = "/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json"
LIMIT = 10000

doc_ids = []
processed_docs = [] 
metadata_map = {}

with open(DATA_PATH, 'r') as f:
    for i, line in enumerate(f):
        if i >= LIMIT: break
        paper = json.loads(line)
        
        full_text = f"{paper['title']}. {paper['abstract']}"
        
        clean_text = preprocess_text(full_text)
        
        doc_ids.append(paper['id'])
        processed_docs.append(clean_text)
        
        metadata_map[paper['id']] = {
            'title': paper['title'], 
            'abstract': paper['abstract'],
            'categories': paper['categories']
        }

model_bi = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model_bi.encode(processed_docs, convert_to_numpy=True, show_progress_bar=True)

print(f"proceso correcto")

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

proceso correcto


# Seccion 3 Recuperación Inicial (First-Stage Retrieval)

In [28]:
import faiss

dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(doc_embeddings)

print(f"Se ha cargado {index.ntotal} vectores.")

def search_faiss(query_text, k=20):

    q_clean = preprocess_text(query_text)
    q_vec = model_bi.encode([q_clean], convert_to_numpy=True)
    
    distances, indices = index.search(q_vec, k)
    
    results = []
    for idx, score in zip(indices[0], distances[0]):
        if idx < 0: continue
        
        real_id = doc_ids[idx]
        meta = metadata_map[real_id]
        
        results.append({
            'doc_id': real_id,
            'score': float(score),
            'title': meta['title'],
            'abstract': meta['abstract'],
            'categories': meta['categories']
        })
    return results

print("proceso correcto")

Se ha cargado 10000 vectores.
proceso correcto


# Seccion 4 Re-ranking de Resultados

In [29]:
from sentence_transformers import CrossEncoder

model_cross = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_results(query, initial_hits):
    if not initial_hits: return []
    pairs = [[query, f"{hit['title']} {hit['abstract']}"] for hit in initial_hits]
    scores = model_cross.predict(pairs)
    reranked = []
    for i, hit in enumerate(initial_hits):
        hit_copy = hit.copy()
        hit_copy['rerank_score'] = scores[i]
        reranked.append(hit_copy)
    reranked.sort(key=lambda x: x['rerank_score'], reverse=True)
    
    return reranked

print("proceso correcto")

proceso correcto


# Seccion 5 Simulación de Consultas

In [39]:
consultas_demo = [
    "International conflict", 
    "Economic policy news", 
    "Natural disasters"
]

for query in consultas_demo:
    print(f"\n se busca: '{query}'")

    hits_base = search_faiss(query, k=5)
    
    hits_rerank = rerank_results(query, hits_base)
    
    print(f"{'ID':<12} | {'Score FAISS':<12} | {'ID':<12} | {'Score Rerank':<12} | {'Título'}")
    print("-" * 140)
    
    for i in range(len(hits_base)):
        id_b = hits_base[i]['doc_id']
        sc_b = f"{hits_base[i]['score']:.4f}"
        id_r = hits_rerank[i]['doc_id']
        sc_r = f"{hits_rerank[i]['rerank_score']:.4f}"
        title = hits_rerank[i]['title'][:50] + "..."
        
        print(f"{id_b:<12} | {sc_b:<12} | {id_r:<12} | {sc_r:<12} | {title}")


 se busca: 'International conflict'
ID           | Score FAISS  | ID           | Score Rerank | Título
--------------------------------------------------------------------------------------------------------------------------------------------
0705.1209    | 0.3235       | 0705.1209    | -2.0762      | Artificial Intelligence for Conflict Management...
0706.0100    | 0.2609       | 0705.1761    | -5.8097      | Modeling and Controlling Interstate Conflict...
0705.1761    | 0.2499       | 0706.0100    | -10.8544     | Evolutionary Dilemmas in a Social Network...
0705.0403    | 0.2437       | 0705.0233    | -11.2018     | Coordination for a Group of Autonomous Mobile Agen...
0705.0233    | 0.2437       | 0705.0403    | -11.2313     | Tracking control for multi-agent consensus with an...

 se busca: 'Economic policy news'
ID           | Score FAISS  | ID           | Score Rerank | Título
-----------------------------------------------------------------------------------------------------

# Seccion 6 Evaluación del Sistema

In [33]:
test_set = [
    {"q": "Neural networks image classification", "cat": "cs"},       
    {"q": "Higgs boson particle physics", "cat": "hep-ph"},           
    {"q": "Superconductivity materials", "cat": "cond-mat"},          
    {"q": "Black hole general relativity", "cat": "gr-qc"}           
]

def calculate_metrics(results, target_cat, k=10):
    top_k = results[:k]
    hits = 0
    for doc in top_k:
        if target_cat in doc['categories']:
            hits += 1
            
    precision = hits / k
    recall = hits / min(k, 10) 
    return precision, recall

results_log = []

for item in test_set:
    q = item['q']
    target = item['cat']

    base = search_faiss(q, k=20)
    final = rerank_results(q, base)

    p_base, r_base = calculate_metrics(base, target, k=10)
    p_final, r_final = calculate_metrics(final, target, k=10)
    
    results_log.append({
        'Consulta': q,
        'P@10 (Base)': p_base,
        'P@10 (Final)': p_final,
        'Mejora': p_final - p_base
    })

df_metrics = pd.DataFrame(results_log)
print(df_metrics)
print("\nPromedios:")
print(df_metrics.mean(numeric_only=True))

                               Consulta  P@10 (Base)  P@10 (Final)  Mejora
0  Neural networks image classification          0.4           0.5     0.1
1          Higgs boson particle physics          0.9           0.9     0.0
2           Superconductivity materials          0.9           0.9     0.0
3         Black hole general relativity          0.8           0.8     0.0

Promedios:
P@10 (Base)     0.750
P@10 (Final)    0.775
Mejora          0.025
dtype: float64


# Seccion 7 Análisis de Resultados

## Análisis
* El modelo vectorialcon FAISS logro recuperar los documentos que tienen relacion entre si rapidamente.
Como se puede ver en la tabla evaluación, la metrica de la Precision@10.

* Con la implementacion de re-ranking hubo un cambio significativo en la precisión.
* Un dato a tener en cuenta es que el model cross encoder en la mayoria de veces sube documentos que tienen en su contenido la respuesta exacta o tiene un contexto específico. Por ello deja de lado aquellos que solo comparten palabras clave pero tienen una diferente contexto. Por ende por mas que mejore la calidad, incrementa el tiempo por consulta.

# Comparacion 

* Hay una mejora en la consulta "Neural networks image classification", donde la precisión subió de 0.4 a 0.5 (+10%).
* Se nota que el faiss recuperó documentos generales, pero el cross encoder, al analizar el contexto completo. 
* En la consulta Black hole, Higgs boson, la precisión se mantuvo estable (0.8 - 0.9) sin cambios tras el re-ranking, lo que significa que lo que se recupero ya era optimo. 

# Stiven Saldaña