In [2]:
# First, install ipywidgets and import the core libraries used throughout the notebook:

!pip install ipywidgets

import os
from pathlib import Path
import feedparser
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import math
import json
from datetime import datetime



In [6]:
# Optional dependencies.
# Each group below is imported inside its own try/except so that a missing
# (or broken) package does not stop the notebook at import time.

# Optional ML/embedding imports - install them to enable the matching features
try:
 from sentence_transformers import SentenceTransformer
except Exception as e:
 # Sentinel: load_sentence_transformer() checks for None and raises ImportError.
 SentenceTransformer = None
try:
 import tiktoken
except Exception:
 # Sentinel: count_tokens_tiktoken() falls back to a character-based estimate.
 tiktoken = None
try:
 import chromadb
 from chromadb.config import Settings
 from chromadb.utils import embedding_functions
except Exception:
 # Only `chromadb` gets a sentinel; `Settings` and `embedding_functions`
 # stay undefined if the import above failed before reaching them.
 chromadb = None
# LangChain
try:
 from langchain.schema import Document
 from langchain.embeddings import SentenceTransformerEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.chains import SimpleSequentialChain, LLMChain
 from langchain.prompts import PromptTemplate
 from langchain.llms import OpenAI
except Exception:
 # No sentinels are assigned here: if the import fails, the LangChain names
 # above are simply left undefined. Install langchain to use them.
 pass

In [7]:
# Initial configuration: values a reader may want to tune.
RPP_RSS_URL = "https://rpp.pe/rss"  # RSS feed to download
MAX_ITEMS = 50  # upper bound on the number of feed entries to keep
PERSIST_DIRECTORY = "./chroma_rpp_db"  # relative directory path (presumably for the Chroma store - TODO confirm)
CHROMA_COLLECTION_NAME = "rpp_news"  # collection name (presumably for Chroma - TODO confirm)
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # default model for load_sentence_transformer()

In [18]:
import requests
import feedparser
import pandas as pd
from datetime import datetime

def fetch_rss_items(rss_url: str, max_items: int = 50, verify_ssl: bool = False, timeout: float = 10):
    """Download an RSS feed and return its entries as a DataFrame.

    Parameters
    ----------
    rss_url : str
        URL of the RSS feed.
    max_items : int
        Maximum number of entries to keep (taken from the start of the feed).
    verify_ssl : bool
        Whether to verify the server's TLS certificate. Defaults to ``False``
        to preserve the notebook's original behaviour.
        NOTE(security): disabling verification allows man-in-the-middle
        tampering of the feed; pass ``True`` whenever the environment has a
        working CA bundle.
    timeout : float
        Request timeout in seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``description``, ``link``, ``date_published``.
        On any download/parse error a message is printed and an *empty*
        frame with those same columns is returned, so downstream cells can
        rely on the schema.
    """
    columns = ["title", "description", "link", "date_published"]
    try:
        response = requests.get(rss_url, verify=verify_ssl, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
        # Hand feedparser the raw bytes so it detects the encoding itself;
        # `response.text` relies on requests' charset guess, which can
        # corrupt accented characters.
        feed = feedparser.parse(response.content)
    except Exception as exc:
        print(f"Error al descargar o parsear feed: {exc}")
        return pd.DataFrame(columns=columns)

    records = []
    for entry in feed.get("entries", [])[:max_items]:
        # Default to the raw published string; prefer an ISO timestamp when
        # a parsed time tuple is available.
        published = entry.get("published", "")
        published_parsed = entry.get("published_parsed")
        if published_parsed:
            try:
                published = datetime(*published_parsed[:6]).isoformat()
            except (TypeError, ValueError, OverflowError):
                # Best effort: keep the raw string if the tuple is malformed.
                pass
        records.append({
            "title": entry.get("title", ""),
            "description": entry.get("description", ""),
            "link": entry.get("link", ""),
            "date_published": published,
        })
    return pd.DataFrame(records, columns=columns)


In [19]:
# Fetch the latest items using the values from the configuration cell
# (RPP_RSS_URL, MAX_ITEMS) instead of re-declaring them here.
df = fetch_rss_items(RPP_RSS_URL, MAX_ITEMS)
print(f"Fetched {len(df)} items")
df.head(5)





Fetched 50 items


Unnamed: 0,title,description,link,date_published
0,Cúal fue el último temblor en México hoy 22 de...,Cuál es el ultimo temblor en México y CDMX reg...,https://rpp.pe/mundo/mexico/cual-fue-el-ultimo...,2025-10-22T11:42:35
1,"Temblor en Perú, hoy 22 de octubre: magnitud y...",Actualización EN VIVO del último sismo en Perú...,https://rpp.pe/lima/desastres-naturales/temblo...,2025-10-22T07:18:58
2,Pleno del Congreso otorga el voto de confianza...,"Con 79 votos a favor, 15 en contra y 5 abstenc...",https://rpp.pe/politica/congreso/ernesto-alvar...,2025-10-23T01:28:10
3,Tabla de posiciones de la Champions League 202...,Conoce los resultados y la tabla de posiciones...,https://rpp.pe/futbol/champions-league/tabla-d...,2025-10-23T01:55:07
4,EE.UU. sanciona a las dos principales petroler...,Las empresas Rosneft y Lukoil están especializ...,https://rpp.pe/mundo/actualidad/eeuu-sanciona-...,2025-10-23T01:50:40


In [20]:
import math

# TOKENIZATION: count tokens with tiktoken, or estimate them when it is unavailable
def count_tokens_tiktoken(text: str, model_name: str = "gpt-4o-mini") -> int:
    """
    Return the number of tokens in ``text``.

    The encoder for ``model_name`` is tried first, then the generic
    ``cl100k_base`` encoding. If tiktoken is not installed, or neither
    encoder can be obtained, a rough estimate of one token per four
    characters (never less than 1) is returned instead.
    """
    def _rough_estimate() -> int:
        # Very rough heuristic: 1 token ~ 4 characters, minimum of 1.
        return max(1, math.ceil(len(text) / 4))

    if tiktoken is None:
        return _rough_estimate()

    try:
        encoder = tiktoken.encoding_for_model(model_name)
    except Exception:
        # Unknown model name (or lookup failure): fall back to a generic encoding.
        try:
            encoder = tiktoken.get_encoding("cl100k_base")
        except Exception:
            return _rough_estimate()

    token_ids = encoder.encode(text)
    return len(token_ids)


In [22]:
# Sample text = title and description of the first RSS item, separated by a blank line
sample_text = "\n\n".join([df.loc[0, "title"] or "", df.loc[0, "description"] or ""])

# Token count of that sample (used by the chunking decision)
num_tokens = count_tokens_tiktoken(sample_text)
print(f"Sample tokens: {num_tokens}")

Sample tokens: 67


In [23]:
# Chunking decision: the sample is chunked only if it exceeds the per-chunk target
MODEL_TOKEN_LIMIT = 4096
CHUNK_TOKEN_TARGET = 1000  # target tokens per chunk for embeddings/search
needs_chunking = CHUNK_TOKEN_TARGET < num_tokens
print(f"Needs chunking? {needs_chunking}")

Needs chunking? False


In [24]:
# Chunking helper (naive words-based — can replace with tiktoken-based chunker)
def chunk_text(text: str, chunk_size_chars: int = 2000):
    """Split ``text`` into chunks of at most ``chunk_size_chars`` characters.

    Words (whitespace-separated) are never split; they are re-joined with a
    single space, so original whitespace/newlines are not preserved.

    Parameters
    ----------
    text : str
        Text to split.
    chunk_size_chars : int
        Maximum length of each joined chunk, in characters.

    Returns
    -------
    list[str]
        The chunks, in order. Empty or whitespace-only text yields ``[]``.
        A single word longer than ``chunk_size_chars`` becomes its own
        (oversized) chunk rather than being cut.
    """
    chunks = []
    current_words = []
    current_len = 0  # exact length of " ".join(current_words)
    for word in text.split():
        # A word costs its own length, plus one separator if the chunk is non-empty.
        projected_len = current_len + 1 + len(word) if current_words else len(word)
        # Only flush a non-empty chunk: this avoids emitting "" when the very
        # first word is already longer than the limit.
        if current_words and projected_len > chunk_size_chars:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len = projected_len
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


# Preview: when the sample needs chunking, report the chunk count and the
# character length of the first two chunks.
if needs_chunking:
    chunks = chunk_text(sample_text, chunk_size_chars=1500)
    print("Chunks created:", len(chunks))
    for i in range(min(2, len(chunks))):
        c = chunks[i]
        print(i, "len chars:", len(c))

In [26]:
# %%
import numpy as np

# EMBEDDING: SentenceTransformers loader
def load_sentence_transformer(model_name: str = EMBEDDING_MODEL_NAME):
    """
    Instantiate and return the SentenceTransformer model called ``model_name``.

    Raises ImportError when the sentence-transformers package could not be
    imported (i.e. the module-level ``SentenceTransformer`` name is None).
    """
    if SentenceTransformer is not None:
        return SentenceTransformer(model_name)
    raise ImportError("sentence-transformers not installed. Install from requirements.txt")


In [27]:
# Thin wrapper around a lazily-loaded sentence-embedding model
class EmbeddingModel:
    """Holds a model name and loads the underlying model on first use."""

    def __init__(self, model_name=EMBEDDING_MODEL_NAME):
        # The model itself is not loaded here; see load() / embed_texts().
        self.model = None
        self.model_name = model_name

    def load(self):
        """Load the model named ``self.model_name`` and store it on the instance."""
        self.model = load_sentence_transformer(self.model_name)

    def embed_texts(self, texts: list) -> np.ndarray:
        """Encode ``texts`` and return the result as a numpy array.

        The model is loaded on demand if load() has not been called yet.
        """
        if self.model is None:
            self.load()
        vectors = self.model.encode(texts, show_progress_bar=True)
        return np.array(vectors)



In [28]:
# Prepare documents to embed: combine title + description, chunk if needed.
# Each doc is {"id", "text", "metadata"}; unchunked items use the row index as
# id, chunked items use "<index>_chunk<n>".
docs = []
for idx, row in df.iterrows():
    text = (row["title"] or "") + "\n\n" + (row["description"] or "")
    # Only texts above the token target are split; short ones form a single piece,
    # so one code path builds the record in both cases.
    is_long = count_tokens_tiktoken(text) > CHUNK_TOKEN_TARGET
    text_chunks = chunk_text(text, chunk_size_chars=1500) if is_long else [text]
    for ci, c in enumerate(text_chunks):
        docs.append({
            "id": f"{idx}_chunk{ci}" if is_long else f"{idx}",
            "text": c,
            "metadata": {
                "orig_index": int(idx),
                "chunk_id": ci,
                "title": row["title"],
                "link": row["link"],
                "date_published": row["date_published"],
            },
        })

print("Total docs to embed:", len(docs))
# Guard the preview: `docs` is empty when the feed download failed or returned no items.
print("Example doc:", docs[0] if docs else None)


Total docs to embed: 50
Example doc: {'id': '0', 'text': 'Cúal fue el último temblor en México hoy 22 de octubre según SSN\n\nCuál es el ultimo temblor en México y CDMX registrado segun el Servicio Sismológico Nacional (SSN) hoy 22 de octubre del 2025. Consulta los últimos sismos EN VIVO para México aquí.', 'metadata': {'orig_index': 0, 'chunk_id': 0, 'title': 'Cúal fue el último temblor en México hoy 22 de octubre según SSN', 'link': 'https://rpp.pe/mundo/mexico/cual-fue-el-ultimo-temblor-en-mexico-hoy-22-de-octubre-segun-ssn-live-3013', 'date_published': '2025-10-22T11:42:35'}}


In [29]:
# %%
# Compute embeddings (this may take a while the first time)
emb = EmbeddingModel()

# Set to True to run the real model (downloads it on first use).
# When False, seeded random vectors are used as a lightweight placeholder
# for demo/testing; they carry no semantic meaning.
USE_REAL_EMBEDDINGS = False
EMBEDDING_DIM = 384  # placeholder width; presumably matches the default model's output size - TODO confirm
EMBEDDING_SEED = 42  # fixed seed so placeholder vectors are reproducible across runs

if USE_REAL_EMBEDDINGS:
    # embed_texts() loads the model on demand.
    embeddings = emb.embed_texts([d["text"] for d in docs])
else:
    _placeholder_rng = np.random.default_rng(EMBEDDING_SEED)
    embeddings = _placeholder_rng.standard_normal((len(docs), EMBEDDING_DIM)).astype(np.float32)

print("Embeddings shape:", embeddings.shape)

Embeddings shape: (50, 384)
