In [3]:
#Primero descargamos algunas dependencias necesarias:
!pip install -q wikipedia-api sentence-transformers chromadb langchain langchain-community langchain-chroma transformers torch pandas


In [4]:
# Import the dependencies:
import wikipediaapi
import csv
from pathlib import Path

# Wikipedia API client (an explicit, descriptive User-Agent is supplied):
wiki = wikipediaapi.Wikipedia(
    user_agent="JoaquinResearchBot/1.0 (mailto:joaquin@example.com)",
    language="en"
)

# Fetch page:
page = wiki.page('Federated_learning')
# Fail fast: without this check a missing/renamed page would yield empty text
# and the rest of the notebook would silently build an empty corpus.
if not page.exists():
    raise ValueError(f"Wikipedia page not found: {page.title!r}")
text = page.text
print(f'Fetched page: {page.title} — length {len(text)} characters')


# Chunking helper:
def chunk_text(text, words_per_chunk=300):
    """Split ``text`` into consecutive chunks of at most ``words_per_chunk`` words.

    Words are obtained with ``str.split()`` (any run of whitespace separates
    words) and re-joined with single spaces, so the original spacing and line
    breaks are not preserved inside a chunk.

    Args:
        text: The text to split.
        words_per_chunk: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings in their original order. The last chunk may be
        shorter than ``words_per_chunk``. Empty if ``text`` contains no words.

    Raises:
        ValueError: If ``words_per_chunk`` is less than 1.
    """
    if words_per_chunk < 1:
        # A negative step makes range() yield nothing, which would silently
        # drop the whole text; a zero step already raised ValueError.
        raise ValueError(f"words_per_chunk must be >= 1, got {words_per_chunk}")
    words = text.split()
    return [
        " ".join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]


# Split the fetched article into 300-word chunks.
chunks = chunk_text(text, 300)

# Write the chunks to a CSV file with columns id, title, text:
out_path = Path('/mnt/data/wiki_corpus.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)

# One row per chunk; ids are "fed_<position>" and every row carries the page title.
rows = (
    {'id': f'fed_{position}', 'title': page.title, 'text': piece}
    for position, piece in enumerate(chunks)
)

with out_path.open('w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=['id', 'title', 'text'])
    csv_writer.writeheader()
    csv_writer.writerows(rows)

print("CSV saved at:", out_path)


Fetched page: Federated learning — length 31699 characters
CSV saved at: \mnt\data\wiki_corpus.csv


In [5]:
# Embedding + Vector Store (outline). Run after installing packages.
from sentence_transformers import SentenceTransformer
import chromadb
import pandas as pd
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# Load the corpus from the same path the chunking cell writes it to.
df = pd.read_csv('/mnt/data/wiki_corpus.csv')
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

# Use an on-disk Chroma client at the directory the RAG cell opens
# (persist_directory="/mnt/data/chroma_db"); an in-memory client would leave
# that store empty and the retriever would return nothing.
client = chromadb.PersistentClient(path='/mnt/data/chroma_db')
# get_or_create + upsert keep this cell re-runnable: no "collection already
# exists" error and no duplicate-id failures on a second execution.
collection = client.get_or_create_collection('wiki_ai')
collection.upsert(
    ids=df['id'].tolist(),
    # The chunk id is stored in the metadata as well so it is available on
    # retrieved documents (the export cell reads metadata["id"]).
    metadatas=[{'id': chunk_id, 'title': title} for chunk_id, title in zip(df['id'].tolist(), df['title'].tolist())],
    documents=df['text'].tolist(),
    embeddings=embeddings.tolist(),
)
print('Upserted', len(df), 'chunks to ChromaDB')

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserted 15 chunks to ChromaDB


In [6]:
# RAG Pipeline 
from langchain_community.llms import Ollama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt text: restricts the model to the supplied context, defines the
# fallback sentence, and requests a 400–500 word answer.
template = """You are an expert in federated learning. Use only the following context to answer the question.
If the context doesn't contain the answer, say "I don't know based on the provided data."

Context:
{context}

Question: {question}
Answer (detailed, 400–500 words):"""

# 1. Embedding function used by the vector store to embed queries.
emb_fn = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# 2. Vector store: the "wiki_ai" collection under /mnt/data/chroma_db.
db = Chroma(
    collection_name="wiki_ai",
    embedding_function=emb_fn,
    persist_directory="/mnt/data/chroma_db",
)

# 3. Retriever returning the 5 best-matching chunks per query.
retriever = db.as_retriever(search_kwargs=dict(k=5))

# 4. LLM: the "mistral" model through the Ollama wrapper.
llm = Ollama(model="mistral", temperature=0.3)

# 5. Custom prompt built from the template above.
prompt = PromptTemplate.from_template(template)

# RAG Chain (SIN langchain.chains)
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Context branch: retrieve documents for the incoming question and flatten
# them into a single string.
retrieve_context = retriever | format_docs

# The question is passed through unchanged next to the retrieved context;
# both feed the prompt, whose output goes to the LLM and is parsed to a string.
chain_inputs = {"context": retrieve_context, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()


  emb_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


RAG Chain listo (sin langchain.chains)!


  llm = Ollama(model="mistral", temperature=0.3)


In [None]:
# Generamos rag_summary.md (400–500 palabras)
from pathlib import Path
import json

from datetime import date  # local to this cell: stamps the summary with the actual run date

query = "Explain federated learning challenges in healthcare."

print(f"Consultando: '{query}'")
raw_docs = retriever.invoke(query)
context = format_docs(raw_docs)
answer = rag_chain.invoke(query)
# Tracks the query that actually produced `answer`, so the report stays truthful.
answer_query = query

# Ensure the answer is long enough; otherwise retry once with a more explicit query.
word_count = len(answer.split())
if word_count < 400:
    print(f"Ampliando respuesta ({word_count} palabras)...")
    extended_query = "Provide a detailed 450-word summary of federated learning challenges in healthcare, including privacy, data heterogeneity, and regulatory issues."
    answer = rag_chain.invoke(extended_query)
    answer_query = extended_query
    # The chain retrieves with the text it is invoked with, so refresh the
    # retrieved documents; otherwise the summary header and the retrieval
    # export would describe chunks that were not used for this answer.
    raw_docs = retriever.invoke(extended_query)
    context = format_docs(raw_docs)

# Real generation date instead of a hard-coded one
# (month name follows the active locale).
generated_on = date.today().strftime("%B %d, %Y")

# Save the summary
summary_md = f"""# RAG Summary: Federated Learning Challenges in Healthcare

**Query**: {answer_query}  
**Word Count**: {len(answer.split())}  
**Retrieved Chunks**: {len(raw_docs)}  
**Generated on**: {generated_on}

---

{answer}

---

*Source: Wikipedia via RAG (Manual Chain, ChromaDB, Ollama-Mistral)*  
*No se usó `langchain.chains`*
"""

output_dir = Path("/mnt/data/outputs")
output_dir.mkdir(parents=True, exist_ok=True)
summary_path = output_dir / "rag_summary.md"
summary_path.write_text(summary_md, encoding='utf-8')
print(f"Resumen guardado: {summary_path}")

# Save the retrieval examples
preview_chars = 200
retrieval_examples = [
    {
        # Prefer the id stored in the metadata; fall back to an `id` attribute
        # on the document if one is present, and only then to "unknown".
        "chunk_id": doc.metadata.get("id") or getattr(doc, "id", None) or "unknown",
        "title": doc.metadata.get("title", "Federated Learning"),
        # Add the ellipsis only when the text was actually truncated.
        "text_preview": (
            doc.page_content[:preview_chars] + "..."
            if len(doc.page_content) > preview_chars
            else doc.page_content
        )
    }
    for doc in raw_docs
]

examples_path = output_dir / "retrieval_examples.json"
examples_path.write_text(json.dumps(retrieval_examples, indent=2), encoding='utf-8')
