# GraphRAG

In [None]:
import os
import asyncio
import textwrap
import tiktoken
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

# --- GraphRAG imports (API + utils) ---
import graphrag.api as api
from graphrag.config.load_config import load_config
from graphrag.vector_stores.lancedb import LanceDBVectorStore
from graphrag.vector_stores.base import VectorStoreDocument
from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey

# (Para local search con clases de bajo nivel)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.query.structured_search.local_search.mixed_context import LocalSearchMixedContext
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
    read_indexer_covariates,
)

In [None]:
# ------------------------------------------------------------
# 1) Prepare the GraphRAG workspace
# ------------------------------------------------------------
# Community level; use the same value as in the global search.
LEVEL = 2

# Workspace layout: everything lives under ./graphrag_ws (made absolute).
ROOT = Path("./graphrag_ws").resolve()
INPUT, OUTPUT, CACHE, LANCEDB = (
    ROOT / name for name in ("input", "output", "cache", "lancedb")
)
LANCEDB_URI = LANCEDB.as_posix()

# Create the root first (with parents), then each sub-directory.
ROOT.mkdir(parents=True, exist_ok=True)
for _workspace_dir in (INPUT, OUTPUT, CACHE, LANCEDB):
    _workspace_dir.mkdir(exist_ok=True)

In [None]:
# Seed the workspace with a small example document; your own .txt files can
# be dropped into the workspace's input/ directory alongside it.
_raw_sample = """
    Charles Dickens wrote A Christmas Carol. Scrooge is visited by three ghosts.
    Bob Cratchit works for Scrooge. Tiny Tim is Bob Cratchit's son.
"""
# Remove the common indentation and the surrounding blank lines.
sample = textwrap.dedent(_raw_sample).strip()
_demo_file = INPUT / "demo.txt"
_demo_file.write_text(sample, encoding="utf-8")

In [None]:
# Minimal settings.yaml (OpenAI). For Azure OpenAI, replace the `models`
# block as described in the docs.
# Configuration docs: models / input / output / vector_store (lancedb by default).
# https://microsoft.github.io/graphrag/config/yaml/
#
# Inside the f-string, `${{GRAPHRAG_API_KEY}}` is written out as the literal
# placeholder `${GRAPHRAG_API_KEY}`; only the LanceDB path is interpolated here.
settings = f"""
models:
  default_chat_model:
    api_key: ${{GRAPHRAG_API_KEY}}
    type: openai_chat
    model: gpt-4o
    model_supports_json: true
  default_embedding_model:
    api_key: ${{GRAPHRAG_API_KEY}}
    type: openai_embedding
    model: text-embedding-3-large
input:
  type: file
  base_dir: input
  file_type: text
chunks:
  size: 1200
  overlap: 150
output:
  type: file
  base_dir: output
cache:
  type: file
  base_dir: cache
vector_store:
  default_vector_store:
    type: lancedb
    db_uri: {LANCEDB.as_posix()}
    container_name: default
"""
# Write the template without its leading/trailing blank lines.
_settings_yaml = settings.strip()
_settings_file = ROOT / "settings.yaml"
_settings_file.write_text(_settings_yaml, encoding="utf-8")

In [None]:
# ------------------------------------------------------------
# 2) Index with the GraphRAG API (equivalent to `graphrag index --root ...`)
# ------------------------------------------------------------
async def build_index():
    """Run the indexing pipeline for the workspace and report each workflow.

    Loads the configuration from ``ROOT``, awaits ``api.build_index`` and
    prints one status line per returned workflow result, followed by that
    workflow's errors when it has any.

    Returns:
        The loaded configuration object, so later steps can reuse it.
    """
    config = load_config(ROOT)  # reads settings.yaml + .env
    workflow_results = await api.build_index(config=config)
    for result in workflow_results:
        status = "ERROR" if result.errors else "OK"
        print(f"[INDEX] {result.workflow}: {status}")
        # `or ()` keeps the loop a no-op when there are no errors to list.
        for error in result.errors or ():
            print("   ->", error)
    return config

In [None]:
def _ensure_entity_description_embeddings(entities_df, store, embedder):
    """Populate the entity-description collection of a vector store.

    One document is built per entity row. Its text is the entity description,
    falling back to the title; rows with neither are skipped. Its vector is
    taken from a ``description_embedding`` column when the row holds a usable
    one, and is otherwise computed with ``embedder.embed``.

    Args:
        entities_df: DataFrame of entities with an ``id`` column and,
            optionally, ``title``, ``description`` and
            ``description_embedding`` columns (names are matched
            case-insensitively).
        store: Connected vector store exposing ``load_documents``.
        embedder: Embedding model exposing a synchronous ``embed(text)``.

    Returns:
        None. When at least one document was built, the collection is
        written with ``overwrite=True``; otherwise the store is untouched.

    Raises:
        KeyError: If the DataFrame has no ``id`` column.
    """

    def _text_of(value):
        # Missing cells can arrive as None, NaN or pd.NA. Without this guard
        # NaN would be stringified to the literal text "nan" and embedded,
        # and pd.NA would raise when evaluated in a boolean context.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value or "")

    def _vector_of(value):
        # A stored embedding must be a non-empty sequence of numbers; any
        # scalar (None / NaN placeholder) means "nothing stored for this row".
        if value is None or pd.api.types.is_scalar(value):
            return None
        # Normalise to a plain list of floats, the same shape of value the
        # embedder fallback below is expected to produce.
        vector = [float(component) for component in value]
        return vector or None

    # Map lower-cased column names to their actual spelling so lookups are
    # case-insensitive.
    cols = {c.lower(): c for c in entities_df.columns}
    col_id = cols.get("id", "id")
    col_title = cols.get("title", "title")
    col_desc = cols.get("description", "description")
    col_desc_emb = cols.get("description_embedding")  # may be absent

    docs = []
    for _, row in entities_df.iterrows():
        ent_id = str(row[col_id])
        title = _text_of(row.get(col_title, ""))
        desc = _text_of(row.get(col_desc, ""))
        text = (desc or title).strip()
        if not text:
            continue

        vec = _vector_of(row.get(col_desc_emb)) if col_desc_emb else None
        if vec is None:
            # No usable stored embedding: compute one synchronously.
            vec = embedder.embed(text)

        docs.append(
            VectorStoreDocument(
                id=ent_id,
                text=text,
                vector=vec,
                attributes={"title": title},
            )
        )

    if docs:
        # Create or overwrite the collection with these documents.
        store.load_documents(documents=docs, overwrite=True)

In [None]:
# ------------------------------------------------------------
# 3) Queries
#   a) Global Search (dataset-wide, over community reports)
#   b) Local Search (entity-centric, mixes knowledge graph + passages)
#   API/notebook references:
#     - Global: api.global_search + Query/Global Search docs
#     - Local: LocalSearch + LocalSearchMixedContext classes + notebooks
# ------------------------------------------------------------
def _build_chat_config(cfg, api_key):
    """Return ``(model_type, LanguageModelConfig)`` for the default chat model in ``cfg``."""
    chat_settings = cfg.models["default_chat_model"]
    chat_type = (chat_settings.type).lower()

    if chat_type == "openai_chat":
        model_type = ModelType.OpenAIChat
        return model_type, LanguageModelConfig(
            api_key=api_key,
            type=model_type,
            model=chat_settings.model,
            max_retries=20,
        )

    if chat_type == "azure_openai_chat":
        model_type = ModelType.AzureOpenAIChat
        # Read the Azure fields with attribute access, the same way `type`
        # and `model` are read above; the config object is not subscripted.
        return model_type, LanguageModelConfig(
            api_key=api_key,
            type=model_type,
            model=chat_settings.model,  # optional depending on your setup
            api_base=chat_settings.api_base,
            api_version=chat_settings.api_version,
            deployment_name=chat_settings.deployment_name,
            max_retries=20,
        )

    raise ValueError(f"Tipo de chat no soportado: {chat_type}")


async def run_queries(cfg):
    """Run one global search and one local search over the indexed workspace.

    Reads the parquet artifacts from ``OUTPUT``, runs ``api.global_search``
    and prints its response, then assembles a ``LocalSearch`` from the
    low-level classes (creating the entity-description embedding collection
    in LanceDB when it is missing) and prints that response as well.

    Args:
        cfg: Configuration object returned by ``load_config``; its
            ``models["default_chat_model"]`` entry selects the chat model.

    Raises:
        KeyError: If ``GRAPHRAG_API_KEY`` is not set in the environment.
        ValueError: If the configured chat model type is neither
            ``openai_chat`` nor ``azure_openai_chat``.
    """
    # --- GLOBAL SEARCH ---
    entities_df = pd.read_parquet(OUTPUT / "entities.parquet")
    communities_df = pd.read_parquet(OUTPUT / "communities.parquet")
    community_reports_df = pd.read_parquet(OUTPUT / "community_reports.parquet")

    # The returned context is not used here; only the response is printed.
    response_glob, _context_glob = await api.global_search(
        config=cfg,
        entities=entities_df,
        communities=communities_df,
        community_reports=community_reports_df,
        community_level=LEVEL,
        dynamic_community_selection=False,
        response_type="Multiple Paragraphs",
        query="¿Cuáles son los temas principales del texto?",
    )
    print("\n=== GLOBAL SEARCH ===")
    print(response_glob)

    # --- LOCAL SEARCH ---
    # 1) DataFrames for every artifact (the ones already loaded are reused).
    relationship_df = pd.read_parquet(OUTPUT / "relationships.parquet")
    text_unit_df = pd.read_parquet(OUTPUT / "text_units.parquet")

    # Covariates are optional: the file may not have been produced.
    try:
        covariate_df = pd.read_parquet(OUTPUT / "covariates.parquet")
    except FileNotFoundError:
        covariate_df = None

    # 2) Typed adapters (mind their signatures).
    entities = read_indexer_entities(entities_df, communities_df, LEVEL)
    relationships = read_indexer_relationships(relationship_df)
    reports = read_indexer_reports(community_reports_df, communities_df, LEVEL)
    text_units = read_indexer_text_units(text_unit_df)
    covariates = (
        {"claims": read_indexer_covariates(covariate_df)}
        if covariate_df is not None
        else None
    )

    api_key = os.environ["GRAPHRAG_API_KEY"]
    llm_model = "gpt-4o"  # only used to pick the token encoder
    embedding_model = "text-embedding-3-large"  # must match settings.yaml

    # Local name on purpose: avoids shadowing the module-level LANCEDB_URI.
    lancedb_uri = LANCEDB.as_posix()

    # NOTE(review): this name is meant to match the collection written by the
    # indexer, but that name may differ between GraphRAG versions - confirm.
    # When the collection is not found it is created further below.
    description_embedding_store = LanceDBVectorStore(
        collection_name="entity_description_embeddings"
    )
    description_embedding_store.connect(db_uri=lancedb_uri)

    embed_config = LanguageModelConfig(
        api_key=api_key,
        type=ModelType.OpenAIEmbedding,
        model=embedding_model,
        max_retries=20,
    )

    mm = ModelManager()
    text_embedder = mm.get_or_create_embedding_model(
        name="local_search_embedding",
        model_type=ModelType.OpenAIEmbedding,
        config=embed_config,
    )
    token_encoder = tiktoken.encoding_for_model(llm_model)  # or tiktoken.get_encoding("cl100k_base")

    chat_model_type, chat_config = _build_chat_config(cfg, api_key)
    chat_model = mm.get_or_create_chat_model(
        name="local_search_chat",
        model_type=chat_model_type,
        config=chat_config,
    )

    # Create the embedding collection when connecting did not find one.
    if getattr(description_embedding_store, "document_collection", None) is None:
        print("[INFO] Creando colección de embeddings de entidades en LanceDB…")
        _ensure_entity_description_embeddings(entities_df, description_embedding_store, text_embedder)

    # Quick probe: the collection should now exist and answer a search.
    probe_vec = text_embedder.embed("Scrooge")
    _ = description_embedding_store.similarity_search_by_vector(probe_vec, k=1)
    print("[OK] La colección de embeddings de entidades responde a búsquedas.")

    # 3) Context builder and local search.
    ctx_builder = LocalSearchMixedContext(
        community_reports=reports,
        text_units=text_units,
        entities=entities,
        relationships=relationships,
        covariates=covariates,  # None when no covariates file exists
        entity_text_embeddings=description_embedding_store,
        # Embeddings are keyed by the entity id here; if they were indexed by
        # entity title instead, use EntityVectorStoreKey.TITLE.
        embedding_vectorstore_key=EntityVectorStoreKey.ID,
        text_embedder=text_embedder,
        token_encoder=token_encoder,
    )
    local = LocalSearch(model=chat_model, context_builder=ctx_builder)

    print(f"Local search object: {local}")

    result_local = await local.search(
        query="¿Quién es Scrooge y qué relaciones clave tiene?"
    )
    print("\n=== LOCAL SEARCH ===")
    print(result_local.response)

In [None]:
async def main():
    """Build the index, then run the demo queries with the resulting config."""
    await run_queries(await build_index())

In [None]:
# Entry point of the notebook: runs indexing followed by the queries.
# Top-level `await` relies on the event loop the notebook kernel already
# runs; in a plain Python script this would be `asyncio.run(main())`.
await main()