In [7]:
# -*- coding: utf-8 -*-
"""
Experimento de neighbor explosion (dirigido) con Neo4j + py2neo

- Muestra N semillas (nodos objetivo)
- Para cada semilla, cuenta frontier sizes a 1..L hops (dirigido)
- Ajusta dos modelos:
  A) log N_l = log a + b * l * log deg_out(seed)
  B) log N_l = log alpha + l * log m
"""

from py2neo import Graph
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
import math
import time

# =========================
# CONFIGURACIÓN DEL EXPERIMENTO
# =========================

NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASS = "Banco.69"

# Dirección del grafo para la expansión:
# "OUT" para (s)-[*]->(n), "IN" para (s)<-[*]-(n), "BOTH" para no dirigido.
DIRECTION = "OUT"

# Etiqueta(s) y tipo(s) de arista si quieres restringir (opcional)
NODE_LABEL = "Account"          # e.g., "Account"  (o None para todos)
REL_TYPES = "TX"           # e.g., ["TRANSFER", "PAYS"]  (o None para todos)

# Semillas y hops
N_SEEDS = 1000             # número de semillas a muestrear
MAX_HOPS = 3               # p.ej. 3 o 4
REPEAT_EXPERIMENTS = 1     # si quieres repetir el muestreo varias veces (dejamos 1 = una pasada)

# Si tienes APOC y quieres usarlo (un poco más eficiente), pon True
USE_APOC = False

# Randomize con ORDER BY rand() LIMIT N (simple; suficiente para el experimento)
# Si tu grafo es gigantesco y hay sesgo, podrías estratificar por grado o por etiqueta.


# =========================
# CONEXIÓN
# =========================

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))

# Verifica APOC si quieres usarlo
if USE_APOC:
    try:
        _ = graph.run("RETURN apoc.version() AS v").to_data_frame()
    except Exception as e:
        print("APOC no disponible o no habilitado. Cambiando a Cypher puro.")
        USE_APOC = False


# =========================
# HELPERS DE CYPHER
# =========================

def cypher_rel_type_pattern(rel_types):
    """Construye el patrón de tipos de relación para Cypher."""
    if not rel_types:
        return ""
    # Devuelve: :T1|:T2|:T3
    return ":" + "|:".join(rel_types)

REL_PATTERN = cypher_rel_type_pattern(REL_TYPES)

def seed_query(limit, node_label=None):
    if node_label:
        return f"""
        MATCH (n:`{node_label}`)
        WITH n ORDER BY rand()
        LIMIT {limit}
        RETURN id(n) AS id
        """
    else:
        return f"""
        MATCH (n)
        WITH n ORDER BY rand()
        LIMIT {limit}
        RETURN id(n) AS id
        """

def out_degree_query(node_id, rel_pattern):
    # out-degree de la semilla (dirigido saliente)
    if rel_pattern:
        return f"""
        MATCH (n)-[r{rel_pattern}]->()
        WHERE id(n) = {node_id}
        RETURN count(r) AS deg
        """
    else:
        return f"""
        MATCH (n)-[r]->()
        WHERE id(n) = {node_id}
        RETURN count(r) AS deg
        """

def frontier_count_cypher(seed_id, hop, direction="OUT", rel_pattern=""):
    """
    Cuenta nodos a distancia EXACTA = hop (no acumulado), dirigidos según 'direction'.
    Usamos DISTINCT para evitar repeticiones.
    IMPORTANTE: la longitud *hop* se fija en el patrón (no se puede parametrizar directamente).
    """
    if direction.upper() == "OUT":
        arrow = "->"
        left = "-"
        right = "->"
        pattern = f"-[r{rel_pattern}*{hop}..{hop}]->"
    elif direction.upper() == "IN":
        arrow = "<-"
        left = "<-"
        right = "-"
        pattern = f"<-[r{rel_pattern}*{hop}..{hop}]-"
    else:  # BOTH (no dirigido)
        pattern = f"-[r{rel_pattern}*{hop}..{hop}]-"

    q = f"""
    MATCH (s)
    WHERE id(s) = {seed_id}
    MATCH (s){pattern}(n)
    WITH DISTINCT n
    RETURN count(n) AS frontier_size
    """
    return q

def frontier_count_apoc(seed_id, hop, direction="OUT", rel_pattern=""):
    """
    Versión con APOC (apoc.path.expandConfig) para exact hop.
    Uniqueness global de nodos para evitar ciclos.
    """
    # APOC usa minLevel/maxLevel; para "exact hop" contamos nodos con esa distancia exacta.
    # expandConfig con bfs:true y uniqueness:'NODE_GLOBAL' evita revisitar.
    dir_map = {"OUT":"OUTGOING","IN":"INCOMING","BOTH":"BOTH"}
    apoc_dir = dir_map.get(direction.upper(), "OUTGOING")

    # Tipos de relación
    rels = ""
    if rel_pattern:
        # rel_pattern = ":T1|:T2" -> apoc quiere lista ["T1","T2"]
        types = [t.replace(":", "") for t in rel_pattern.split("|")]
        rels = f", relationshipFilter:'{'|'.join(types)}'"

    q = f"""
    MATCH (s) WHERE id(s) = {seed_id}
    CALL apoc.path.expandConfig(s, {{
        minLevel:{hop}, maxLevel:{hop},
        bfs:true, uniqueness:'NODE_GLOBAL',
        filterStartNode:true,
        {'' if not rels else rels}
        direction:'{apoc_dir}'
    }}) YIELD path
    WITH last(nodes(path)) AS n
    RETURN count(DISTINCT n) AS frontier_size
    """
    return q


# =========================
# EXPERIMENTO
# =========================

def sample_seeds(n, node_label=None):
    return graph.run(seed_query(n, node_label)).to_data_frame()["id"].tolist()

def get_out_degree(node_id):
    return int(graph.run(out_degree_query(node_id, REL_PATTERN)).to_data_frame()["deg"].iloc[0])

def count_frontier(seed_id, hop):
    if USE_APOC:
        q = frontier_count_apoc(seed_id, hop, DIRECTION, REL_PATTERN)
    else:
        q = frontier_count_cypher(seed_id, hop, DIRECTION, REL_PATTERN)
    df = graph.run(q).to_data_frame()
    return int(df["frontier_size"].iloc[0])




In [8]:
# === CONFIG EXTRA ===
USE_ELEMENT_ID = True  # True: usa elementId(n); False: usa id(n)
SEED_PROP = None       # Si prefieres muestrear por una propiedad (ej. 'numeroCuenta'), deja None para aleatorio

def seed_query(limit, node_label=None):
    ident = "elementId(n) AS seed_eid" if USE_ELEMENT_ID else "id(n) AS seed_id"
    label_part = f":`{node_label}`" if node_label else ""
    if SEED_PROP:
        # Muestreo por propiedad existente (ej. numeroCuenta) si quieres controlarlo
        return f"""
        MATCH (n{label_part})
        WHERE exists(n.`{SEED_PROP}`)
        WITH n ORDER BY rand() LIMIT {limit}
        RETURN {ident}
        """
    else:
        return f"""
        MATCH (n{label_part})
        WITH n ORDER BY rand() LIMIT {limit}
        RETURN {ident}
        """

def sample_seeds(n, node_label=None):
    df = graph.run(seed_query(n, node_label)).to_data_frame()
    # Intenta ambas columnas por si acaso
    if USE_ELEMENT_ID:
        col = "seed_eid" if "seed_eid" in df.columns else list(df.columns)[0]
        return df[col].tolist()
    else:
        col = "seed_id" if "seed_id" in df.columns else list(df.columns)[0]
        return df[col].tolist()

def out_degree_query_eid(seed_eid, rel_pattern):
    # OUT-degree usando elementId
    if rel_pattern:
        return f"""
        MATCH (n)-[r{rel_pattern}]->()
        WHERE elementId(n) = '{seed_eid}'
        RETURN count(r) AS deg
        """
    else:
        return f"""
        MATCH (n)-[r]->()
        WHERE elementId(n) = '{seed_eid}'
        RETURN count(r) AS deg
        """

def out_degree_query_id(seed_id, rel_pattern):
    # OUT-degree usando id()
    if rel_pattern:
        return f"""
        MATCH (n)-[r{rel_pattern}]->()
        WHERE id(n) = {int(seed_id)}
        RETURN count(r) AS deg
        """
    else:
        return f"""
        MATCH (n)-[r]->()
        WHERE id(n) = {int(seed_id)}
        RETURN count(r) AS deg
        """

def get_out_degree(seed_identifier):
    if USE_ELEMENT_ID:
        q = out_degree_query_eid(seed_identifier, REL_PATTERN)
    else:
        q = out_degree_query_id(seed_identifier, REL_PATTERN)
    return int(graph.run(q).to_data_frame()["deg"].iloc[0])

def frontier_count_cypher_eid(seed_eid, hop, direction="OUT", rel_pattern=""):
    if direction.upper() == "OUT":
        pattern = f"-[r{rel_pattern}*{hop}..{hop}]->"
    elif direction.upper() == "IN":
        pattern = f"<-[r{rel_pattern}*{hop}..{hop}]-"
    else:
        pattern = f"-[r{rel_pattern}*{hop}..{hop}]-"
    return f"""
    MATCH (s) WHERE elementId(s) = '{seed_eid}'
    MATCH (s){pattern}(n)
    WITH DISTINCT n
    RETURN count(n) AS frontier_size
    """

def frontier_count_cypher_id(seed_id, hop, direction="OUT", rel_pattern=""):
    if direction.upper() == "OUT":
        pattern = f"-[r{rel_pattern}*{hop}..{hop}]->"
    elif direction.upper() == "IN":
        pattern = f"<-[r{rel_pattern}*{hop}..{hop}]-"
    else:
        pattern = f"-[r{rel_pattern}*{hop}..{hop}]-"
    return f"""
    MATCH (s) WHERE id(s) = {int(seed_id)}
    MATCH (s){pattern}(n)
    WITH DISTINCT n
    RETURN count(n) AS frontier_size
    """

def frontier_count(seed_identifier, hop):
    if USE_APOC:
        # versión APOC con elementId/id:
        dir_map = {"OUT":"OUTGOING","IN":"INCOMING","BOTH":"BOTH"}
        apoc_dir = dir_map.get(DIRECTION.upper(), "OUTGOING")
        rels = ""
        if REL_PATTERN:
            types = [t.replace(":", "") for t in REL_PATTERN.split("|")]
            rels = f", relationshipFilter:'{'|'.join(types)}'"
        if USE_ELEMENT_ID:
            q = f"""
            MATCH (s) WHERE elementId(s) = '{seed_identifier}'
            CALL apoc.path.expandConfig(s, {{
                minLevel:{hop}, maxLevel:{hop},
                bfs:true, uniqueness:'NODE_GLOBAL',
                filterStartNode:true,
                direction:'{apoc_dir}'{rels}
            }}) YIELD path
            WITH last(nodes(path)) AS n
            RETURN count(DISTINCT n) AS frontier_size
            """
        else:
            q = f"""
            MATCH (s) WHERE id(s) = {int(seed_identifier)}
            CALL apoc.path.expandConfig(s, {{
                minLevel:{hop}, maxLevel:{hop},
                bfs:true, uniqueness:'NODE_GLOBAL',
                filterStartNode:true,
                direction:'{apoc_dir}'{rels}
            }}) YIELD path
            WITH last(nodes(path)) AS n
            RETURN count(DISTINCT n) AS frontier_size
            """
        return int(graph.run(q).to_data_frame()["frontier_size"].iloc[0])
    else:
        # Cypher puro
        if USE_ELEMENT_ID:
            q = frontier_count_cypher_eid(seed_identifier, hop, DIRECTION, REL_PATTERN)
        else:
            q = frontier_count_cypher_id(seed_identifier, hop, DIRECTION, REL_PATTERN)
        return int(graph.run(q).to_data_frame()["frontier_size"].iloc[0])

# Ejemplo de uso (idéntico al bucle anterior):
# seeds = sample_seeds(N_SEEDS, NODE_LABEL)
# for sid in seeds:
#     deg_out = get_out_degree(sid)
#     for l in range(1, MAX_HOPS+1):
#         frontier_sz = frontier_count(sid, l)


In [9]:
all_records = []

start = time.time()
for rep in range(REPEAT_EXPERIMENTS):
    print(f"[Iteración {rep+1}/{REPEAT_EXPERIMENTS}] Muestreando {N_SEEDS} semillas…")
    seeds = sample_seeds(N_SEEDS, NODE_LABEL)

    for sid in tqdm(seeds, desc="Semillas"):
        try:
            deg_out = get_out_degree(sid)
        except Exception as e:
            # si algo falla con esta semilla, saltarla
            continue

        # Recolectar tamaños del frente para hops 1..MAX_HOPS
        for l in range(1, MAX_HOPS + 1):
            try:
                frontier_sz = count_frontier(sid, l)
            except Exception as e:
                frontier_sz = None

            all_records.append({
                "seed_id": sid,
                "hop": l,
                "frontier_size": frontier_sz,
                "deg_out_seed": deg_out,
                "direction": DIRECTION
            })

elapsed = time.time() - start
print(f"Listo. Recolectadas {len(all_records)} filas en {elapsed:.1f}s.")

df = pd.DataFrame(all_records).dropna(subset=["frontier_size"])
df.head()
print(df.groupby("hop")["frontier_size"].describe())


# =========================
# AJUSTE DE MODELOS
# =========================

# --- Modelo A (tu hipótesis):
# log N_l = log a + b * l * log(deg_out_seed)
# -> y = c0 + c1 * X, donde X = l * log(deg_out_seed)

df_A = df.copy()
# evitar log(0): reemplaza deg=0 por 1 (o filtra)
df_A = df_A[df_A["deg_out_seed"] > 0].copy()
df_A["X"] = df_A["hop"] * np.log(df_A["deg_out_seed"])
df_A["y"] = np.log(np.clip(df_A["frontier_size"].astype(float), 1.0, None))

reg_A = LinearRegression().fit(df_A[["X"]], df_A["y"])
c0_A = reg_A.intercept_
c1_A = reg_A.coef_[0]

# Parámetros interpretables
a_hat = math.exp(c0_A)
b_hat = c1_A  # porque y = log a + b * l * log deg

# R^2
r2_A = reg_A.score(df_A[["X"]], df_A["y"])

print("\n=== Modelo A (tu hipótesis) ===")
print(f"log N_l = log a + b * l * log deg_out(seed)")
print(f"a_hat = {a_hat:.4f}")
print(f"b_hat = {b_hat:.4f}")
print(f"R^2   = {r2_A:.4f}")

# --- Modelo B (branching/exponencial, global):
# log N_l = log alpha + l * log m
# -> y = c0 + c1 * l
df_B = df.copy()
df_B["y"] = np.log(np.clip(df_B["frontier_size"].astype(float), 1.0, None))
df_B["l"] = df_B["hop"].astype(float)

reg_B = LinearRegression().fit(df_B[["l"]], df_B["y"])
c0_B = reg_B.intercept_
c1_B = reg_B.coef_[0]

alpha_hat = math.exp(c0_B)
m_hat = math.exp(c1_B)
r2_B = reg_B.score(df_B[["l"]], df_B["y"])

print("\n=== Modelo B (branching/exponencial) ===")
print(f"log N_l = log alpha + l * log m")
print(f"alpha_hat = {alpha_hat:.4f}")
print(f"m_hat     = {m_hat:.4f}  (factor de ramificación promedio por hop)")
print(f"R^2       = {r2_B:.4f}")

# =========================
# VARIANTE DIRIGIDA (estimador size-biased de m)
# =========================
# Estimar m como el out-degree medio de nodos alcanzados siguiendo una arista saliente "al azar"
# (size-biased por in-degree). Lo calculamos muestreando M aristas salientes de semillas.

M_EDGES_PER_SEED = 5  # ajusta si quieres
sampled_out_degs = []

print("\nEstimando m (dirigido, size-biased) por muestreo de aristas salientes…")
for sid in tqdm(df["seed_id"].drop_duplicates().tolist()[:N_SEEDS], desc="Semillas para m"):
    # vecinos de hop=1
    if USE_APOC:
        q1 = frontier_count_apoc(sid, 1, DIRECTION, REL_PATTERN).replace("RETURN count(DISTINCT n) AS frontier_size",
            "RETURN collect(DISTINCT id(n)) AS nbrs")
    else:
        # Cypher básico para obtener vecinos (no solo el count)
        if DIRECTION.upper() == "OUT":
            pattern = f"-[r{REL_PATTERN}]->"
        elif DIRECTION.upper() == "IN":
            pattern = f"<-[r{REL_PATTERN}]-"
        else:
            pattern = f"-[r{REL_PATTERN}]-"
        q1 = f"""
        MATCH (s) WHERE id(s) = {sid}
        MATCH (s){pattern}(n)
        RETURN collect(DISTINCT id(n)) AS nbrs
        """
    try:
        nbrs = graph.run(q1).to_data_frame()["nbrs"].iloc[0]
    except Exception:
        continue
    if not nbrs:
        continue

    # muestrear algunos vecinos
    take = min(M_EDGES_PER_SEED, len(nbrs))
    pick = np.random.choice(nbrs, size=take, replace=False)

    # out-degree de esos vecinos (dirigido OUT)
    for vid in pick:
        try:
            deg_v = get_out_degree(int(vid))
            sampled_out_degs.append(deg_v)
        except Exception:
            pass

if len(sampled_out_degs) > 0:
    m_size_biased = float(np.mean(sampled_out_degs))
    print(f"m (size-biased, estimado) ≈ {m_size_biased:.4f}")
    # Comparar con m_hat del modelo B
    print(f"Comparación: m_hat (regresión global) = {m_hat:.4f}")
else:
    print("No se pudo estimar m_size_biased (sin vecinos muestreados).")

# =========================
# SALIDA
# =========================

# Guardar resultados a disco
df.to_csv("neighbor_explosion_samples.csv", index=False)
print("\nGuardado: neighbor_explosion_samples.csv")

# Un resumen útil por hop
summary = df.groupby("hop")["frontier_size"].agg(["count","mean","median","std","min","max"]).reset_index()
print("\nResumen por hop:")
print(summary.to_string(index=False))

summary.to_csv("neighbor_explosion_summary.csv", index=False)
print("Guardado: neighbor_explosion_summary.csv")

print("\nHecho.")

[Iteración 1/1] Muestreando 1000 semillas…


IndexError: list index out of range