In [1]:
!git clone https://github.com/Jkoeppens/Leak-Project
%cd leak-project
!pip install -r requirements.txt

from google.colab import drive
drive.mount("/content/drive")

# Deine persönliche local.yaml in Drive ablegen:
local_yaml_drive = "/content/drive/MyDrive/leak-project/config/local.yaml"
import os, textwrap, pathlib
pathlib.Path(local_yaml_drive).parent.mkdir(parents=True, exist_ok=True)
with open(local_yaml_drive, "w") as f:
    f.write(textwrap.dedent("""\
    paths:
      root: "/content/drive/MyDrive/leak-project"
    """))

# Link/Copy (einfachste Variante): local.yaml ins Repo-config spiegeln
!cp /content/drive/MyDrive/leak-project/config/local.yaml config/local.yaml

from pipe.config import load_config
cfg = load_config()
cfg["paths"], cfg["outputs"]


Cloning into 'Leak-Project'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 25 (delta 3), reused 25 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 1.01 MiB | 36.86 MiB/s, done.
Resolving deltas: 100% (3/3), done.
[Errno 2] No such file or directory: 'leak-project'
/content
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0mMounted at /content/drive
cp: cannot create regular file 'config/local.yaml': No such file or directory


ModuleNotFoundError: No module named 'pipe'

In [None]:
# Outside-world diagnostics: load the configuration and the prepared edges,
# compute the statistics for the label "Outside world", print them and hand
# them to save_outside_world_report.
from pipe.config import load_config
from pipe.io import load_edges_prepared
from pipe.diagnostics import outside_world_stats, save_outside_world_report

cfg = load_config()
E = load_edges_prepared(cfg)

stats = outside_world_stats(E, ow_label="Outside world")
print(stats)  # replaces an earlier print(...)

# optionally persist:
# NOTE(review): `saved` is whatever save_outside_world_report returns —
# presumably the path of the written report; confirm in pipe.diagnostics.
saved = save_outside_world_report(stats, cfg)
print("Report gespeichert:", saved)


In [None]:
# Edge preparation: run prepare_edges_and_report on the loaded configuration and
# print the second and third return value.
# NOTE(review): judging by the names, the three results are the prepared edge
# table, the path it was written to and a report — confirm in pipe.prep.
from pipe.config import load_config
from pipe.prep import prepare_edges_and_report

cfg = load_config()
E_prepared, prep_path, rep = prepare_edges_and_report(cfg)
print("[OK] prepared:", prep_path)
print(rep)


In [None]:
# Level diagnostics: load the levels CSV named in the configuration, analyse it
# for level_2 and level_3, and preview the resulting statistics.
from pipe.config import load_config
from pipe.io import load_levels
from pipe.diagnose_levels import analyze_levels

cfg = load_config()  # loads default.yaml + optionally local.yaml

levels_csv = cfg["paths"]["levels"]
print("[INFO] Levels-CSV:", levels_csv)

H = load_levels(levels_csv)

# Looked up only after the levels are loaded, matching the original order.
cutoffs_dir = cfg["outputs"]["cutoffs_dir"]
df_stats = analyze_levels(
    H,
    cutoffs_dir,
    plot_levels=("level_2", "level_3"),
)
print("[OK] Level-Diagnose →", cutoffs_dir)
df_stats.head()


In [None]:
# Dynamic selection: build the selection from the configuration, then print one
# overview line per level plus the children cap recorded for the next level.
from pipe.config import load_config
from pipe.dynamic_levels import build_dynamic_selection

cfg = load_config()
sel = build_dynamic_selection(cfg)
print("[OK] selection_dynamic.json & selection_meta.json in:", cfg["outputs"]["org_dir"])

# short overview:
levels = sel["levels"]
# enumerate() supplies the position directly; looking it up with .index() on
# every iteration is quadratic and returns the first match if a name repeats.
for i, LV in enumerate(levels):
    m = sel["meta"].get(LV, {})
    if m:
        print(f"{LV}: min_size={m.get('min_size')}, coverage={m.get('coverage')}, kept={m.get('kept')}/{m.get('clusters_total')}")
    # Every level except the last has a child level whose cap is reported here.
    if i < len(levels) - 1:
        child_lv = levels[i + 1]
        cap = sel["meta"].get(child_lv, {}).get("max_children_per_parent")
        if cap is not None:
            print(f"   children cap (for {LV}→{child_lv}): {cap}")


In [None]:
# Level inspection: run inspect_levels_and_write on the configuration and
# preview the three returned tables.
from pipe.config import load_config
from pipe.inspect_levels import inspect_levels_and_write

cfg = load_config()
df_summary, df_pc_over, df_sim = inspect_levels_and_write(cfg)

print("[OK] geschrieben nach:", cfg["outputs"]["cutoffs_dir"])
# display() renders each preview separately with its rich representation; a bare
# tuple as the last expression is shown only as the tuple's plain-text repr.
display(df_summary.head(), df_pc_over.head(), df_sim.head())


In [None]:
# Leader computation: load levels, prepared edges and the stored selection, read
# the policy parameters from the configuration, compute and save the leaders and
# print a short QA summary.
import json
import pandas as pd
from pathlib import Path

from pipe.config import load_config
from pipe.io import load_levels, load_edges_prepared   # or load_edges, depending on the workflow
from pipe.leaders import compute_leaders, save_leaders, find_external_leaders

cfg = load_config()

# Load data
H = load_levels(cfg["paths"]["levels"])
E = pd.read_csv(cfg["paths"]["edges_prepared"]).astype({"src": str, "dst": str, "weight": float})

# Selection (from D)
sel_path = Path(cfg["outputs"]["org_dir"]) / "selection_dynamic.json"
selection = json.loads(sel_path.read_text())

# Policy / parameters.
# `or {}` also covers sections that exist in the YAML but are empty (None);
# a chained .get(section, {}) would raise AttributeError in that case.
iconf = (cfg.get("filters") or {}).get("internal") or {}
internal_mode = (iconf.get("mode") or "outside_supernode").lower()

domains = iconf.get("domains")
if domains is None:
    domains = ["@enron.com"]
elif isinstance(domains, str):
    # A single domain given as a plain string would otherwise be split into characters.
    domains = [domains]
internal_domains = tuple(domains)
allow_external = bool(iconf.get("allow_external_leaders", False))

runtime_conf = cfg.get("runtime") or {}
pct_exec = float((cfg.get("thresholds") or {}).get("pct_exec", 99.9))
bt_cap   = int(runtime_conf.get("bt_sample_cap", 2000))
seed     = int(runtime_conf.get("random_seed", 42))

# Compute
leaders = compute_leaders(
    H, E, selection,
    pct_exec=pct_exec, bt_cap=bt_cap, seed=seed,
    internal_mode=internal_mode, internal_domains=internal_domains,
    allow_external_leaders=allow_external
)

# Save & QA
j_path, csv_path = save_leaders(leaders, cfg)
print("[OK] leaders.json →", j_path)
print("[OK] leaders_flat.csv →", csv_path)

bad = find_external_leaders(leaders, internal_domains)
print("Execs (n):", len(leaders["execs"]))
for lv in selection["levels"]:
    print(f"{lv}: {len(leaders['levels'][lv])} Leader-Einträge")
print("Externe Leader gefunden:", len(bad))
if bad:
    # Show at most five examples.
    print("Beispiele:", bad[:5])


In [None]:
# Significance step: call run_significance with use_prepared_edges=True and
# eps=1e-9, then preview the returned summary.
# NOTE(review): the message below reports the configured signif_dir as the CSV
# destination; that the function writes there is not visible here — confirm.
from pipe.config import load_config
from pipe.signif import run_significance

cfg = load_config()  # default.yaml + (optional) local.yaml
summary = run_significance(cfg, use_prepared_edges=True, eps=1e-9)

print("[OK] Signifikanz-CSV(s) →", cfg["outputs"]["signif_dir"])
summary.head()

In [None]:
# Significance debugging: run run_signif_debug with topk=10, print its notes and
# preview the summary and outlier tables.
from pipe.config import load_config
from pipe.signif_debug import run_signif_debug

cfg = load_config()
summary, outliers, notes = run_signif_debug(cfg, topk=10)
print(notes)
print("[OUT] z_debug_summary.csv  →", cfg["outputs"]["signif_dir"])
print("[OUT] z_debug_outliers.csv →", cfg["outputs"]["signif_dir"])
# display() renders both previews separately with their rich representation; a
# bare tuple as the last expression is shown only as the tuple's plain-text repr.
display(summary.head(), outliers.head())


In [None]:
# Degree-based leader selection: build the leaders from the configuration
# (using the prepared edges) and print how many entries each level holds.
from pipe.config import load_config
from pipe.leader_select import build_leaders_by_degree

cfg = load_config()
leaders = build_leaders_by_degree(cfg, use_prepared_edges=True)

print("[OK] leaders.json & leaders_flat.csv →", cfg["outputs"]["org_dir"])

exec_count = len(leaders["execs"])
print("Execs (n):", exec_count)

level_map = leaders["levels"]
for lv in level_map:
    entry_count = len(level_map[lv])
    print(f"{lv}: {entry_count} Leader-Einträge")


In [None]:
# Interactive org chart: combine levels, persons, the stored selection and
# leaders, plus any per-level z-scores found on disk, into one HTML file.
from pathlib import Path
import json
import pandas as pd

from pipe.config import load_config
from pipe.io import load_levels
from pipe.orgchart_html import build_org_html

# optional:
try:
    from pipe.persons import ensure_persons
except ImportError:
    def ensure_persons(cfg):
        """Fallback used when pipe.persons cannot be imported: an empty DataFrame."""
        return pd.DataFrame()

cfg = load_config()

# Data
H = load_levels(cfg["paths"]["levels"])
P = ensure_persons(cfg)

# Selection & leaders
org_dir = Path(cfg["outputs"]["org_dir"])
sel_path = org_dir / "selection_dynamic.json"
lead_path = org_dir / "leaders.json"
selection = json.loads(sel_path.read_text())
leaders = json.loads(lead_path.read_text())

# normalize cluster keys (string→int); a missing or null "levels" entry, or a
# null per-level mapping, is treated as empty
raw_levels = leaders.get("levels", {}) or {}
leaders["levels"] = {
    level_name: {int(cluster_key): entry for cluster_key, entry in (cluster_map or {}).items()}
    for level_name, cluster_map in raw_levels.items()
}

# (optional) collect z-scores per level
z_by_level = {}
signif_dir = Path(cfg["outputs"]["signif_dir"])
required_cols = {"cluster_id", "z_score"}
if signif_dir.exists():
    for p in signif_dir.glob("cluster_significance_level_*.csv"):
        df = pd.read_csv(p)
        if not required_cols.issubset(df.columns):
            continue  # files without both columns are skipped
        # File stem minus the prefix is used as the level key.
        lvl = p.stem.replace("cluster_significance_", "")
        cluster_ids = df["cluster_id"].astype(int)
        z_scores = df["z_score"].astype(float)
        z_by_level[lvl] = dict(zip(cluster_ids, z_scores))

# Build the HTML
out_html = org_dir / "organigram_interaktiv.html"
depth_limit = min(3, len(selection.get("levels", [])))
path = build_org_html(
    selection, leaders, H, persons=P,
    out_html=str(out_html),
    max_depth=depth_limit,
    z_by_level=z_by_level,
    topics_by_level={},   # can be filled in later
    physics=False,
    label_template="{level}:{cid} • {leader} • n={n} • z={z:.1f} • k_in={deg_in} • k={deg_global}",
    label_size=18
)
print("[OK] interaktive HTML:", path)