In [None]:
# 1) Störende Pakete entfernen (ziehen falsche NumPy-Versionen rein)
!pip -q uninstall -y numpy pandas numba jax jaxlib opencv-python opencv-contrib-python opencv-python-headless thinc

# 2) Projektkompatible Versionen installieren
!pip -q install "numpy==1.26.4" "pandas==2.2.2"

# 3) Runtime hart neu starten, damit alte Binary-Artefakte raus sind
import os; os.kill(os.getpid(), 9)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.90 requires jax>=0.4.27, which is not installed.
chex 0.1.90 requires jaxlib>=0.4.27, which is not installed.
orbax-checkpoint 0.11.22 requires jax>=0.5.0, which is not installed.
spacy 3.8.7 requires thinc<8.4.0,>=8.3.4, which is not installed.
cudf-cu12 25.6.0 requires numba<0.62.0a0,>=0.59.1, which is not installed.
albucore 0.0.24 requires opencv-python-headless>=4.9.0.80, which is not installed.
flax 0.10.6 requires jax>=0.5.1, which is not installed.
albumentations 2.0.8 requires opencv-python-headless>=4.9.0.80, which is not installed.
stumpy 1.13.0 requires numba>=0.57.1, which is not installed.
dask-cuda 25.6.0 requires numba<0.62.0a0,>=0.59.1, which is not installed.
optax 0.2.5 requires jax>=0.4.27, which is not installed.
optax 0.2.5 requires jaxlib>=0.4.27, which is not installed.
umap-l

In [1]:
# --- Robust Setup for Colab: clone/sync repo, install, link local.yaml ---
import os, sys, pathlib, textwrap, subprocess, shutil

# Remote repository to clone and the local directory the checkout lives in.
GIT_URL  = "https://github.com/Jkoeppens/Leak-Project.git"   # or via SSH: git@github.com:Jkoeppens/Leak-Project.git
REPO_DIR = "/content/Leak-Project"

def run(cmd):
    """Run ``cmd`` through the shell, echo it and its output, return the exit status.

    The command line is printed first (prefixed with an arrow). stderr is
    merged into stdout and the combined text is printed with surrounding
    whitespace stripped. A non-zero exit status does not raise; callers
    receive it as the return value.
    """
    print("➜", cmd)
    completed = subprocess.run(
        cmd,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    captured = completed.stdout
    print(captured.strip())
    return completed.returncode

# 1) Clone the repository, or hard-sync an existing checkout to remote main
#    (fixes 'divergent branches').
if not os.path.exists(REPO_DIR):
    # Without a checkout nothing below can work, so stop with a clear error
    # instead of failing later on os.chdir(REPO_DIR).
    if run(f"git clone {GIT_URL} {REPO_DIR}") != 0:
        raise RuntimeError(f"git clone failed for {GIT_URL}")
else:
    os.chdir(REPO_DIR)
    fetch_rc = run("git fetch origin")
    # Set the local checkout to exactly origin/main (local changes are discarded).
    reset_rc = run("git reset --hard origin/main")
    if fetch_rc != 0 or reset_rc != 0:
        # Continue with whatever is checked out, but make the possibly stale
        # state visible instead of silently ignoring the exit codes.
        print("⚠️ Git-Sync fehlgeschlagen – Checkout entspricht evtl. nicht origin/main.")

# 2) Switch into the repo
os.chdir(REPO_DIR)
print("CWD:", os.getcwd())

# 3) Install requirements (quiet).
#    pip is invoked through the kernel's own interpreter so the packages are
#    installed into the environment this notebook imports from, and a failed
#    install is reported instead of being silently ignored.
if os.path.exists("requirements.txt"):
    if run(f'"{sys.executable}" -m pip -q install -r requirements.txt') != 0:
        print("⚠️ Installation der requirements.txt fehlgeschlagen (siehe Ausgabe oben).")
else:
    print("⚠️ Keine requirements.txt im Repo gefunden.")

# 4) Make the repo importable
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

# 5) Mount Google Drive, make sure local.yaml exists there, then copy it into the repo
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

drive_local = "/content/drive/MyDrive/leak-project/config/local.yaml"
repo_local  = os.path.join(REPO_DIR, "config", "local.yaml")

# Both target directories must exist before anything is written
# (repo config directory first, then the Drive config directory).
for parent_dir in (os.path.dirname(repo_local), os.path.dirname(drive_local)):
    os.makedirs(parent_dir, exist_ok=True)

# Create a minimal local.yaml in Drive only when none is there yet
if os.path.exists(drive_local):
    print("✅ local.yaml in Drive gefunden:", drive_local)
else:
    pathlib.Path(drive_local).write_text(
        textwrap.dedent("""\
        paths:
          root: "/content/drive/MyDrive/leak-project"
        """)
    )
    print("✅ local.yaml neu in Drive angelegt:", drive_local)

# Copy into the checkout (the original note says the repo-local copy is git-ignored)
shutil.copyfile(drive_local, repo_local)
print("✅ local.yaml ins Repo gespiegelt:", repo_local)

# 6) Sanity checks: inspect the layout of the current working directory
print("\n== Top-level ==")
run("ls -la")
print("\n== Expect these directories ==")
for d in ("config", "pipe"):
    if os.path.isdir(d):
        status = "OK"
    else:
        status = "❌ fehlt"
    print(d, "→", status)

# 7) Smoke test: import the project's config loader and read two sections.
#    Any failure is reported as a message rather than raised, so the cell
#    always runs to completion.
try:
    from pipe.config import load_config
    cfg = load_config()
    paths_section = cfg["paths"]
    print("\n[OK] cfg.paths:", paths_section)
    outputs_section = cfg["outputs"]
    print("[OK] cfg.outputs:", outputs_section)
except Exception as err:
    print("\n❌ Import/Config-Fehler:", err)


➜ git fetch origin

➜ git reset --hard origin/main
HEAD is now at dfed68e Fix: update requirements for Colab (numpy>=2, pandas>=2.2)
CWD: /content/Leak-Project
➜ pip -q install -r requirements.txt

Mounted at /content/drive
✅ local.yaml in Drive gefunden: /content/drive/MyDrive/leak-project/config/local.yaml
✅ local.yaml ins Repo gespiegelt: /content/Leak-Project/config/local.yaml

== Top-level ==
➜ ls -la
total 44
drwxr-xr-x 6 root root 4096 Aug 22 23:44 .
drwxr-xr-x 1 root root 4096 Aug 22 23:48 ..
drwxr-xr-x 2 root root 4096 Aug 22 23:33 config
-rw-r--r-- 1 root root 8196 Aug 22 23:33 .DS_Store
drwxr-xr-x 8 root root 4096 Aug 22 23:48 .git
-rw-r--r-- 1 root root  168 Aug 22 23:33 .gitignore
drwxr-xr-x 2 root root 4096 Aug 22 23:33 notebooks
drwxr-xr-x 3 root root 4096 Aug 22 23:33 pipe
-rw-r--r-- 1 root root   86 Aug 22 23:44 requirements.txt

== Expect these directories ==
config → OK
pipe → OK

[OK] cfg.paths: {'root': '/content/drive/MyDrive/leak-project', 'raw_dir': '/content/dr

In [2]:
# Show which NumPy / pandas versions this kernel actually imports.
import numpy
import pandas

print(f"NumPy: {numpy.__version__}")     # expected: 1.26.4
print(f"pandas: {pandas.__version__}")   # expected: 2.2.2

NumPy: 1.26.4
pandas: 2.2.2


In [3]:
# Outside-world diagnostics on the prepared edge list: compute, print, persist.
from pipe.config import load_config
from pipe.io import load_edges_prepared
from pipe.diagnostics import outside_world_stats, save_outside_world_report

cfg = load_config()
E = load_edges_prepared(cfg)

# ow_label names the entry treated as the outside world — presumably a node
# label in E; confirm against pipe.diagnostics.
stats = outside_world_stats(E, ow_label="Outside world")
print(stats)  # replaces an earlier print(...)

# Optionally persist the statistics; the return value is printed as the report location.
saved = save_outside_world_report(stats, cfg)
print("Report gespeichert:", saved)


{'present': True, 'edge_count': 562, 'weight_sum': 35862.0, 'internal_edges': 21800, 'total_edges': 22362, 'internal_ratio': 0.9748680797781951}
Report gespeichert: /content/drive/MyDrive/leak-project/reports/diagnostics/outside_world.json


In [4]:
# Build the prepared edge list and print the accompanying report.
from pipe.config import load_config
from pipe.prep import prepare_edges_and_report

cfg = load_config()
# Three values come back: the second is printed as the location of the
# prepared data, the third is printed as the report.
E_prepared, prep_path, rep = prepare_edges_and_report(cfg)
print("[OK] prepared:", prep_path)
print(rep)


[OK] prepared: /content/drive/MyDrive/leak-project/data_derived/edges_person_prepared.csv
{'nodes': 5950, 'edges': 22362, 'weight_sum': 162529.0, 'weight_min': 1.0, 'weight_max': 21324.0, 'degree_max': 1098, 'degree_p95': 29.0, 'top_degree_nodes': {'david.forster@enron.com': 1098, 'Outside world': 563, 'chantelle.villanueva@enron.com': 472, 'lillian.carroll@enron.com': 424, 'daniel.muschar@enron.com': 423, 'veronica.espinoza@enron.com': 421, 'scott.mills@enron.com': 420, 'deshonda.hamilton@enron.com': 414, 'sally.beck@enron.com': 394, 'daren.farmer@enron.com': 386}, 'level_1_clusters': 64, 'level_2_clusters': 643, 'level_3_clusters': 1109, 'level_4_clusters': 1360, 'level_5_clusters': 1364}


In [6]:
from pathlib import Path
import pandas as pd

# 0) Dependencies sicherstellen (nur falls noch nicht installiert)
try:
    import powerlaw  # noqa
except Exception:
    !pip -q install powerlaw matplotlib

from pipe.config import load_config
from pipe.io import load_levels
from pipe.diagnose_levels import analyze_levels

# 1) Config laden & Pfad prüfen
cfg = load_config()
levels_path = Path(cfg["paths"]["levels"])
print("[INFO] Levels-CSV (aus cfg):", levels_path)

if not levels_path.exists():
    raise FileNotFoundError(
        f"Levels-CSV fehlt: {levels_path}\n"
        "→ Erzeuge sie zuerst (z.B. Infomap/Clustering-Pipeline) "
        "oder passe config/local.yaml an (paths.levels)."
    )

# 2) Datei kurz inspizieren (Header)
print("[INFO] Erste Zeilen:")
display(pd.read_csv(levels_path, nrows=5))

# 3) Laden über pipe.io (macht Typen & Umbenennungen robust)
H = load_levels(str(levels_path))

# 4) Diagnose fahren (Plots + Summary-CSV in outputs.cutoffs_dir)
outdir = cfg["outputs"]["cutoffs_dir"]
df_stats = analyze_levels(H, outdir, plot_levels=("level_2","level_3"))
print("[OK] Level-Diagnose →", outdir)
display(df_stats.head())

[INFO] Levels-CSV (aus cfg): /content/drive/MyDrive/leak-project/data_derived/infomap_levels.csv
[INFO] Erste Zeilen:


Unnamed: 0,node,module_path,level_1,level_2,level_3,level_4,level_5
0,.daigle@enron.com,7:540:1004:1255:1259,7,540,1004,1255,1259
1,.palmer@enron.com,3:331:722:735:735,3,331,722,735,735
2,.ward@enron.com,3:359:756:775:776,3,359,756,775,776
3,01@ftenergy.com,41:620:1086:1337:1341,41,620,1086,1337,1341
4,1.11307318.-1@multexinvestornetwork.com,3:361:758:777:778,3,361,758,777,778


[OK] Level-Diagnose → /content/drive/MyDrive/leak-project/reports/cutoffs


Unnamed: 0,level,n_clusters,n_people,xmin,alpha,ks_D,lr_R,lr_p,k80,k90,kelbow,min_size
0,level_1,64,7955,206,1.94126,0.159484,-0.216754,0.8284,4,6,12,206
1,level_2,643,7955,2,1.532803,0.093493,-2.347124,0.018919,62,108,26,5
2,level_3,1109,7955,2,1.575823,0.083073,-2.354903,0.018528,91,314,58,5
3,level_4,1360,7955,2,1.595701,0.088068,-2.34793,0.018878,116,565,52,5
4,level_5,1364,7955,2,1.59936,0.092762,-2.342475,0.019156,116,569,52,5


In [7]:
# Level diagnosis driven directly from the configured paths; the preview of
# the resulting statistics frame is the cell output.
from pipe.config import load_config
from pipe.io import load_levels
from pipe.diagnose_levels import analyze_levels

cfg = load_config()  # loads default.yaml + optional local.yaml
print("[INFO] Levels-CSV:", cfg["paths"]["levels"])

H = load_levels(cfg["paths"]["levels"])
# The second argument is the output directory; plot_levels selects the levels to plot.
df_stats = analyze_levels(H, cfg["outputs"]["cutoffs_dir"], plot_levels=("level_2","level_3"))
print("[OK] Level-Diagnose →", cfg["outputs"]["cutoffs_dir"])
df_stats.head()


[INFO] Levels-CSV: /content/drive/MyDrive/leak-project/data_derived/infomap_levels.csv
[OK] Level-Diagnose → /content/drive/MyDrive/leak-project/reports/cutoffs


Unnamed: 0,level,n_clusters,n_people,xmin,alpha,ks_D,lr_R,lr_p,k80,k90,kelbow,min_size
0,level_1,64,7955,206,1.94126,0.159484,-0.216754,0.8284,4,6,12,206
1,level_2,643,7955,2,1.532803,0.093493,-2.347124,0.018919,62,108,26,5
2,level_3,1109,7955,2,1.575823,0.083073,-2.354903,0.018528,91,314,58,5
3,level_4,1360,7955,2,1.595701,0.088068,-2.34793,0.018878,116,565,52,5
4,level_5,1364,7955,2,1.59936,0.092762,-2.342475,0.019156,116,569,52,5


In [8]:
from pipe.config import load_config
from pipe.dynamic_levels import build_dynamic_selection

cfg = load_config()
sel = build_dynamic_selection(cfg)
print("[OK] selection_dynamic.json & selection_meta.json in:", cfg["outputs"]["org_dir"])

# Short overview: per-level selection figures, plus the children cap towards
# the next level when one is recorded.
sel_levels = sel["levels"]
sel_meta = sel["meta"]
# enumerate() supplies the position directly; looking it up with .index()
# inside the loop is quadratic and returns the first match if a name repeats.
for i, LV in enumerate(sel_levels):
    m = sel_meta.get(LV, {})
    if m:
        print(f"{LV}: min_size={m.get('min_size')}, coverage={m.get('coverage')}, kept={m.get('kept')}/{m.get('clusters_total')}")
    if i + 1 < len(sel_levels):
        # The cap is read from the meta entry of the child (next) level.
        child_lv = sel_levels[i + 1]
        cap = sel_meta.get(child_lv, {}).get("max_children_per_parent")
        if cap is not None:
            print(f"   children cap (for {LV}→{child_lv}): {cap}")


[OK] selection_dynamic.json & selection_meta.json in: /content/drive/MyDrive/leak-project/reports/orgchart
level_1: min_size=206, coverage=0.8, kept=4/64
level_2: min_size=5, coverage=0.641, kept=26/643
level_3: min_size=5, coverage=0.83, kept=58/1109
level_4: min_size=5, coverage=0.795, kept=52/1360
level_5: min_size=5, coverage=0.796, kept=52/1364


In [9]:
# Inspect the levels and write the results; three frames are returned for preview.
from pipe.config import load_config
from pipe.inspect_levels import inspect_levels_and_write

cfg = load_config()
df_summary, df_pc_over, df_sim = inspect_levels_and_write(cfg)

print("[OK] geschrieben nach:", cfg["outputs"]["cutoffs_dir"])
# Render each preview as its own table. A bare tuple as the last expression
# falls back to a single plain-text repr with wrapped and elided columns.
display(df_summary.head(), df_pc_over.head(), df_sim.head())


[OK] geschrieben nach: /content/drive/MyDrive/leak-project/reports/cutoffs


(     level  n_clusters  min  q25  median  q75    p90     p95      p99   max  \
 0  level_1          64    1  2.0     2.0  4.0  236.1  484.90  2706.08  2822   
 1  level_2         643    1  1.0     1.0  2.0   26.0   68.00   199.44   424   
 2  level_3        1109    1  1.0     1.0  1.0    9.0   31.00   141.44   424   
 3  level_4        1360    1  1.0     1.0  1.0    5.0   21.05   123.82   424   
 4  level_5        1364    1  1.0     1.0  1.0    5.0   21.00   123.74   424   
 
    ...         std  k50  k80  k90  k95  ge_2  ge_3  ge_5  ge_10  sum_ge_5  
 0  ...  488.560590    2    4    6    7    51    21    14      8      7858  
 1  ...   40.338138   20   62  108  246   228   158   135    104      7324  
 2  ...   27.654012   25   91  314  712   263   176   146    109      6833  
 3  ...   24.182713   27  116  565  963   274   180   150    110      6580  
 4  ...   24.148819   27  116  569  967   275   179   148    110      6570  
 
 [5 rows x 21 columns],
                pair  parents 

In [10]:
import json
import pandas as pd
from pathlib import Path

from pipe.config import load_config
from pipe.io import load_levels, load_edges_prepared   # or load_edges, depending on the workflow
from pipe.leaders import compute_leaders, save_leaders, find_external_leaders

cfg = load_config()

# --- Inputs: levels table and the prepared edge list with fixed column dtypes ---
H = load_levels(cfg["paths"]["levels"])
E = pd.read_csv(cfg["paths"]["edges_prepared"])
E = E.astype({"src": str, "dst": str, "weight": float})

# --- Selection (from D), read from the orgchart output directory ---
sel_path = Path(cfg["outputs"]["org_dir"]) / "selection_dynamic.json"
with sel_path.open() as fh:
    selection = json.load(fh)

# --- Policy / parameters, each with a fallback when the config omits the key ---
iconf = cfg.get("filters", {}).get("internal", {})
internal_mode = (iconf.get("mode") or "outside_supernode").lower()
internal_domains = tuple(iconf.get("domains", ["@enron.com"]))
allow_external = bool(iconf.get("allow_external_leaders", False))

thresholds_conf = cfg.get("thresholds", {})
runtime_conf = cfg.get("runtime", {})
pct_exec = float(thresholds_conf.get("pct_exec", 99.9))
bt_cap   = int(runtime_conf.get("bt_sample_cap", 2000))
seed     = int(runtime_conf.get("random_seed", 42))

# --- Compute ---
leaders = compute_leaders(
    H,
    E,
    selection,
    pct_exec=pct_exec,
    bt_cap=bt_cap,
    seed=seed,
    internal_mode=internal_mode,
    internal_domains=internal_domains,
    allow_external_leaders=allow_external,
)

# --- Save & QA ---
j_path, csv_path = save_leaders(leaders, cfg)
print("[OK] leaders.json →", j_path)
print("[OK] leaders_flat.csv →", csv_path)

bad = find_external_leaders(leaders, internal_domains)
print("Execs (n):", len(leaders["execs"]))
for lv in selection["levels"]:
    n_entries = len(leaders["levels"][lv])
    print(f"{lv}: {n_entries} Leader-Einträge")
print("Externe Leader gefunden:", len(bad))
# Show at most five offending entries, and only when there are any.
examples = bad[:5]
if examples:
    print("Beispiele:", examples)


[OK] leaders.json → /content/drive/MyDrive/leak-project/reports/orgchart/leaders.json
[OK] leaders_flat.csv → /content/drive/MyDrive/leak-project/reports/orgchart/leaders_flat.csv
Execs (n): 5
level_1: 4 Leader-Einträge
level_2: 120 Leader-Einträge
level_3: 88 Leader-Einträge
level_4: 78 Leader-Einträge
level_5: 52 Leader-Einträge
Externe Leader gefunden: 0


In [11]:
# Significance run; the preview of the returned summary is the cell output.
from pipe.config import load_config
from pipe.signif import run_significance

cfg = load_config()  # default.yaml + (optional) local.yaml
# use_prepared_edges and eps are passed straight through; their exact meaning
# is defined in pipe.signif (not visible here).
summary = run_significance(cfg, use_prepared_edges=True, eps=1e-9)

print("[OK] Signifikanz-CSV(s) →", cfg["outputs"]["signif_dir"])
summary.head()

[OK] Signifikanz-CSV(s) → /content/drive/MyDrive/leak-project/reports/significance


Unnamed: 0,cluster_id,n_nodes,z_score,density_observed,level
0,5,249,141.353831,0.021084,level_1
1,7,270,136.654871,0.012226,level_1
2,4,700,125.865788,0.008531,level_1
3,8,206,89.09999,0.010277,level_1
4,2,2822,83.808868,0.001231,level_1


In [12]:
# Significance debug run: print the notes, then preview summary and outliers.
from pipe.config import load_config
from pipe.signif_debug import run_signif_debug

cfg = load_config()
summary, outliers, notes = run_signif_debug(cfg, topk=10)
print(notes)
print("[OUT] z_debug_summary.csv  →", cfg["outputs"]["signif_dir"])
print("[OUT] z_debug_outliers.csv →", cfg["outputs"]["signif_dir"])
# Render both previews as separate tables. A bare tuple as the last expression
# falls back to a single plain-text repr with wrapped columns.
display(summary.head(), outliers.head())


[level_1] Dichte=0: 1 | Dichte≈1: 0 → starke Ausreißer möglich.
[level_2] Dichte=0: 35 | Dichte≈1: 0 → starke Ausreißer möglich.
[level_3] Dichte=0: 38 | Dichte≈1: 0 → starke Ausreißer möglich.
[level_4] Dichte=0: 40 | Dichte≈1: 0 → starke Ausreißer möglich.
[level_5] Dichte=0: 39 | Dichte≈1: 0 → starke Ausreißer möglich.
[OUT] z_debug_summary.csv  → /content/drive/MyDrive/leak-project/reports/significance
[OUT] z_debug_outliers.csv → /content/drive/MyDrive/leak-project/reports/significance


(     level  count       z_mu    z_sigma     z_min      z_p50       z_p90  \
 0  level_1      8  89.121260  46.606090 -0.064190  86.454429  138.064559   
 1  level_2    135  57.321838  46.666224 -0.410991  60.423277  117.786840   
 2  level_3    146  57.509852  46.719147 -0.125539  61.575594  119.109046   
 3  level_4    150  57.460744  47.212026 -0.125539  61.575594  120.481407   
 4  level_5    148  57.222982  46.673221 -0.125539  61.575594  118.615051   
 
         z_max  sigma_near_zero  n_nodes_min  n_nodes_p50  n_nodes_max  
 0  141.353831            False          206        466.0         2822  
 1  165.312970            False            5         23.0          424  
 2  165.312970            False            5         21.0          424  
 3  165.312970            False            5         18.5          424  
 4  165.312970            False            5         19.0          424  ,
      level  cluster_id  n_nodes  m_in_observed     exp_m_in     var_m_in  \
 0  level_1         

In [29]:
from pathlib import Path
import json
import pandas as pd

from pipe.config import load_config
from pipe.io import load_levels
from pipe.orgchart_html import build_org_html

# Optional dependency: fall back to an empty persons table when
# pipe.persons cannot be imported.
try:
    from pipe.persons import ensure_persons
except ImportError:
    def ensure_persons(cfg):
        """Fallback used when pipe.persons is unavailable: returns an empty DataFrame."""
        return pd.DataFrame()

cfg = load_config()

# --- Data ---
H = load_levels(cfg["paths"]["levels"])
P = ensure_persons(cfg)

# --- Selection & leaders, both read from the orgchart output directory ---
org_dir = Path(cfg["outputs"]["org_dir"])
sel_path  = org_dir / "selection_dynamic.json"
lead_path = org_dir / "leaders.json"
selection = json.loads(sel_path.read_text())
leaders   = json.loads(lead_path.read_text())

# JSON object keys are always strings: convert the cluster keys back to int.
normalized_levels = {}
for level_name, mapping in (leaders.get("levels", {}) or {}).items():
    normalized_levels[level_name] = {int(k): v for k, v in (mapping or {}).items()}
leaders["levels"] = normalized_levels

# --- (optional) collect z-scores per level from the significance CSVs ---
z_by_level = {}
signif_dir = Path(cfg["outputs"]["signif_dir"])
if signif_dir.exists():
    for p in signif_dir.glob("cluster_significance_level_*.csv"):
        df = pd.read_csv(p)
        # Skip files that lack either of the two required columns.
        if not {"cluster_id", "z_score"}.issubset(df.columns):
            continue
        lvl = p.stem.replace("cluster_significance_", "")
        cluster_ids = df["cluster_id"].astype(int)
        z_scores = df["z_score"].astype(float)
        z_by_level[lvl] = dict(zip(cluster_ids, z_scores))

# --- Build the HTML ---
out_html = org_dir / "organigram_interaktiv.html"
# Depth is capped at 3 and never exceeds the number of selected levels.
max_depth = min(3, len(selection.get("levels", [])))
path = build_org_html(
    selection,
    leaders,
    H,
    persons=P,
    out_html=str(out_html),
    max_depth=max_depth,
    z_by_level=z_by_level,
    topics_by_level={},   # can be filled in later
    physics=False,
    label_template="{level}:{cid} • {leader} • n={n} • z={z:.1f} • k_in={deg_in} • k={deg_global}",
    label_size=18,
)
print("[OK] interaktive HTML:", path)

[OK] interaktive HTML: /content/drive/MyDrive/leak-project/reports/orgchart/organigram_interaktiv.html
