## Eda

In [None]:
# ===============================================================
#   EDA – Exploratory Data Analysis für Leistungsband-Projekt
# ===============================================================

from src.main import main
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# ===============================================================
# 1) Daten laden (raw + clean)
# ===============================================================

# Call the project entry point with run_full_pipeline=False and keep both
# datasets from its result: "raw" and "clean" are compared further below.
res = main(run_full_pipeline=False)
raw, clean = res["raw"], res["clean"]

# Short overview: number of nodes, one sample node id, top-level keys.
print("---- Überblick ----")
node_ids = clean["measurements"].keys()
print("Nodes:", len(clean["measurements"]))
print("Beispiel-Node:", next(iter(node_ids)))
print("Keys im Datenobjekt:", [*clean.keys()])


# ===============================================================
# 2) Zeitachsenanalyse – Frequenz, Gaps, Duplikate
# ===============================================================

def _step_distribution(node_id, series):
    """Return one summary row: the distinct index steps of *series* and their counts."""
    step_counts = series.index.to_series().diff().value_counts()
    return {
        "node": node_id,
        "unique_steps": list(step_counts.index.astype(str)),
        "step_counts": list(step_counts.values),
    }


# One row per node; an evenly spaced index yields exactly one step value.
freq_summary = [
    _step_distribution(node_id, series)
    for node_id, series in clean["measurements"].items()
]

df_freq = pd.DataFrame(freq_summary)
print("\n---- Frequenz-Analyse ----")
print(df_freq)


# --- Gap Analyse ---
# Grid spacing against which every measurement time axis is checked.
EXPECTED_FREQ = "15min"

# Maps node id -> DatetimeIndex of timestamps that are missing from the
# node's series when compared with a complete EXPECTED_FREQ grid spanning
# the series' first to last timestamp. Nodes without gaps are not listed.
gap_info = {}
for nid, s in clean["measurements"].items():
    # An empty series has NaT as min/max of its index, and pd.date_range
    # raises a ValueError for NaT bounds. There is no time span to check,
    # so such a node is skipped instead of aborting the whole analysis.
    if s.empty:
        continue

    expected = pd.date_range(s.index.min(), s.index.max(), freq=EXPECTED_FREQ)
    missing = expected.difference(s.index)
    if len(missing) > 0:
        gap_info[nid] = missing

print("\n---- Gefundene Lücken ----")
print(gap_info if gap_info else "Keine Lücken gefunden.")


# ===============================================================
# 3) NaN-Analyse (vor/nach Cleaning)
# ===============================================================

def _nan_row(node_id, raw_series, clean_series):
    """Return NaN counts and lengths of one node's series before/after cleaning."""
    return {
        "node": node_id,
        "raw_nans": raw_series.isna().sum(),
        "clean_nans": clean_series.isna().sum(),
        "len_raw": len(raw_series),
        "len_clean": len(clean_series),
    }


# Iterate over the raw nodes and look up the cleaned counterpart by node id.
nan_stats = [
    _nan_row(node_id, raw_series, clean["measurements"][node_id])
    for node_id, raw_series in raw["measurements"].items()
]

df_nan = pd.DataFrame(nan_stats).set_index("node")
print("\n---- NaN Analyse P_MW ----")
print(df_nan)


# --- Wetterdaten NaN Analyse ---
weather_cols = ["temperature_C", "wind_speed_mps", "solar_radiation_Wm2"]


def _weather_nan_row(node_id, raw_frame, clean_frame):
    """Return NaN counts per weather column, before and after cleaning."""
    row = {"node": node_id}
    for column in weather_cols:
        row[f"raw_nan_{column}"] = raw_frame[column].isna().sum()
        row[f"clean_nan_{column}"] = clean_frame[column].isna().sum()
    return row


# Iterate over the raw weather frames and pair each with its cleaned version.
w_stats = [
    _weather_nan_row(node_id, raw_frame, clean["weather_hist"][node_id])
    for node_id, raw_frame in raw["weather_hist"].items()
]

df_w_nan = pd.DataFrame(w_stats).set_index("node")
print("\n---- NaN Analyse Wetter ----")
print(df_w_nan)


# ===============================================================
# 4) Derived Nodes prüfen
# ===============================================================

def check_field_sum(clean, node_id, derived_spec):
    """Compare a derived node's series with the weighted sum of its base nodes.

    The derived series is reconstructed as ``sum(coeff * base_series)`` over
    all entries of ``derived_spec["terms"]`` and subtracted from the stored
    series of ``node_id`` on the timestamps both have in common. The maximum
    absolute deviation is printed.

    Args:
        clean: Mapping with a ``"measurements"`` entry that maps node ids to
            time-indexed series.
        node_id: Id of the derived node to validate; must be present in
            ``clean["measurements"]`` (otherwise a ``KeyError`` is raised).
        derived_spec: Mapping with a ``"terms"`` list; every term is a mapping
            with the keys ``"node"`` (base node id) and ``"coeff"`` (factor).

    Returns:
        The difference series ``stored - reconstructed``, or ``None`` when the
        spec has no terms or a referenced base node is missing (a warning is
        printed in both cases).
    """
    measurements = clean["measurements"]
    terms = derived_spec["terms"]
    p_der = measurements[node_id]

    # Without terms, sum([]) would be the plain integer 0 and the index
    # lookup below would fail with an AttributeError.
    if not terms:
        print(f"⚠ Derived Node {node_id} hat keine Terme!")
        return None

    parts = []
    for term in terms:
        base_id = term["node"]
        if base_id not in measurements:
            print(f"⚠ Basisnode {base_id} fehlt!")
            return None
        parts.append(term["coeff"] * measurements[base_id])

    p_rec = sum(parts)

    # Compare only on timestamps that exist in both series.
    idx = p_der.index.intersection(p_rec.index)
    diff = p_der.loc[idx] - p_rec.loc[idx]

    print(f"→ Derived Node {node_id} – Max Fehler:", diff.abs().max())
    return diff


print("\n---- Derived Node Validierung ----")

# Only rows whose "derived" entry is a dict carry a spec that can be checked.
for nid, row in raw["nodes"].iterrows():
    derived_spec = row.get("derived")
    if not isinstance(derived_spec, dict):
        continue
    print(f"\nPrüfe Derived Node: {nid}")
    diff = check_field_sum(clean, nid, derived_spec)


# ===============================================================
# 5) Statistische EDA – Min/Max, Trends, Verteilungen
# ===============================================================

# Aggregations reported per node, in output column order.
_STAT_NAMES = ("min", "max", "mean", "std")

# Each row holds the node id followed by the result of calling every
# aggregation method on that node's series.
stats = [
    {"node": node_id, **{name: getattr(series, name)() for name in _STAT_NAMES}}
    for node_id, series in clean["measurements"].items()
]

df_stats = pd.DataFrame(stats).set_index("node")
print("\n---- Basisstatistiken P_MW ----")
print(df_stats)


# --- Beispielplot für einen Node ---
# The first node of the mapping serves as the example for all single-node plots.
example_node = next(iter(clean["measurements"].keys()))
s = clean["measurements"][example_node]

# Line plot of 1-7 January 2024.
plt.figure(figsize=(12, 4))
ax_week = s.loc["2024-01-01":"2024-01-07"].plot()
ax_week.set_title(f"P_MW – Woche 1/2024 – {example_node}")
ax_week.grid(True)
plt.show()


# Seasonality: mean per calendar month (month-end bins) as a bar chart.
s_month = s.resample("ME").mean()

plt.figure(figsize=(10, 4))
ax_month = s_month.plot.bar()
ax_month.set_title(f"Monatsmittel P_MW – {example_node}")
ax_month.grid(True)
plt.show()


# ===============================================================
# 6) Wetter-Korrelationen & Forecast-Basis
# ===============================================================

def get_panel(clean, node):
    """Build the combined weather/power table for one node.

    The node's measurement series is named ``P_MW`` and inner-joined onto the
    node's weather frame, so only timestamps present in both remain.

    Args:
        clean: Mapping with the entries ``"measurements"`` (node id -> series)
            and ``"weather_hist"`` (node id -> DataFrame).
        node: Node id to look up in both entries.

    Returns:
        DataFrame with the weather columns followed by a ``P_MW`` column.
    """
    power = clean["measurements"][node].rename("P_MW")
    weather = clean["weather_hist"][node]
    panel = weather.join(power, how="inner")
    return panel

# Joined weather/power table of the example node.
panel = get_panel(clean, example_node)

print("\n---- Korrelationen zwischen Wetter und P_MW ----")
# Pairwise correlation of all columns of the panel.
corr_weather_power = panel.corr()
print(corr_weather_power)


# ===============================================================
# 7) Sprung- / Jump-Analyse (Diffs & große Schritte)
# ===============================================================

def _jump_summary(node_id, series):
    """Summarise step-to-step changes of one series against a 3-sigma limit."""
    steps = series.diff()
    step_std = steps.std()
    # A step counts as "big" when its magnitude exceeds three standard
    # deviations of all steps of the same series.
    limit = 3 * step_std
    return {
        "node": node_id,
        "diff_std": step_std,
        "threshold": limit,
        "num_big_jumps": (steps.abs() > limit).sum(),
    }


jump_stats = [
    _jump_summary(node_id, series)
    for node_id, series in clean["measurements"].items()
]

df_jumps = pd.DataFrame(jump_stats).set_index("node")

# Nodes with the most large steps come first.
print("\n---- Sprung-Analyse P_MW (diff-basierte Jumps) ----")
print(df_jumps.sort_values("num_big_jumps", ascending=False))


# Beispielplot: Diffs für example_node
s_ex = clean["measurements"][example_node]
diff_ex = s_ex.diff()

# Half-width of the +/- 3-sigma band drawn as dashed horizontal lines.
jump_band = 3 * diff_ex.std()

plt.figure(figsize=(12, 4))
ax_diff = diff_ex.loc["2024-01-01":"2024-01-07"].plot()
for level in (jump_band, -jump_band):
    ax_diff.axhline(level, linestyle="--")
ax_diff.set_title(f"Diff P_MW – Woche 1/2024 – {example_node}")
ax_diff.grid(True)
plt.show()


# ===============================================================
# 8) Tages- und Wochenmuster (Daily/Weekly Profiles)
# ===============================================================

# Tagesprofil (Ø über alle Tage)
s_ex = clean["measurements"][example_node].dropna()

# Attach hour of day and weekday (0 = Monday ... 6 = Sunday) of each timestamp.
df_ex = s_ex.to_frame(name="P_MW").assign(
    hour=lambda frame: frame.index.hour,
    weekday=lambda frame: frame.index.dayofweek,
)

# Mean power per hour of day and per weekday, over all samples.
hour_profile = df_ex["P_MW"].groupby(df_ex["hour"]).mean()
weekday_profile = df_ex["P_MW"].groupby(df_ex["weekday"]).mean()

print("\n---- Tagesprofil (P_MW, Mittelwert je Stunde) ----")
print(hour_profile)

print("\n---- Wochenprofil (P_MW, Mittelwert je Wochentag; 0=Mo) ----")
print(weekday_profile)

plt.figure(figsize=(10, 4))
ax_hour = hour_profile.plot(marker="o")
ax_hour.set_title(f"Tagesprofil P_MW – {example_node}")
ax_hour.set_xlabel("Stunde")
ax_hour.set_ylabel("P_MW")
ax_hour.grid(True)
plt.show()

plt.figure(figsize=(8, 4))
ax_weekday = weekday_profile.plot.bar()
ax_weekday.set_title(f"Wochentagsprofil P_MW – {example_node}")
ax_weekday.set_xlabel("Wochentag (0=Mo)")
ax_weekday.set_ylabel("P_MW")
ax_weekday.grid(True)
plt.show()


# Optional: Wetter-Tagesprofil (z.B. Solarstrahlung)
if example_node in clean["weather_hist"]:
    # Work on a copy with an added hour-of-day column; the stored weather
    # frame itself is not modified.
    w_ex = clean["weather_hist"][example_node].assign(
        hour=lambda frame: frame.index.hour
    )
    solar_hour_profile = w_ex["solar_radiation_Wm2"].groupby(w_ex["hour"]).mean()

    plt.figure(figsize=(10, 4))
    ax_solar = solar_hour_profile.plot(marker="o")
    ax_solar.set_title(f"Tagesprofil Solarstrahlung – {example_node}")
    ax_solar.set_xlabel("Stunde")
    ax_solar.set_ylabel("W/m²")
    ax_solar.grid(True)
    plt.show()


# ===============================================================
# 9) Korrelationen zwischen Nodes (P_MW vs P_MW)
# ===============================================================

# Panel aller Nodes: gemeinsame Zeitachse
# One column per node; the DataFrame constructor aligns all series on the
# union of their indexes.
df_all_p = pd.DataFrame(dict(clean["measurements"].items()))

corr_nodes = df_all_p.corr()

print("\n---- Korrelation P_MW zwischen Nodes ----")
print(corr_nodes)

# Heat map of the correlation matrix with node ids on both axes.
column_labels = corr_nodes.columns
row_labels = corr_nodes.index

plt.figure(figsize=(8, 6))
heatmap = plt.imshow(corr_nodes, interpolation="nearest")
plt.colorbar(heatmap, label="Korrelationskoeffizient")
plt.xticks(range(len(column_labels)), column_labels, rotation=90)
plt.yticks(range(len(row_labels)), row_labels)
plt.title("Korrelationen P_MW zwischen Nodes")
plt.tight_layout()
plt.show()


# ===============================================================
# 10) Outlier-Analyse (z-Score basierend)
# ===============================================================

outlier_stats = []

Z_THRESHOLD = 4.0  # samples with |z| above this value count as outliers

# Column key for the outlier count. It is derived from Z_THRESHOLD so the
# label stays correct when the threshold is changed; for 4.0 it renders as
# "num_outliers(|z|>4)".
_NUM_OUTLIERS_KEY = f"num_outliers(|z|>{Z_THRESHOLD:g})"

for nid, s in clean["measurements"].items():
    mu = s.mean()
    sigma = s.std()

    # std is 0 for a constant series and NaN when it cannot be computed
    # (e.g. an empty series); z-scores are undefined then, so no sample is
    # counted as an outlier.
    if sigma == 0 or np.isnan(sigma):
        num_outliers = 0
    else:
        z = (s - mu) / sigma
        num_outliers = (z.abs() > Z_THRESHOLD).sum()

    # Guard the fraction: dividing by the length of an empty series would
    # raise ZeroDivisionError and abort the whole table.
    n_samples = len(s)
    outlier_fraction = num_outliers / n_samples if n_samples else np.nan

    outlier_stats.append({
        "node": nid,
        "mean": mu,
        "std": sigma,
        _NUM_OUTLIERS_KEY: num_outliers,
        "outlier_fraction": outlier_fraction,
    })

df_outliers = pd.DataFrame(outlier_stats).set_index("node")

# Nodes with the highest share of outliers come first.
print("\n---- Outlier-Analyse P_MW (z-Score) ----")
print(df_outliers.sort_values("outlier_fraction", ascending=False))


# Beispiel: Outlier-Scatter für example_node
# z-scores of the example node's series; samples beyond Z_THRESHOLD are
# flagged. A zero or NaN standard deviation yields NaN/inf z-scores, which
# compare as "not above the threshold" for NaN, so nothing is flagged then.
s_ex = clean["measurements"][example_node]
mu_ex = s_ex.mean()
sigma_ex = s_ex.std()
z_ex = (s_ex - mu_ex) / sigma_ex

outlier_mask = z_ex.abs() > Z_THRESHOLD

plt.figure(figsize=(12, 4))
plt.plot(s_ex.index, s_ex.values, label="P_MW")
# Draw the markers in a contrasting colour and above the line: with the
# defaults they would share the line's colour and be painted underneath it.
plt.scatter(
    s_ex.index[outlier_mask],
    s_ex[outlier_mask],
    marker="x",
    color="red",
    zorder=3,
    label=f"|z|>{Z_THRESHOLD}",
)
plt.title(f"Outlier-Check P_MW – {example_node} (|z|>{Z_THRESHOLD})")
# The labels above are only rendered when a legend is requested.
plt.legend()
plt.grid(True)
plt.show()

print("\nEDA erweitert um Sprünge, Tages-/Wochenprofile, Node-Korrelationen und Outlier.")

print("\nEDA abgeschlossen.")


## Graph inspection

In [1]:
# ============================================================
# Überblick über whole_graph.json + Kanten ohne Reaktanz
# (ohne direkte Verbindung zu einer Sammelschiene)
# ============================================================

import json
import pandas as pd
from src.config import GRAPH_PATH

try:
    from IPython.display import display
except ImportError:
    display = print

# -------------------------
# 1) JSON einlesen
# -------------------------
with open(GRAPH_PATH, "r", encoding="utf-8") as f:
    elems = json.load(f)

# Split the elements in one pass: an element whose "data" has a "source"
# key is an edge, any other element with "data" is a node. Elements without
# "data" are ignored.
nodes, edges = [], []
for elem in elems:
    if "data" not in elem:
        continue
    bucket = edges if "source" in elem["data"] else nodes
    bucket.append(elem)

print("==== GROBER ÜBERBLICK ====")
print(f"Anzahl Knoten im JSON: {len(nodes)}")
print(f"Anzahl Kanten im JSON: {len(edges)}\n")

# -------------------------
# 2) Knotentypen auswerten
# -------------------------
# One record per node: id, label (falls back to the id) and type (may be None).
node_info = [
    {
        "id": data["id"],
        "label": data.get("label", data["id"]),
        "type": data.get("type"),
    }
    for data in (elem["data"] for elem in nodes)
]

df_nodes = pd.DataFrame(node_info)

print("Knotentypen im JSON:")
print(df_nodes["type"].value_counts())
print("\nBeispiel-Knoten:")
display(df_nodes.head(10))

# Lookup table: node id -> node type.
type_by_id = df_nodes.set_index("id")["type"]
node_type_map = type_by_id.to_dict()

# -------------------------
# 3) Kanten grob anzeigen
# -------------------------
def _edge_summary(data):
    """Flatten one edge's "data" dict into a table row."""
    # "features" may be absent or None; treat both as an empty mapping.
    features = data.get("features") or {}
    source_id = data.get("source")
    target_id = data.get("target")
    return {
        "id": data.get("id"),
        "label": data.get("label"),
        "source": source_id,
        "target": target_id,
        "source_type": node_type_map.get(source_id),
        "target_type": node_type_map.get(target_id),
        "has_X_total_ohm": features.get("X_total_ohm") is not None,
        "has_X_ohm_per_km": features.get("X_ohm_per_km") is not None,
        "strom_limit_a": features.get("Strom_Limit_in_A"),
    }


edge_info = [_edge_summary(elem["data"]) for elem in edges]

df_edges = pd.DataFrame(edge_info)

print("\nBeispiel-Kanten:")
display(df_edges.head(20))

# -------------------------
# 4) Kanten ohne Reaktanz
#    UND nicht an Sammelschiene
# -------------------------

def has_reactance(row):
    """Return True when at least one of the row's reactance flags is set.

    Args:
        row: Mapping-like edge row with the boolean entries
            ``has_X_total_ohm`` and ``has_X_ohm_per_km``.
    """
    flag_columns = ("has_X_total_ohm", "has_X_ohm_per_km")
    return any(row[column] for column in flag_columns)

def touches_busbar(row):
    """Return True when the edge's source or target node is of type "busbar".

    Args:
        row: Mapping-like edge row with the entries ``source_type`` and
            ``target_type``.
    """
    end_columns = ("source_type", "target_type")
    return any(row[column] == "busbar" for column in end_columns)

# Row-wise flags: does the edge carry a reactance value / touch a busbar?
edge_has_x = df_edges.apply(has_reactance, axis=1)
edge_on_busbar = df_edges.apply(touches_busbar, axis=1)

# "no reactance AND not at a busbar" is the same as "NOT (reactance OR busbar)".
mask_no_x_and_no_bus = ~(edge_has_x | edge_on_busbar)

df_edges_problem = df_edges[mask_no_x_and_no_bus].copy()

print("\n==== KANTEN OHNE REAKTANZ (UND NICHT AN SAMMELSCHIENE) ====")
num_problem_edges = len(df_edges_problem)
print(f"Anzahl solcher Kanten: {num_problem_edges}")

if num_problem_edges == 0:
    print("Keine Kanten gefunden, die die Kriterien erfüllen.")
else:
    report_columns = [
        "id", "label", "source", "source_type", "target", "target_type", "strom_limit_a"
    ]
    display(df_edges_problem[report_columns])


==== GROBER ÜBERBLICK ====
Anzahl Knoten im JSON: 29
Anzahl Kanten im JSON: 29

Knotentypen im JSON:
type
uw_field    14
busbar       7
junction     6
battery      2
Name: count, dtype: int64

Beispiel-Knoten:


Unnamed: 0,id,label,type
0,WEDING,WEDING,busbar
1,TARP,TARP,busbar
2,JUBO,JUBO,busbar
3,SHUW,SHUW,busbar
4,BOLN,BOLN,busbar
5,SIES,SIES,busbar
6,SIEV,SIEV,busbar
7,SHUW_E24,SHUW_E24,uw_field
8,SHUW_E23,SHUW_E23,uw_field
9,JUBO_E01,JUBO_E01,uw_field



Beispiel-Kanten:


Unnamed: 0,id,label,source,target,source_type,target_type,has_X_total_ohm,has_X_ohm_per_km,strom_limit_a
0,SHUW_SS_E23,SHUW_SS_E23,SHUW,SHUW_E23,busbar,uw_field,False,False,
1,SHUW_SS_E24,SHUW_SS_E24,SHUW,SHUW_E24,busbar,uw_field,False,False,
2,JUBO_E01_JUBO,JUBO_E01_JUBO,JUBO,JUBO_E01,busbar,uw_field,False,False,
3,JUBO_E02_JUBO,JUBO_E02_JUBO,JUBO_E02,JUBO,uw_field,busbar,False,False,
4,JUBO_E03_JUBO,JUBO_E03_JUBO,JUBO_E03,JUBO,uw_field,busbar,False,False,
5,TARP_SS_TARP_E01,TARP_SS_TARP_E01,TARP_E01,TARP,uw_field,busbar,False,False,
6,TARP_SS_TARP_E03,TARP_SS_TARP_E03,TARP,TARP_E03,busbar,uw_field,False,False,
7,WEDING_SS_WEDING_E14,WEDING_SS_WEDING_E14,WEDING_E14,WEDING,uw_field,busbar,False,False,
8,WEDING_SS_WEDING_E12,WEDING_SS_WEDING_E12,WEDING_E12,WEDING,uw_field,busbar,False,False,
9,SIEV_SS_E01,SIEV_SS_E01,SIEV_E01,SIEV,uw_field,busbar,False,False,



==== KANTEN OHNE REAKTANZ (UND NICHT AN SAMMELSCHIENE) ====
Anzahl solcher Kanten: 0
Keine Kanten gefunden, die die Kriterien erfüllen.
