In [None]:
# %% 03_energy_need_score_v2
# Energy Need Score + Electrification Recommendation (robust version)

import os
import shutil
from datetime import datetime
import numpy as np
import pandas as pd
import geopandas as gpd

print("🚀 Starting Phase 3 v2 — Energy Need + Recommendation")

# Resolve the project root: when the working directory is the "notebooks"
# folder the root is its parent, otherwise the working directory itself.
if os.path.basename(os.getcwd()) == "notebooks":
    BASE = os.path.abspath("..")
else:
    BASE = os.path.abspath(".")

DATA_PROCESSED = os.path.join(BASE, "data_processed")
FRONTEND_DATA = os.path.join(BASE, "frontend", "public", "data")

# Both output folders must exist before anything is written to them.
for _out_dir in (DATA_PROCESSED, FRONTEND_DATA):
    os.makedirs(_out_dir, exist_ok=True)

# ----------------------------
# 1) Load clusters (centroids)
# ----------------------------
# Primary expected file produced in Phase 2:
candidates = [
    os.path.join(DATA_PROCESSED, "clusters.geojson"),
    os.path.join(DATA_PROCESSED, "road_clusters.geojson"),  # fallback name some runs produce
]

# Take the first candidate that exists on disk (list order = preference order).
clusters_path = next((path for path in candidates if os.path.exists(path)), None)

if clusters_path is None:
    expected = "\n".join(candidates)
    raise FileNotFoundError("Could not find clusters file. Expected one of:\n" + expected)

clusters = gpd.read_file(clusters_path)
print(f"✅ Loaded {len(clusters)} clusters from: {os.path.relpath(clusters_path)}")
print("Columns available:", list(clusters.columns))

# -------------------------------------------------------------
# 2) Identify usable columns (robust to naming differences)
# -------------------------------------------------------------
def pick_first(cols, df):
    """Return the first name in ``cols`` that is a column of ``df``.

    Names are checked in the order given, so earlier entries take
    precedence. Returns ``None`` when none of them is present.
    """
    return next((name for name in cols if name in df.columns), None)

# Candidate names are tried in order, so the first match wins.
# "dist_to_power_km" and "pop_index" are appended last because a recorded
# clusters file carries those names instead of the ones originally listed;
# without them the grid-distance and population signals are silently dropped.
# NOTE(review): this assumes dist_to_power_km is a distance in km to the
# nearest power infrastructure and pop_index is a relative population proxy —
# confirm against the Phase 2 notebook.
COL_ROAD_KM = pick_first(["total_road_km", "road_km", "roads_km", "total_roads_km"], clusters)
COL_POWER_KM = pick_first(["powerline_length_km", "power_km", "total_power_km"], clusters)
COL_GRID_DIST = pick_first(
    ["distance_to_grid_km", "grid_dist_km", "grid_distance_km", "dist_to_power_km"],
    clusters,
)
COL_POP = pick_first(["population", "pop", "est_pop", "pop_est", "pop_index"], clusters)

print("\n🔎 Selected columns (None = not found):")
print("  roads_km:", COL_ROAD_KM)
print("  power_km:", COL_POWER_KM)
print("  grid_distance_km:", COL_GRID_DIST)
print("  population:", COL_POP)

# ---------------------------------
# 3) Normalization helper functions
# ---------------------------------
def normalize(series: pd.Series) -> pd.Series:
    """Min-max scale ``series`` to the range 0..1.

    Values are coerced to numbers; anything non-numeric, NaN or infinite is
    treated as missing and replaced by the median of the remaining values
    before scaling.

    Args:
        series: Values to scale. The result keeps this series' index.

    Returns:
        A float Series on the same index. It is all zeros when the input is
        empty, has no usable values, or is (numerically) constant.
    """
    zeros = pd.Series(np.zeros(len(series)), index=series.index, dtype=float)

    values = pd.to_numeric(series, errors="coerce").astype(float)
    # An infinite entry would make min/max infinite and turn the whole result
    # into NaN/0, so it is handled like any other missing value.
    values = values.replace([np.inf, -np.inf], np.nan)
    if values.isna().all():
        return zeros

    values = values.fillna(values.median())
    low, high = values.min(), values.max()
    if np.isclose(low, high):
        # Constant input: avoid dividing by (near) zero.
        return zeros
    return (values - low) / (high - low)

# ---------------------------------------------
# 4) Compute normalized signals (if available)
#    Convention (higher = more need):
#      - roads_norm:  + (more roads → more potential demand → higher need)
#      - power_norm:  1 - norm(power_km) (fewer lines → higher need)
#      - grid_norm:   + (farther from grid → higher need)
#      - pop_norm:    + (more people → higher need)
# ---------------------------------------------
# (output column, source column or None, invert?) — inverted signals are
# reported as 1 - normalized value so that "less" means "more need".
_signal_specs = (
    ("roads_norm", COL_ROAD_KM, False),
    ("power_norm", COL_POWER_KM, True),
    ("grid_norm", COL_GRID_DIST, False),
    ("pop_norm", COL_POP, False),
)

signals = {}
for _signal_name, _source_col, _inverted in _signal_specs:
    if not _source_col:
        # Source column not found: the signal contributes nothing.
        signals[_signal_name] = pd.Series(
            np.zeros(len(clusters)), index=clusters.index, dtype=float
        )
        continue
    _scaled = normalize(clusters[_source_col])
    signals[_signal_name] = 1.0 - _scaled if _inverted else _scaled

# Attach every signal to the clusters table under its own column name.
for _signal_name, _signal_values in signals.items():
    clusters[_signal_name] = _signal_values

print("\n✅ Built normalized signals:")
print(clusters[["roads_norm", "power_norm", "grid_norm", "pop_norm"]].head())

# ---------------------------------------------
# 5) Score weights (edit here to tune behavior)
#    Weights sum doesn’t have to be 1; we’ll normalize to 0-100.
# ---------------------------------------------
W_ROADS = 0.35
W_POWER = 0.25
W_GRID  = 0.30
W_POP   = 0.10

w_sum = W_ROADS + W_POWER + W_GRID + W_POP
if w_sum <= 0:
    raise ValueError("All weights are zero; set at least one weight > 0.")

# Only signals whose source column was found take part in the score.
# A missing signal is an all-zero column; keeping its weight in the
# denominator would cap the score below 100 (e.g. at 35 when only roads are
# available) and make the Score thresholds used by the recommendation
# fallback unreachable.
_weighted_signals = (
    ("roads_norm", W_ROADS, COL_ROAD_KM),
    ("power_norm", W_POWER, COL_POWER_KM),
    ("grid_norm", W_GRID, COL_GRID_DIST),
    ("pop_norm", W_POP, COL_POP),
)
_active_signals = [(name, weight) for name, weight, col in _weighted_signals if col]
_active_weight = sum(weight for _, weight in _active_signals)

if _active_weight > 0:
    score01 = sum(weight * clusters[name] for name, weight in _active_signals) / _active_weight
else:
    # No usable signal at all: keep the previous outcome (every score is 0)
    # but say so instead of failing silently.
    print("⚠️ No usable signal with a positive weight; all scores set to 0.")
    score01 = pd.Series(np.zeros(len(clusters)), index=clusters.index, dtype=float)

# Express the weighted average on a 0-100 scale.
clusters["Score"] = (score01 * 100).clip(0, 100)

# ---------------------------------------------
# 6) Electrification Recommendation (rules)
#    Uses available fields; falls back to Score thresholds if needed.
# ---------------------------------------------
# Cut-offs used by the rules below (same values as before, now named so they
# can be tuned in one place).
GRID_NEAR_KM = 15     # at or below this distance → "Main Grid"
GRID_MID_KM = 40      # at or below this distance → mini-grid candidate
SCORE_OFF_GRID = 80   # fallback: Score at or above this → "Off-grid"
SCORE_MINI_GRID = 55  # fallback: Score at or above this → "Mini-grid"


def _as_float(value):
    """Return ``value`` as a float, or NaN when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")


def recommend(row, pop_median=None):
    """Pick an electrification option for one cluster row.

    Rules, in order:
      1. If a grid distance is available for the row:
         - distance <= GRID_NEAR_KM → "Main Grid"
         - distance <= GRID_MID_KM  → "Mini-grid" when the row's population is
           at or above the median (or unknown), otherwise "Off-grid"
         - farther away             → "Off-grid"
      2. Otherwise fall back to the row's "Score":
         >= SCORE_OFF_GRID → "Off-grid", >= SCORE_MINI_GRID → "Mini-grid",
         else "Main Grid".

    Args:
        row: A row of ``clusters``; must contain "Score" and, when detected,
            the grid-distance and population columns.
        pop_median: Median of the population column. Pass it when applying the
            function to many rows so it is not recomputed per row; when
            omitted it is computed from ``clusters`` on demand.

    Returns:
        One of "Main Grid", "Mini-grid" or "Off-grid".
    """
    score = row["Score"]

    # Missing columns and non-numeric values both end up as NaN.
    nan = float("nan")
    grid_d = nan if COL_GRID_DIST is None else _as_float(row.get(COL_GRID_DIST))
    pop_v = nan if COL_POP is None else _as_float(row.get(COL_POP))

    if not np.isnan(grid_d):
        if grid_d <= GRID_NEAR_KM:
            return "Main Grid"
        if grid_d > GRID_MID_KM:
            return "Off-grid"
        # Mid-range distance: population decides between mini-grid and off-grid.
        if np.isnan(pop_v):
            return "Mini-grid"
        if pop_median is None:
            pop_median = pd.to_numeric(clusters[COL_POP], errors="coerce").median()
        return "Mini-grid" if pop_v >= pop_median else "Off-grid"

    # Fallback purely on Score when grid distance is missing.
    if score >= SCORE_OFF_GRID:
        return "Off-grid"
    if score >= SCORE_MINI_GRID:
        return "Mini-grid"
    return "Main Grid"


# The median is the same for every row, so compute it once up front instead of
# once per row inside recommend().
_pop_median = (
    None
    if COL_POP is None
    else pd.to_numeric(clusters[COL_POP], errors="coerce").median()
)
clusters["Recommendation"] = clusters.apply(
    lambda cluster_row: recommend(cluster_row, pop_median=_pop_median), axis=1
)

# The outputs below are keyed by "cluster_id"; when the input file has no such
# column, number the clusters 0..n-1 in their current row order.
_has_cluster_id = "cluster_id" in clusters.columns
if not _has_cluster_id:
    clusters["cluster_id"] = np.arange(len(clusters))

# ---------------------------------------------
# 7) Save outputs
# ---------------------------------------------
# Scored clusters with geometry, for mapping.
scored_path = os.path.join(DATA_PROCESSED, "clusters_scored_v2.geojson")
_geo_columns = ["cluster_id", "Score", "Recommendation", "geometry"]
_scored_layer = clusters[_geo_columns]
_scored_layer.to_file(scored_path, driver="GeoJSON")
print(f"\n💾 Saved: {os.path.relpath(scored_path)}")

# Flat ranking table, highest score first.
scores_csv = os.path.join(DATA_PROCESSED, "scores.csv")
_table_columns = ["cluster_id", "Score", "Recommendation"]
_ranked = clusters[_table_columns].sort_values("Score", ascending=False)
_ranked.to_csv(scores_csv, index=False)
print(f"💾 Saved: {os.path.relpath(scores_csv)}")

# Optional: copy to frontend for the app. This step is best-effort — a
# failure is reported but does not stop the run.
dest_frontend = os.path.join(FRONTEND_DATA, "clusters_scored.geojson")
try:
    shutil.copyfile(scored_path, dest_frontend)
    print(f"📤 Copied → {os.path.relpath(dest_frontend)} (for the React app)")
except Exception as e:
    print(f"⚠️ Could not copy to frontend data folder: {e}")

# ---------------------------------------------
# 8) Quick summary
# ---------------------------------------------
# Distribution of the final scores.
print("\n📊 Score Summary:")
_score_stats = clusters["Score"].describe()
print(_score_stats)

# The five clusters with the highest score.
print("\n🔥 Top 5 high-need clusters:")
_summary_columns = ["cluster_id", "Score", "Recommendation"]
_by_score = clusters[_summary_columns].sort_values("Score", ascending=False)
print(_by_score.head(5))

print(f"\n✅ Phase 3 v2 complete at {datetime.now():%Y-%m-%d %H:%M}")

🚀 Starting Phase 3 — Energy Need Scoring and Recommendation...
✅ Loaded 20 clusters.
Columns available in file:
['lon', 'lat', 'cluster_id', 'total_road_km', 'dist_to_power_km', 'pop_index', 'norm_road_km', 'norm_dist_power', 'norm_pop', 'energy_need_score', 'recommendation', 'geometry']
Detected → Road: total_road_km, Power: dist_to_power_km, Grid/Distance: dist_to_power_km
✅ Energy Need Score computed successfully.
💾 Files saved:
 - clusters_scored_v2.geojson
 - scores_v2.csv

📊 Summary of Energy Need Scores:
count    20.000000
mean     66.488928
std      14.419745
min      47.411648
25%      52.772961
50%      66.804120
75%      72.928607
max      99.633669
Name: Score, dtype: float64

🔥 Top 5 High-Need Clusters:
    cluster_id      Score                   Recommendation
16          16  99.633669             Connect to Main Grid
9            9  88.431609             Connect to Main Grid
3            3  84.556582             Connect to Main Grid
17          17  77.827624  Mini-grid (