## 0. Settings

In [None]:
# 1. Standard library
import os
import gc
import glob
import re
import textwrap
import sys
from collections import Counter
from typing import Dict, Set, List, Tuple

# 2. Third-party libraries
import geopandas as gpd
import networkx as nx
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment

# 3. Visualization libraries (Matplotlib)
import matplotlib.image as img
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
from matplotlib import font_manager as fm
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from matplotlib.patches import Patch, PathPatch, Rectangle
from matplotlib.path import Path
from matplotlib.colors import to_rgba


sys.path.append("/home/pauluhill/Projects/2025_Kor_transaction/src")
from network_analysis import NetworkAnalyzer


# --- Set current working directory ---
# If running from notebooks/, move up one level so that the relative paths
# used below (assets/, data/, outputs/) resolve from the parent directory.
if os.getcwd().endswith('notebooks'):
    os.chdir('..')

# --- Matplotlib Korean font setup ---
# 1) Specify font file paths (made absolute relative to the current directory)
font_regular = os.path.abspath("assets/malgun.ttf")   # Malgun Gothic (regular)
font_bold = os.path.abspath("assets/malgunbd.ttf")    # Malgun Gothic (bold)

# 2) Register both font files with Matplotlib's font manager
fm.fontManager.addfont(font_regular)
fm.fontManager.addfont(font_bold)

# 3) Apply global font settings (family name is read from the regular font file)
fam = fm.FontProperties(fname=font_regular).get_name()  # e.g., 'Malgun Gothic'
plt.rcParams.update({
    "font.family": fam,
    "axes.unicode_minus": False,  # prevent minus sign rendering issues
})


## 1. Data Preparation

In [None]:
# Network Data Loading
#
# Reads every "network_by_<key>.csv" edge list, keeps only the datasets listed
# in `desired_order` (in that order) and stores one cleaned DataFrame per key
# in `nw_dict`.

# --- Get list of CSV files and sort them in desired order ---
all_files = glob.glob("data/processed/deal_network/deal_by/network_by_*.csv")
desired_order = [
    'all', 'man', 'innovation',
    'urban_size_소상공인', 'urban_size_중소기업', 'urban_size_중견기업', 'urban_size_대기업',
    'urban_age_1년 미만', 'urban_age_1~5년 미만', 'urban_age_5~10년 미만', 'urban_age_10년 이상'
]
# Reconstruct file list according to the desired order
# (dataset key = file name without the "network_by_" prefix and ".csv" suffix)
file_order_map = {os.path.basename(f).replace("network_by_", "").replace(".csv", ""): f for f in all_files}
sorted_files = [file_order_map[key] for key in desired_order if key in file_order_map]

# --- Load each network data into a DataFrame and store in a dictionary ---
nw_dict = {}
print("\nStarting to load network data...")
for file in sorted_files:
    key = os.path.basename(file).replace("network_by_", "").replace(".csv", "")
    try:
        nw = pd.read_csv(
            file,
            dtype={'14_시군구코드_buyer': str, '8_시군구코드_seller': str, '1_기준연도': int}
        )
        # Remove rows with invalid codes ('9999' prefix). `na=True` makes a
        # missing code count as invalid as well; without it the mask contains
        # NaN, `~` raises TypeError and the whole dataset is skipped by the
        # best-effort handler below.
        buyer_invalid = nw['14_시군구코드_buyer'].str.startswith('9999', na=True)
        seller_invalid = nw['8_시군구코드_seller'].str.startswith('9999', na=True)
        nw = nw[~buyer_invalid & ~seller_invalid].copy()
        # Positional rename: relies on the CSV having exactly these four
        # columns in this order (year, seller code, buyer code, deal count).
        nw.columns = ['기준연도', '시군구코드_seller', '시군구코드_buyer', '거래관계']
        nw_dict[key] = nw
        print(f" - {key} successfully loaded")
    except Exception as e:
        # Deliberately best-effort: report the failing file and keep loading the rest.
        print(f"File loading error: {file}, error: {e}")

print("\nNetwork data loading and sorting completed.")


## 2. Common Utility Function Definitions

In [None]:
ARROW_FILE = 'assets/north_arrow.png'
def add_north_arrow(ax, x, y, arrow_file, zoom=0.05):
    """Place the north-arrow image on ``ax`` at axes-fraction position (x, y).

    Does nothing when ``arrow_file`` does not exist on disk.
    """
    if not os.path.exists(arrow_file):
        return
    pixels = img.imread(arrow_file)
    icon = OffsetImage(pixels, zoom=zoom)
    anchored = AnnotationBbox(icon, (x, y), xycoords='axes fraction', frameon=False)
    ax.add_artist(anchored)

SCALE_LEN_M = 100_000
def add_scale_bar(ax, length, location=(0.1, 0.02), linewidth=3, color='black'):
    """Draw a horizontal scale bar of ``length`` data units on ``ax``.

    ``location`` is the bar's left end expressed as a fraction of the current
    x/y axis limits. The label shows ``length / 1000`` rounded, suffixed "km".
    """
    x_limits = ax.get_xlim()
    y_limits = ax.get_ylim()
    x_lo, x_hi = x_limits[0], x_limits[1]
    y_lo, y_hi = y_limits[0], y_limits[1]

    # Convert the fractional location into data coordinates.
    bar_x = x_lo + (x_hi - x_lo) * location[0]
    bar_y = y_lo + (y_hi - y_lo) * location[1]

    ax.plot([bar_x, bar_x + length], [bar_y, bar_y], color=color, linewidth=linewidth)

    caption = f'{round(length / 1000):,} km'
    ax.text(bar_x + length / 2, bar_y, caption, va='bottom', ha='center', fontsize=10)


MAX_ID = 230
NO_DATA = (0.92, 0.92, 0.92, 1.0)

def build_color_table(max_id=600):
    """Return ``{id: RGBA}`` for ids ``0 .. max_id - 1``.

    Colors come from the Set1, Set2 and Dark2 palettes concatenated in that
    order; ids beyond the palette length wrap around (modulo).
    """
    palette = []
    for cmap_name in ("Set1", "Set2", "Dark2"):
        palette.extend(plt.get_cmap(cmap_name).colors)

    n_colors = len(palette)
    table = {}
    for cid in range(max_id):
        table[cid] = mcolors.to_rgba(palette[cid % n_colors], 1.0)
    return table

COLOR_OF = build_color_table(600)

def groups_from_series(s: pd.Series) -> Dict[int, Set[str]]:
    """Group the index labels of ``s`` by its (integer-cast) values.

    Missing values are dropped first. Returns ``{group_id: {index labels}}``.
    """
    labels = s.dropna().astype(int)
    return {int(group_id): set(members.index) for group_id, members in labels.groupby(labels)}

def bump(mem: Dict[int, Counter], canon_id: int, nodes: Set[str], w: float = 1.0):
    """Add weight ``w`` to every node of ``nodes`` in the memory of ``canon_id``.

    A fresh ``Counter`` is created in ``mem`` when the id has no memory yet.
    """
    if canon_id not in mem:
        mem[canon_id] = Counter()
    tally = mem[canon_id]
    for node in nodes:
        tally[node] = tally[node] + w

def decay_memory(mem: Dict[int, Counter], rho: float):
    """Multiply every stored weight by ``rho`` in place.

    No-op when ``rho >= 0.9999``. Entries whose decayed weight falls below
    1e-9 are removed from their counter (the counter itself is kept).
    """
    if rho >= 0.9999:
        return
    for weights in list(mem.values()):
        for node in tuple(weights):
            decayed = weights[node] * rho
            if decayed < 1e-9:
                del weights[node]
            else:
                weights[node] = decayed

def weighted_jaccard(A_nodes: Set[str], canon_counter: Counter) -> float:
    """Score the overlap between a node set and a weighted membership counter.

    score = I / (|A| + W - I), where I is the summed counter weight of the
    nodes in ``A_nodes`` and W is the total counter weight. Returns 0.0 for an
    empty ``A_nodes`` or a non-positive denominator.

    Note: the result is not clamped; it exceeds 1 when the counter holds
    weights greater than 1 for nodes of ``A_nodes``.
    """
    if not A_nodes:
        return 0.0
    overlap = sum(canon_counter.get(node, 0.0) for node in A_nodes)
    union = len(A_nodes) + sum(canon_counter.values()) - overlap
    if union > 0:
        return overlap / union
    return 0.0

def prev_year_guard(A: Set[str], prev_series: pd.Series, cand_canon: int) -> Tuple[float, float, float]:
    """Compare set ``A`` with the members labelled ``cand_canon`` in ``prev_series``.

    Returns ``(precision, recall, expansion)`` where, with B the previous
    members: precision = |A∩B| / |A|, recall = |A∩B| / |B| and
    expansion = |A| / |B|. When there is no previous series or no member
    carries ``cand_canon``, returns ``(0.0, 0.0, inf)``.
    """
    no_match = (0.0, 0.0, float("inf"))
    if prev_series is None or prev_series.empty:
        return no_match

    labels = prev_series.dropna().astype(int)
    is_member = labels == cand_canon
    if is_member.sum() == 0:
        return no_match

    previous = set(labels.index[is_member])
    shared = len(A & previous)

    if A:
        precision = shared / len(A)
    else:
        precision = 0.0
    if previous:
        recall = shared / len(previous)
        expansion = len(A) / len(previous)
    else:
        recall = 0.0
        expansion = float("inf")
    return precision, recall, expansion

def safe_name(s: str) -> str:
    """Return ``str(s)`` with each run of disallowed characters replaced by "_".

    Allowed characters: ASCII digits and letters, Hangul syllables, "_", "."
    and "-".
    """
    unsafe_run = re.compile(r"[^0-9A-Za-z가-힣_.\-]+")
    return unsafe_run.sub("_", str(s))

def wrap_label(txt: str, width=12, max_lines=2) -> str:
    """Wrap ``txt`` to ``width`` columns, keeping at most ``max_lines`` lines.

    When lines are dropped, the last kept line is cut to ``width - 1``
    characters and an ellipsis ("…") is appended.
    """
    pieces = textwrap.wrap(txt, width=width)
    if len(pieces) <= max_lines:
        return "\n".join(pieces)

    kept = pieces[:max_lines]
    kept[-1] = kept[-1][:max(0, width - 1)] + "…"
    return "\n".join(kept)


## 3. Community detection

### 3.1. Detection

In [None]:
# Community detection cell.
# For every dataset in nw_dict and every weight column, this builds one graph
# per year through NetworkAnalyzer, runs each community-detection method,
# blanks out communities smaller than MIN_MODULE_NODES, and writes
#   - a GeoPackage with one "{method}_{year}" column per method/year, and
#   - a CSV summary (node/edge counts, module count, largest module, HHI).

# Analysis parameter setup
SGG_MAP_PATH = "data/processed/map/SGG_map_zone.gpkg"
COMMUNITY_METHODS = ['Infomap', "Leiden", "Louvain"]
WEIGHTS = ["거래관계"]
MIN_MODULE_NODES = 5  # Minimum community size (filter out if smaller)

# Execution loop
print("\n=== Step 4: Start community detection ===")
SGG_base = gpd.read_file(SGG_MAP_PATH)

for weight in WEIGHTS:
    for key, df in nw_dict.items():
        print(f"[{key}] {weight} community analysis started")
        summary_rows = []

        # Standardize edge data: rename to source/target/weights and drop
        # rows where any of the three is missing.
        tmp = (
            df.rename(columns={
                "시군구코드_seller": "source", "시군구코드_buyer": "target", weight: "weights",
            })[["기준연도", "source", "target", "weights"]]
            .dropna(subset=["source", "target", "weights"])
        )
        tmp["source"] = tmp["source"].astype(str)
        tmp["target"] = tmp["target"].astype(str)
        # Drop the exact placeholder code '9999' on either endpoint.
        tmp = tmp[tmp["source"] != '9999'].copy()
        tmp = tmp[tmp["target"] != '9999'].copy()
        # Non-numeric weights become 0 rather than being dropped.
        tmp["weights"] = pd.to_numeric(tmp["weights"], errors="coerce").fillna(0)
        # Collapse duplicate (year, source, target) rows by summing their weights.
        tmp = tmp.groupby(["기준연도", "source", "target"], as_index=False)["weights"].sum()
        # One edge table per year, keyed by the year as int.
        converted = {int(y): d[["source", "target", "weights"]].copy() for y, d in tmp.groupby("기준연도")}

        # Build graphs and run community detection
        # NOTE(review): the meaning of include_self / thres is defined inside
        # NetworkAnalyzer (not visible here) — confirm before changing them.
        analyzer = NetworkAnalyzer()
        analyzer.polygon_files["unizone1"] = SGG_MAP_PATH
        graphs = analyzer.create_graphs_from_data(converted, include_self=True, thres=100)

        out_tbl_dir = f"data/processed/communities/{key}"
        out_map_dir = f"data/processed/map/communities/{key}"
        os.makedirs(out_tbl_dir, exist_ok=True)
        os.makedirs(out_map_dir, exist_ok=True)
        # Work on a copy so the columns merged in below do not leak across datasets.
        SGG_map_result = SGG_base.copy()

        for method in COMMUNITY_METHODS:
            results = analyzer.analyze_communities_all_years(graphs, method=method)
            for year in sorted(results.keys()):
                cdf = results.get(year)
                G = graphs[year]

                # Filter out communities smaller than the minimum size
                # (their members get module_id = NaN, i.e. "no community").
                sizes = cdf["module_id"].value_counts()
                small_modules = sizes[sizes < MIN_MODULE_NODES].index.tolist()
                if small_modules:
                    cdf.loc[cdf['module_id'].isin(small_modules), 'module_id'] = np.nan

                # Merge results into GeoDataFrame as a new "{method}_{year}" column;
                # the left join keeps regions that have no assignment.
                SGG_map_result = SGG_map_result.merge(
                    cdf[["name", "module_id"]].rename(columns={"name": "SIGUNGU_CD", "module_id": f"{method}_{year}"}),
                    on="SIGUNGU_CD", how="left"
                )

                # Compute and add summary info
                valid_cdf = cdf.dropna(subset=['module_id'])
                if not valid_cdf.empty:
                    sizes = valid_cdf["module_id"].value_counts().sort_values(ascending=False)
                    # HHI = sum of squared community size shares.
                    hhi = float(((sizes / sizes.sum())**2).sum())
                    summary_rows.append({"dataset": key, "year": year, "method": method, "nodes": G.number_of_nodes(), "edges": G.number_of_edges(), "modules": int(valid_cdf["module_id"].nunique()), "largest_module_size": int(sizes.iloc[0]), "module_concentration(HHI)": round(hhi, 4)})
                else:
                    # No community survived the size filter for this year/method.
                    summary_rows.append({"dataset": key, "year": year, "method": method, "nodes": G.number_of_nodes(), "edges": G.number_of_edges(), "modules": 0, "largest_module_size": 0, "module_concentration(HHI)": 0.0})


        # Save results
        gpkg_path = f"{out_map_dir}/{weight}_SGG_map_zone_community.gpkg"
        SGG_map_result.to_file(gpkg_path, driver="GPKG")
        if summary_rows:
            pd.DataFrame(summary_rows).sort_values(["dataset", "method", "year"]).to_csv(f"{out_tbl_dir}/{weight}_summary.csv", index=False, encoding="utf-8-sig")
        print(f"[OK] Saved → {gpkg_path}")


### 3.2. Canonicalization

In [None]:
# Community ID canonicalization cell.
# Raw community labels are arbitrary per year. For each dataset/method this
# assigns "canonical" IDs that persist across years:
#   1. the first year's communities get IDs 0..n-1 (sorted by raw label);
#   2. each later year is first matched greedily against the previous year's
#      canonical communities (precision / recall / expansion guards);
#   3. whatever is left is matched against the accumulated membership memory
#      with the Hungarian algorithm;
#   4. anything still unmatched receives a brand-new ID.
# Results are stored as "{method}_canon_{year}" columns in a new GeoPackage.

# Analysis parameter setup
COMMUNITY_METHODS = ['Infomap', "Leiden", "Louvain"]
WEIGHTS = ["거래관계"]
BASE_DIR = "data/processed/map/communities"
OUT_DIR  = "outputs/gpkg/communities"
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR, exist_ok=True)

# Memory/matching thresholds
MEM_BASE_THRESH = 0.3    # Minimum similarity for memory-based matching
MEM_REVIVE_THRESH = 0.3  # Minimum similarity to revive a past ID
RHO = 1.0                # Memory decay rate (1.0 = cumulative, 0.9 = 10% decay per year)

# Previous-year priority matching conditions (to avoid abnormal merges)
PREV_PREC_MIN = 0.55     # Precision: share of this year's members that were in last year's community
PREV_RECALL_MIN = 0.35   # Recall: share of last year's members retained this year
EXPANSION_CAP_PREV = 2.0 # Expansion cap (this year size / last year size)

# Conditions for reviving past IDs
EXPANSION_CAP_REVIVE = 2.0  # Expansion cap (this year size / last seen size)

# Execution loop
print("\n=== Step 5: Start community ID canonicalization ===")
for key in os.listdir(BASE_DIR):
    sub = os.path.join(BASE_DIR, key)
    if not os.path.isdir(sub):
        continue

    for weight in WEIGHTS:
        gpkg = os.path.join(sub, f"{weight}_SGG_map_zone_community.gpkg")
        if not os.path.exists(gpkg):
            continue
        out_dir = os.path.join(OUT_DIR, key)
        os.makedirs(out_dir, exist_ok=True)
        out_gpkg = os.path.join(out_dir, f"{weight}_SGG_map_zone_community.gpkg")


        print(f"[{key}] Processing {weight}...")
        # Index by region code so node sets and label series share the same keys.
        g = gpd.read_file(gpkg).set_index("SIGUNGU_CD", drop=False)

        for method in COMMUNITY_METHODS:
            # Raw label columns are "{method}_{year}"; skip canonical columns.
            cols = [c for c in g.columns if c.startswith(f"{method}_") and "_canon_" not in c]
            years = sorted({int(c.split("_")[-1]) for c in cols})
            if not years:
                continue

            raw_series = {y: g[f"{method}_{y}"] for y in years}
            # canon_parts:    year -> Series of canonical IDs
            # memory:         canonical ID -> Counter(node -> accumulated weight)
            # last_seen_size: canonical ID -> size the last time the ID was used
            canon_parts, memory, last_seen_size = {}, {}, {}
            next_id = 0

            # Initialize with the first year
            y0 = years[0]
            g0 = groups_from_series(raw_series[y0])
            init_map = {r: i for i, r in enumerate(sorted(g0))}
            canon_parts[y0] = raw_series[y0].map(init_map)
            for r, cid in init_map.items():
                bump(memory, cid, g0[r])
                last_seen_size[cid] = len(g0[r])
            next_id = len(init_map)
            prev_canon_series = canon_parts[y0]

            # Sequential matching for subsequent years
            for y in years[1:]:
                decay_memory(memory, RHO)
                curr_groups = groups_from_series(raw_series[y])
                raws, canons = sorted(curr_groups.keys()), sorted(memory.keys())
                # mapping: raw label of this year -> canonical ID
                mapping, used_canons = {}, set()

                # Priority matching with previous year (greedy)
                prev_groups = groups_from_series(prev_canon_series)
                prev_cands = []
                for r in raws:
                    for c, B in prev_groups.items():
                        precision, recall, expansion = prev_year_guard(curr_groups[r], prev_canon_series, c)
                        if (precision >= PREV_PREC_MIN) and (recall >= PREV_RECALL_MIN) and (expansion <= EXPANSION_CAP_PREV):
                            # Small epsilon avoids division by zero in the F1 score.
                            f1 = (2 * precision * recall) / (precision + recall + 1e-12)
                            prev_cands.append((f1, len(curr_groups[r] & B), r, c))

                # Best candidates first: by F1, then by overlap size.
                prev_cands.sort(key=lambda t: (t[0], t[1]), reverse=True)
                used_r = set()
                for f1, inter, r, c in prev_cands:
                    # Each raw community and each canonical ID is used at most once.
                    if (r in used_r) or (c in used_canons):
                        continue
                    mapping[r] = c
                    used_r.add(r)
                    used_canons.add(c)

                # Unmatched communities: memory-based + Hungarian algorithm
                remaining_raws = [r for r in raws if r not in mapping]
                remaining_canons = [c for c in canons if c not in used_canons]
                if remaining_raws and remaining_canons:
                    # S[i, j] = similarity of raw community i to the memory of canonical ID j.
                    S = np.array([[weighted_jaccard(curr_groups[r], memory[c]) for c in remaining_canons] for r in remaining_raws])
                    # Cost = 1 - similarity, padded with constant-cost (1.0) dummy
                    # columns, one per raw community, so every raw community can
                    # be assigned to "no match" instead of a real ID.
                    C_aug = np.hstack([1.0 - S, np.ones((len(remaining_raws), max(1, len(remaining_raws))))])
                    ri, cj = linear_sum_assignment(C_aug)
                    for i, j in zip(ri, cj):
                        r = remaining_raws[i]
                        if j < len(remaining_canons):
                            c = remaining_canons[j]
                            mem_score = S[i, j]
                            # Size relative to the last time this ID was seen.
                            exp_rev = len(curr_groups[r]) / last_seen_size.get(c, 1)
                            # Both similarity thresholds and the expansion cap must hold.
                            if (mem_score >= MEM_BASE_THRESH) and (mem_score >= MEM_REVIVE_THRESH and exp_rev <= EXPANSION_CAP_REVIVE):
                                mapping[r] = c
                                used_canons.add(c)
                            else:
                                mapping[r] = next_id; next_id += 1
                        else:
                            # Assigned to a dummy column: treat as a new community.
                            mapping[r] = next_id; next_id += 1

                # Assign new IDs to any communities still unmatched
                for r in raws:
                    if r not in mapping:
                        mapping[r] = next_id; next_id += 1

                # Update memory and store results
                for r, cid in mapping.items():
                    nodes = curr_groups[r]
                    bump(memory, cid, nodes)
                    last_seen_size[cid] = len(nodes)
                canon_parts[y] = raw_series[y].map(mapping)
                prev_canon_series = canon_parts[y]

            # Add results as new columns in the GPKG file
            for y in years:
                g[f"{method}_canon_{y}"] = canon_parts.get(y)

        g = g.reset_index(drop=True)
        # Keep only the region code/name, the canonical columns and the geometry.
        g = g[[c for c in g.columns if c in ["SIGUNGU_CD", "SIGUNGU_NM"] + [c for c in g.columns if "_canon_" in c]] + ['geometry']]

        g.to_file(out_gpkg, driver="GPKG", index=False)
        print(f"  [OK] Canonical labels saved → {out_gpkg}")

print("[DONE] Community ID canonicalization completed")


## 4. Community visualization

### 4.1. Community map

- original

In [None]:
# Community map cell.
# For every dataset / method / year this draws the canonical community of each
# region with a fixed ID -> color mapping (COLOR_OF) and saves one PNG per
# year. Existing PNGs are skipped, so the cell can be re-run incrementally.

# ===== Settings =====
BASE_DIR = "outputs/gpkg/communities"
OUT_DIR  = "outputs/figures/communities/map"
os.makedirs(OUT_DIR, exist_ok=True)
#COMMUNITY_METHODS = ["Infomap", "Infomap_igraph", "Louvain", "Leiden", "Leiden_igraph"]
COMMUNITY_METHODS = ['Infomap', "Leiden", "Louvain"]
WEIGHTS = ["거래관계"]

CTPRVN = gpd.read_file("data/raw/bnd_sido_00_2024_2Q/bnd_sido_00_2024_2Q.shp").to_crs(epsg=5179)  # Provincial boundaries (optional)

# ===== Execution =====
for key in nw_dict.keys():  # Run in the order of nw_dict
    sub = os.path.join(BASE_DIR, key)

    for weight in WEIGHTS:
        gpkg = os.path.join(sub, f"{weight}_SGG_map_zone_community.gpkg")
        # A dataset without canonical results is skipped, as in the other
        # plotting cells, instead of aborting the whole run.
        if not os.path.exists(gpkg):
            print(f"[SKIP] {gpkg} not found")
            continue

        g = gpd.read_file(gpkg)
        if "SIGUNGU_CD" not in g.columns:
            raise ValueError("Column 'SIGUNGU_CD' is required in GPKG.")
        g = g.set_index("SIGUNGU_CD", drop=False)  # Prevent type mismatch

        # Reproject once per file; the geometry is the same for every year.
        g_proj = g.to_crs(epsg=5179)

        for method in COMMUNITY_METHODS:
            # Find {method}_canon_{year} columns
            years: List[int] = []
            pref = f"{method}_canon_"
            for c in g.columns:
                if c.startswith(pref):
                    try:
                        years.append(int(c.split("_")[-1]))
                    except ValueError:
                        # Column has the prefix but no numeric year suffix: ignore it.
                        pass
            years = sorted(set(years))
            if not years:
                continue

            for y in years:
                outdir = os.path.join(OUT_DIR, key)
                os.makedirs(outdir, exist_ok=True)
                outpath = os.path.join(outdir, f"{method}_{y}.png")

                if os.path.exists(outpath):
                    print(f"[SKIP] Already exists → {outpath}")
                    continue

                col = f"{method}_canon_{y}"
                if col not in g.columns:
                    continue

                # 1) Cast to integer + create color vector
                #    (regions without a community get the NO_DATA color)
                s = g[col].astype("Int64")
                colors = s.map(lambda v: COLOR_OF[int(v)] if pd.notna(v) else NO_DATA)

                # 2) Plot
                fig, ax = plt.subplots(figsize=(10, 9), dpi=100)
                g_proj.plot(ax=ax, color=colors, edgecolor="black", linewidth=0.3)
                CTPRVN.plot(ax=ax, color="none", edgecolor="black", linewidth=1)
                # North arrow is drawn only if the image file exists.
                add_north_arrow(ax, 0.82, 0.90, ARROW_FILE, zoom=0.18)
                add_scale_bar(ax, SCALE_LEN_M, (0.05, 0.03))
                ax.set_title(f"[{key}] {weight} — {method} (Y{y})")
                ax.axis("off")

                # 3) Fixed mapping legend (only IDs that appear in this year)
                used_ids = sorted(int(v) for v in s.dropna().unique())
                handles = [Patch(facecolor=COLOR_OF[i], edgecolor="k", label=f"M{i}") for i in used_ids]
                if handles:
                    ax.legend(handles=handles, title="Module (canonical)",
                              loc="lower right", frameon=True,
                              fontsize=8, title_fontsize=9, ncol=1)

                # 4) Save
                plt.tight_layout()
                fig.savefig(outpath, dpi=100)
                #plt.show()
                plt.close(fig)
                gc.collect()  # release figure memory between iterations
                print(f"[OK] {outpath}")

print("[DONE] Fixed-color plotting complete")


 - Within-Module Degree (z-score)

In [None]:
# file: step3_plot_maps_with_within_module_degree.py
# - Computes the within-module degree z-score of each region (SIGGU) and shows it as transparency (alpha).
# - The higher the within-community centrality (z-score), the more opaque; the lower, the more transparent.
# - The legend lists the z-score-based role classes (Hub, Non-hub, Peripheral).
# - The "ID -> color" mapping is the same across all years for the same canonical ID.

# ===== Settings =====
BASE_DIR = "outputs/gpkg/communities"
# Output folder for result images
OUT_DIR  = "outputs/figures/communities/map_zscore"
os.makedirs(OUT_DIR, exist_ok=True)
COMMUNITY_METHODS = ['Infomap', "Leiden", "Louvain"]
WEIGHTS = ["거래관계"]


CTPRVN = gpd.read_file("data/raw/bnd_sido_00_2024_2Q/bnd_sido_00_2024_2Q.shp").to_crs(epsg=5179)


# ===== Helpers =====
def _build_undirected_graph(edges):
    """Build an undirected graph whose 'weights' attribute sums all rows per node pair.

    ``nx.from_pandas_edgelist`` on an undirected Graph keeps only the last
    row's weight when a pair occurs more than once (seller→buyer and
    buyer→seller, or duplicate rows), so the edges are accumulated by hand.
    """
    # Same coercion as the detection step: non-numeric weights count as 0.
    weights = pd.to_numeric(edges['거래관계'], errors="coerce").fillna(0)
    graph = nx.Graph()
    for u, v, w in zip(edges['시군구코드_seller'], edges['시군구코드_buyer'], weights):
        if graph.has_edge(u, v):
            graph[u][v]['weights'] += w
        else:
            graph.add_edge(u, v, weights=w)
    return graph


def _within_module_z_scores(graph, node_to_comm):
    """Return {node: z-score of its within-module weighted degree}.

    Only nodes that are both in ``graph`` and in ``node_to_comm`` are scored.
    Mean and standard deviation are taken per community over those nodes; a
    community with zero standard deviation yields z = 0.0 for its members.
    """
    # Sum of edge weights towards neighbours of the same community.
    within = {}
    for n in graph.nodes():
        comm_n = node_to_comm.get(n)
        if comm_n is None:
            continue
        within[n] = sum(
            (graph[n][nb].get('weights', 1) for nb in graph.neighbors(n)
             if node_to_comm.get(nb) == comm_n),
            0.0,
        )

    # Per-community mean / std of the within-module degree.
    by_comm = {}
    for n, k in within.items():
        by_comm.setdefault(node_to_comm[n], []).append(k)
    stats = {comm: (np.mean(ks), np.std(ks)) for comm, ks in by_comm.items()}

    z_scores = {}
    for n, k in within.items():
        mean_k, std_k = stats[node_to_comm[n]]
        z_scores[n] = (k - mean_k) / std_k if std_k > 0 else 0.0
    return z_scores


def _alpha_for_z(z_score):
    """Map a z-score to the alpha of its role class (Hub / Non-hub / Peripheral)."""
    if z_score >= 2.5:
        return 1.0    # Hub (opaque)
    if z_score >= 1.0:
        return 0.6    # Non-hub
    return 0.25       # Peripheral (transparent)


# ===== Execution =====
for key in nw_dict.keys(): # Run in the order of nw_dict
    sub = os.path.join(BASE_DIR, key)
    if not os.path.isdir(sub):
        continue

    for weight in WEIGHTS:
        gpkg = os.path.join(sub, f"{weight}_SGG_map_zone_community.gpkg")
        if not os.path.exists(gpkg):
            continue

        g = gpd.read_file(gpkg)
        if "SIGUNGU_CD" not in g.columns:
            raise ValueError("GPKG must contain the 'SIGUNGU_CD' column.")
        g = g.set_index("SIGUNGU_CD", drop=False)

        # Reproject once per file; the geometry is the same for every year.
        g_proj = g.to_crs(epsg=5179)

        for method in COMMUNITY_METHODS:
            years: List[int] = []
            pref = f"{method}_canon_"
            for c in g.columns:
                if c.startswith(pref):
                    try:
                        years.append(int(c.split("_")[-1]))
                    except ValueError:
                        # Column has the prefix but no numeric year suffix: ignore it.
                        pass
            years = sorted(set(years))
            if not years:
                continue

            for y in years:
                outdir = os.path.join(OUT_DIR, key)
                os.makedirs(outdir, exist_ok=True)
                outpath = os.path.join(outdir, f"{method}_{y}.png")

                if os.path.exists(outpath):
                    print(f"[SKIP] Already exists → {outpath}")
                    continue

                col = f"{method}_canon_{y}"
                if col not in g.columns:
                    continue

                # Compute Within-Module Degree (z-score)
                z_score_values = {}

                df_year = nw_dict[key]
                df_year = df_year[df_year['기준연도'] == y]

                if not df_year.empty:
                    G = _build_undirected_graph(df_year)

                    # Region code (as str, matching the edge list) -> canonical community
                    communities_series = g[col].dropna()
                    node_to_comm = {str(idx): int(val) for idx, val in communities_series.items()}

                    z_score_values = _within_module_z_scores(G, node_to_comm)

                # Generate color + transparency vector
                s = g[col].astype("Int64")

                rgba_colors = []
                for idx, cid in s.items():
                    base_color_rgb = COLOR_OF.get(int(cid), NO_DATA[:3]) if pd.notna(cid) else NO_DATA[:3]

                    # Regions without a z-score are drawn as Peripheral.
                    z_score = z_score_values.get(str(idx), -1.0)

                    rgba_colors.append(mcolors.to_rgba(base_color_rgb, alpha=_alpha_for_z(z_score)))

                # Plot
                fig, ax = plt.subplots(figsize=(10, 9), dpi=100)
                g_proj.plot(ax=ax, color=rgba_colors, edgecolor="black", linewidth=0.3)
                CTPRVN.plot(ax=ax, color="none", edgecolor="black", linewidth=1)

                add_north_arrow(ax, 0.82, 0.90, ARROW_FILE, zoom=0.18)
                add_scale_bar(ax, SCALE_LEN_M, (0.05, 0.03))

                ax.set_title(f"[{key}] {weight} — {method} (Y{y})")
                ax.axis("off")

                # Legend: one entry per community present this year
                # (same fallback color as the map fill for unknown IDs)
                used_ids = sorted(int(v) for v in s.dropna().unique())
                handles = [Patch(facecolor=COLOR_OF.get(i, NO_DATA), edgecolor="k", label=f"M{i}") for i in used_ids]

                # Add z-score legend
                handles.extend([
                    Patch(facecolor='none', edgecolor='none', label=''), # Spacer
                    Patch(facecolor='gray', label='Within-Module Degree (Alpha)'),
                    Patch(facecolor='gray', edgecolor='k', alpha=1.0, label='Hub (z ≥ 2.5)'),
                    Patch(facecolor='gray', edgecolor='k', alpha=0.6, label='Non-hub (1.0 ≤ z < 2.5)'),
                    Patch(facecolor='gray', edgecolor='k', alpha=0.25, label='Peripheral (z < 1.0)'),
                ])

                if handles:
                    ax.legend(handles=handles, title="Module & Role",
                              loc="lower right", frameon=True,
                              fontsize=8, title_fontsize=9, ncol=1)

                # Save
                plt.tight_layout()
                fig.savefig(outpath, dpi=100)
                #plt.show()
                plt.close(fig)
                gc.collect()  # release figure memory between iterations
                print(f"[OK] {outpath}")

print("[DONE] Fixed-color plotting with z-score complete")


### 4.2. Sankey diagram

In [None]:
# file: step4_plot_sankey_consistent_colors.py
# - Sankey/Alluvial diagram (canonical community preserved across years)
# - Node colors/palette: same COLOR_OF mapping as step3 (fixed mapping)
# - Data source: {BASE_DIR}/{key}/{weight}_SGG_map_zone_community.gpkg
#   (BASE_DIR is set below to the canonicalized GeoPackage folder)
# - Link values: based on the number of regions (SIGGU).
#   If weighted flows are preferred, adjust the flow calculation section.

import os, re, gc, textwrap
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, Patch, PathPatch
from matplotlib.path import Path

# ===== Settings =====
BASE_DIR   = "outputs/gpkg/communities"
OUT_DIR    = "outputs/figures/communities/sankey"
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR, exist_ok=True)

COMMUNITY_METHODS = ['Infomap', "Leiden", "Louvain"]
WEIGHTS    = ["거래관계"]  # Determines which GPKG to read (node composition is the same)
FIG_W, FIG_H = 18, 7.6  # Figure width and height

# Layout parameters
LEFT, RIGHT = 0.05, 0.97   # Horizontal extent of the drawing area
TOP, BOTTOM = 0.90, 0.12   # Vertical extent of the drawing area
YEAR_LABEL_Y = 0.07        # Vertical position of the year labels
COL_GAP = 1.0       # Horizontal gap between years
NODE_W  = 0.55       # Node width
V_GAP   = 0.01      # Vertical gap between nodes
TEXT_K = 2          # Number of top cities to display per community
TEXT_WRAP = 12     # Characters per wrapped line
MAX_LINES = 2       # Maximum number of lines in node labels
FLOW_THRESH = 0.0   # Minimum link size (relative to total number of regions)

# ===== Utilities =====
def safe_name(s: str) -> str:
    """Make ``s`` usable as a filename.

    Every run of characters other than ASCII letters/digits, Hangul
    syllables, "_", "." and "-" is replaced by a single "_".
    """
    text = str(s)
    cleaned = re.sub(r"[^0-9A-Za-z가-힣_.\-]+", "_", text)
    return cleaned

def wrap_label(txt: str, width=12, max_lines=2) -> str:
    """Wrap ``txt`` to ``width`` columns and keep at most ``max_lines`` lines.

    Wrapping is done by ``textwrap.wrap`` with its default options, which
    means every whitespace character in ``txt`` (newlines included) is
    replaced by a space before the text is re-flowed.

    Args:
        txt: Text to wrap.
        width: Maximum number of characters per line.
        max_lines: Maximum number of lines to keep. Values below 1 yield an
            empty string.

    Returns:
        The wrapped lines joined with newlines. When lines had to be dropped,
        the last kept line ends with "…"; if that line is already ``width``
        characters long, its final character is replaced so the result never
        exceeds ``width``.

    Raises:
        ValueError: Propagated from ``textwrap.wrap`` when ``width`` is not
            positive and ``txt`` is non-empty.
    """
    # Without this guard, max_lines=0 left an empty list and lines[-1] raised
    # IndexError; negative values silently sliced from the end instead.
    if max_lines < 1:
        return ""

    lines = textwrap.wrap(txt, width=width)
    if len(lines) > max_lines:
        lines = lines[:max_lines]
        last = lines[-1]
        if len(last) >= width:
            # Make room for the ellipsis so the line stays within `width`.
            last = last[:max(0, width - 1)]
        lines[-1] = last + "…"
    return "\n".join(lines)

# ===== Execution =====
# For every dataset key (sub-directory of BASE_DIR), every weight and every
# community-detection method, draw one alluvial diagram: a column of stacked
# community nodes per year, joined by ribbons whose thickness is the number of
# regions shared by two communities in consecutive years.
# COLOR_OF and NO_DATA are not defined in this cell; they are expected to be
# provided by the earlier fixed-colour plotting step.
for key in sorted(os.listdir(BASE_DIR)):  # sorted -> reproducible processing order
    sub = os.path.join(BASE_DIR, key)
    if not os.path.isdir(sub):
        continue

    # ---- Load DII data (SIGUNGU-level) ----
    dii_path = f"outputs/tables/cities_DII_RSI/dii_{key}.csv"
    if os.path.exists(dii_path):
        # Keep SIG_CD as text: node labels look codes up via str(code), so a
        # column parsed as integers would never match and the DII ranking
        # below would silently do nothing.
        dii_df = pd.read_csv(dii_path, encoding="cp949", dtype={"SIG_CD": str})
        # Missing scores are dropped so they use the 0 fallback instead of
        # entering the sort as NaN.
        dii_dict = dii_df.set_index("SIG_CD")["DII"].dropna().to_dict()
    else:
        print(f"[WARN] DII file not found for {key}, fallback to zero dict")
        dii_dict = {}

    for weight in WEIGHTS:
        gpkg = os.path.join(sub, f"{weight}_SGG_map_zone_community.gpkg")
        if not os.path.exists(gpkg):
            print(f"[SKIP] {gpkg} not found")
            continue

        gdf = gpd.read_file(gpkg)
        if "SIGUNGU_CD" not in gdf.columns:
            print(f"[SKIP] {key}/{weight}: SIGUNGU_CD not found")
            continue
        name_col = "SIGUNGU_NM" if "SIGUNGU_NM" in gdf.columns else None

        # Denominator for FLOW_THRESH; max(1, ...) avoids a division by zero.
        total_regions = max(1, gdf["SIGUNGU_CD"].dropna().astype(str).nunique())

        for method in COMMUNITY_METHODS:
            # Map year -> "<method>_canon_<year>" column. Only purely numeric
            # suffixes are accepted, so any other "<method>_canon_*" column is
            # ignored instead of crashing the int() conversion.
            prefix = f"{method}_canon_"
            col_of_year = {}
            for c in gdf.columns:
                suffix = c[len(prefix):] if c.startswith(prefix) else ""
                if re.fullmatch(r"[0-9]+", suffix):
                    col_of_year[int(suffix)] = c
            years = sorted(col_of_year)
            if len(years) < 2:
                print(f"[SKIP] {key}/{weight} {method}: need >= 2 canonical years")
                continue
            n_years = len(years)

            node_sizes = {}    # (year, module) -> number of regions
            node_codes = {}    # (year, module) -> [SIGUNGU codes]
            node_names = {}    # (year, module) -> [SIGUNGU names]
            year_modules = {y: [] for y in years}

            # --- collect community composition per year ---
            for y in years:
                col = col_of_year[y]
                keep_cols = ["SIGUNGU_CD", col] + ([name_col] if name_col else [])
                subdf = gdf[keep_cols].dropna(subset=[col]).copy()
                subdf[col] = subdf[col].astype(int)
                for m, grp in subdf.groupby(col):
                    node = (y, int(m))
                    region_codes = grp["SIGUNGU_CD"].astype(str).tolist()
                    node_sizes[node] = grp.shape[0]
                    node_codes[node] = region_codes
                    # Without a name column the codes double as display names.
                    node_names[node] = (
                        grp[name_col].astype(str).tolist() if name_col else region_codes
                    )
                    year_modules[y].append(int(m))

            for y in years:
                year_modules[y] = sorted(set(year_modules[y]))

            # --- compute flows between consecutive years ---
            # Regions lacking a community in either year are left out. Keys
            # are inserted in row order within each year pair, which the
            # stable sort in the drawing step uses to order equal-sized flows.
            flows = {}  # (y0, m0, y1, m1) -> number of shared regions
            for y0, y1 in zip(years, years[1:]):
                c0, c1 = col_of_year[y0], col_of_year[y1]
                both = gdf[[c0, c1]].dropna()
                for m0, m1 in zip(both[c0].astype(int), both[c1].astype(int)):
                    k = (y0, int(m0), y1, int(m1))
                    flows[k] = flows.get(k, 0) + 1

            # apply flow threshold
            flows = {k: v for k, v in flows.items() if v / total_regions >= FLOW_THRESH}
            if not flows:
                print(f"[SKIP] {key}/{weight} {method}: no flows after threshold")
                continue

            # --- figure setup ---
            x_positions = {y: i * COL_GAP for i, y in enumerate(years)}
            x_min, x_max = -COL_GAP*0.5, (n_years-1)*COL_GAP + COL_GAP*0.5

            fig, ax = plt.subplots(figsize=(FIG_W, FIG_H), dpi=100)
            ax.set_xlim(x_min, x_max)
            ax.set_ylim(0, 1)
            ax.axis("off")
            fig.subplots_adjust(left=LEFT, right=RIGHT, top=TOP, bottom=BOTTOM)

            # --- compute node bounds (vertical stacking) ---
            # Each year's column fills the 0..1 y-range: node heights are
            # proportional to region counts, separated by V_GAP.
            node_bounds = {}  # (year, module) -> (bottom, top)
            year_scale = {}   # year -> height of one region in that column
            for y in years:
                mods = year_modules[y]
                sizes = [node_sizes.get((y, m), 0) for m in mods]
                total_size_y = sum(sizes)
                usable_h = 1.0 - (len(mods) - 1) * V_GAP
                scale = usable_h / total_size_y if total_size_y > 0 else 0.0
                year_scale[y] = scale
                y_cursor = 0.0
                for m, sz in zip(mods, sizes):
                    bottom = y_cursor
                    top_edge = bottom + sz * scale
                    node_bounds[(y, m)] = (bottom, top_edge)
                    y_cursor = top_edge + V_GAP

            # --- draw nodes and labels ---
            for (y, m), (bottom, top_edge) in node_bounds.items():
                x = x_positions[y]
                face = COLOR_OF.get(int(m), NO_DATA)
                rect = Rectangle((x - NODE_W/2, bottom), NODE_W, top_edge - bottom,
                                 facecolor=face, edgecolor="black", lw=0.6)
                ax.add_patch(rect)

                # Node label: top TEXT_K city names by DII + remaining count.
                # sorted() is stable, so regions without a DII score (treated
                # as 0) keep their original relative order.
                members = zip(node_codes.get((y, m), []), node_names.get((y, m), []))
                ranked = sorted(members,
                                key=lambda cn: dii_dict.get(str(cn[0]), 0),
                                reverse=True)
                top_names = [n for _, n in ranked[:TEXT_K]]
                n_rest = len(ranked) - len(top_names)
                # NOTE(review): wrap_label() relies on textwrap.wrap, which
                # turns this "\n" into a space, so the "+N" suffix is not
                # forced onto its own line - confirm that this is intended.
                label = "·".join(top_names) + "\n" + (f"+{n_rest}" if n_rest > 0 else "")
                label_txt = wrap_label(label, width=TEXT_WRAP, max_lines=MAX_LINES)

                ax.text(x, (bottom + top_edge) / 2, label_txt,
                        ha="center", va="center", fontsize=9)

            # --- draw flows ---
            # Regions already consumed on the outgoing (src) / incoming (tgt)
            # side of each node, so ribbons stack instead of overlapping.
            used_src = {k: 0.0 for k in node_bounds}
            used_tgt = {k: 0.0 for k in node_bounds}

            for y0, y1 in zip(years, years[1:]):
                pair_keys = [k for k in flows if k[0] == y0 and k[2] == y1]
                pair_keys.sort(key=lambda k: -flows[k])  # largest flows first

                s_scale = year_scale[y0]
                t_scale = year_scale[y1]

                for (yy0, m0, yy1, m1) in pair_keys:
                    v = flows[(yy0, m0, yy1, m1)]
                    s_bottom, _ = node_bounds[(yy0, m0)]
                    t_bottom, _ = node_bounds[(yy1, m1)]

                    s_off = used_src[(yy0, m0)]
                    t_off = used_tgt[(yy1, m1)]

                    # Vertical extent of the ribbon on the source / target node.
                    s_a = s_bottom + s_off * s_scale
                    s_b = s_bottom + (s_off + v) * s_scale
                    t_a = t_bottom + t_off * t_scale
                    t_b = t_bottom + (t_off + v) * t_scale

                    used_src[(yy0, m0)] += v
                    used_tgt[(yy1, m1)] += v

                    x0 = x_positions[yy0] + NODE_W/2
                    x1 = x_positions[yy1] - NODE_W/2
                    dx = (x1 - x0)
                    ctrl = 0.5  # control-point offset as a fraction of dx

                    # Closed outline: lower cubic Bezier edge left-to-right,
                    # up the target side, upper Bezier edge back to the source.
                    path_data = [
                        (Path.MOVETO, (x0, s_a)),
                        (Path.CURVE4, (x0 + ctrl*dx, s_a)),
                        (Path.CURVE4, (x1 - ctrl*dx, t_a)),
                        (Path.CURVE4, (x1, t_a)),
                        (Path.LINETO, (x1, t_b)),
                        (Path.CURVE4, (x1 - ctrl*dx, t_b)),
                        (Path.CURVE4, (x0 + ctrl*dx, s_b)),
                        (Path.CURVE4, (x0, s_b)),
                        (Path.CLOSEPOLY, (x0, s_a)),
                    ]
                    path_codes, verts = zip(*path_data)
                    # Ribbons take the colour of their source community.
                    ribbon_color = COLOR_OF.get(int(m0), (0.6, 0.6, 0.6))
                    patch = PathPatch(Path(verts, path_codes),
                                      facecolor=ribbon_color, edgecolor="none", alpha=0.35)
                    ax.add_patch(patch)

            # --- add year labels ---
            for y in years:
                n_mods = len(year_modules.get(y, []))
                label_txt = f"{y} (n = {n_mods})"
                ax.text(x_positions[y], YEAR_LABEL_Y - 0.15,
                        label_txt, ha="center", va="center",
                        fontsize=11, color="black")

            # --- legend (module IDs) ---
            used_ids = sorted({m for (_, m) in node_bounds})
            if used_ids:
                patches = [
                    Patch(facecolor=COLOR_OF.get(int(i), NO_DATA), edgecolor="black", label=f"M{i}")
                    for i in used_ids
                ]
                ax.legend(
                    handles=patches,
                    title="Module (canonical)",
                    loc="upper left",
                    bbox_to_anchor=(1.01, 1.0),
                    frameon=True,
                    fontsize=8,
                    title_fontsize=9,
                    ncol=1,
                )

            # --- save figure ---
            os.makedirs(OUT_DIR, exist_ok=True)
            out_png = os.path.join(OUT_DIR, f"sankey_{key}_{weight}_{safe_name(method)}.png")
            # Save through the figure object rather than pyplot's implicit
            # "current figure".
            fig.savefig(out_png, dpi=100, bbox_inches="tight")
            plt.close(fig)
            gc.collect()
            print(f"[OK] {out_png}")

print("[DONE] Sankey plotting complete")
