## 0. Setting

In [1]:
from tqdm import tqdm
import glob
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
import matplotlib.image as mimg
import geopandas as gpd
import pandas as pd
import os
import warnings
import networkx as nx
os.chdir(os.getcwd())

# --- Set current working directory ---
# Change the working directory to the project root
import os
if os.getcwd().endswith('notebooks'):
    os.chdir('..')


# 1) Specify the font path using absolute paths (adjust filename/path as needed)
font_regular = os.path.abspath("assets/malgun.ttf")     # Regular
font_bold    = os.path.abspath("assets/malgunbd.ttf")   # Bold

# 3) Register fonts with Matplotlib font manager
fm.fontManager.addfont(font_regular)
fm.fontManager.addfont(font_bold)
try:
    # Rebuild cache (not always necessary depending on version)
    fm._rebuild()  # Private method, but used here as a workaround
except Exception:
    pass

# 4) Safely get the family name from the font file and apply globally
fam = fm.FontProperties(fname=font_regular).get_name()  # Expected: 'Malgun Gothic'
plt.rcParams.update({
    "font.family": fam,
    "axes.unicode_minus": False,  # Prevent minus sign from being rendered incorrectly
})

ARROW_FILE = "assets/north_arrow.png"  
def add_north_arrow(ax, x, y, arrow_file, zoom=0.05):
    """Add a north arrow image to the plot."""
    im = mimg.imread(arrow_file)
    ax.add_artist(AnnotationBbox(OffsetImage(im, zoom=zoom), (x, y), 
                                 xycoords='axes fraction', frameon=False))

def add_scale_bar(ax, length, location=(0.1,0.02), linewidth=3, color='black'):
    """Add a scale bar to the plot."""
    xlim, ylim = ax.get_xlim(), ax.get_ylim()
    sb_x = xlim[0] + (xlim[1]-xlim[0])*location[0]
    sb_y = ylim[0] + (ylim[1]-ylim[0])*location[1]
    ax.plot([sb_x, sb_x+length], [sb_y, sb_y], color=color, linewidth=linewidth)
    ax.text(sb_x+length/2, sb_y, f'{round(length/1000):,} km', 
            va='bottom', ha='center', fontsize=10)


## 1. Data Preparation

In [2]:
# Get all matching CSV files
all_files = glob.glob("data/processed/deal_network/deal_by/network_by_*.csv")

# Create a dictionary to store DataFrames
nw_dict = {}

for file in all_files:
    # Extract key from file name, e.g., 'network_by_age (man).csv' → 'age (man)'
    key = os.path.basename(file).replace("network_by_", "").replace(".csv", "")
    
    # Read the CSV file
    nw = pd.read_csv(file, dtype={'14_시군구코드_buyer':str, '8_시군구코드_seller':str, '1_기준연도':int})
    
    nw = nw[~nw['14_시군구코드_buyer'].str.startswith('9999')&~nw['8_시군구코드_seller'].str.startswith('9999')]
    nw.columns = ['기준연도', '시군구코드_seller', '시군구코드_buyer', '거래관계']
    
    # Store DataFrame in dictionary
    nw_dict[key] = nw

desired_order = [
    'all',
    'man',
    'innovation',
    'urban_size_소상공인',
    'urban_size_중소기업',
    'urban_size_중견기업',
    'urban_size_대기업',
    'urban_age_1년 미만',
    'urban_age_1~5년 미만',
    'urban_age_5~10년 미만',
    'urban_age_10년 이상'
]

# 2. 새로운 딕셔너리를 만들어 순서를 적용합니다.
#    (혹시 모를 에러를 방지하기 위해 nw_dict에 실제 있는 키만 가져옵니다.)
ordered_nw_dict = {key: nw_dict[key] for key in desired_order if key in nw_dict}

# 3. 원래 nw_dict를 정렬된 딕셔너리로 교체합니다.
nw_dict = ordered_nw_dict

SGG_map = gpd.read_file('data/processed/map/SGG_map.gpkg').to_crs(epsg=4326)
SGG_map = SGG_map[['SIG_KOR_NM', 'SIG_CD', 'geometry']].copy()

# 표준화된 컬럼 생성
SGG_map['SIGUNGU_NM'] = SGG_map['SIG_KOR_NM']
SGG_map['SIGUNGU_CD'] = SGG_map['SIG_CD']
SGG_map['adm_nm']     = SGG_map['SIG_KOR_NM']

SGG_map = SGG_map[[ 'SIGUNGU_CD', 'SIGUNGU_NM', 'geometry']]
SGG_map.to_file('data/processed/map/SGG_map_zone.gpkg', driver='GPKG')

## 3. Network centrality calculation

In [3]:
SGG_map = gpd.read_file('data/processed/map/SGG_map_zone.gpkg').to_crs(epsg=4326)
SIDO_NM_map = {
    '11': '서울특별시', '26': '부산광역시', '27': '대구광역시', '28': '인천광역시',
    '29': '광주광역시', '30': '대전광역시', '31': '울산광역시', '36': '세종특별자치시',
    '41': '경기도', '51': '강원특별자치도', '43': '충청북도', '44': '충청남도',
    '52': '전북특별자치도', '46': '전라남도', '47': '경상북도', '48': '경상남도',
    '50': '제주특별자치도'
}

warnings.filterwarnings("ignore")

# ====== User-adjustable parameters ======
out_tbl_dir = "outputs/tables/centralities"
weights = ["거래관계"]   # Can be ["거래횟수", "거래관계", "거래액"], etc.
drop_nodes = ["9999"]   # Codes to be masked/removed
# ========================================

# Prepare output directory
os.makedirs(out_tbl_dir, exist_ok=True)

# ── Automatically detect code column in SGG_map (only once) ──
_code_candidates = ["SIGUNGU_CD", "SIG_CD", "ADM_CD", "code", "CODE"]
_code_col = next((c for c in _code_candidates if c in SGG_map.columns), None)
if _code_col is None:
    raise ValueError("No administrative code column found in SGG_map. (e.g., SIGUNGU_CD, SIG_CD)")

_all_zone_nodes = (
    SGG_map[_code_col].astype(str).str.zfill(5).unique().tolist()
)

# 2) Execution (procedural, no separate module/function)
_eps = 1e-9
_drop_set_orig = set(str(x) for x in (drop_nodes or []))
_drop_set_5 = set([s.zfill(5) for s in _drop_set_orig])
_drop_both = set(_drop_set_orig) | set(_drop_set_5)


for key, df in nw_dict.items():
    if not isinstance(df, pd.DataFrame) or df.empty:
        print(f"[WARN] '{key}' is empty — skipped")
        continue
    if "기준연도" not in df.columns:
        raise ValueError(f"[{key}] Missing '기준연도' column.")

    years = sorted(pd.unique(df["기준연도"].dropna()))
    print(f"▶ Network '{key}' — {len(years)} years, {len(df):,} raw edges")

    for w in weights:
        if w not in df.columns:
            print(f"  - [WARN] '{key}' has no weight column '{w}' — skipped")
            continue
        subdir = os.path.join(out_tbl_dir, key, w)
        os.makedirs(subdir, exist_ok=True)

        # ── Process per-year ─────────────────────────────────────────────
        for y in years:
            if os.path.exists(os.path.join(subdir, f"centrality_{y}.csv")):
                print(f"  - [{key}/{w}/{y}] Already processed — skipped")
                continue

            sub = df.loc[df["기준연도"] == y, ["시군구코드_seller", "시군구코드_buyer", w]].copy()
            if sub.empty:
                out_csv = os.path.join(subdir, f"centrality_{y}.csv")
                pd.DataFrame(columns=[
                    "node","in_degree","out_degree","in_strength","out_strength",
                    "in_degree_c","out_degree_c","closeness","betweenness","eigenvector","pagerank"
                ]).to_csv(out_csv, index=False, encoding="utf-8-sig")
                print(f"  - [{key}/{w}/{y}] Empty year — saved empty CSV")
                continue

            # Aggregate (seller, buyer) + remove self-loops
            sub[w] = pd.to_numeric(sub[w], errors="coerce").fillna(0.0).astype(float)
            sub = sub.dropna(subset=["시군구코드_seller", "시군구코드_buyer"]) 
            agg = (
                sub.groupby(["시군구코드_seller", "시군구코드_buyer"], as_index=False)[w]
                   .sum()
            )
            agg = agg[agg["시군구코드_seller"] != agg["시군구코드_buyer"]].reset_index(drop=True)

            # Build directed graph (DiGraph) + define length
            G = nx.DiGraph()
            for _, r in agg.iterrows():
                u = r["시군구코드_seller"]
                v = r["시군구코드_buyer"]
                weight_val = float(max(0.0, r[w]))
                if G.has_edge(u, v):
                    G[u][v]["weight"] += weight_val
                else:
                    G.add_edge(u, v, weight=weight_val)
                    
            for u, v, data in G.edges(data=True):
                ww = float(data.get("weight", 0.0))
                data["length"] = 1.0 / (ww + _eps) if ww > 0 else 1.0 / _eps
                
            # >>> INSERT A: Add nodes from SGG_map (include isolated nodes)
            nodes_to_add = [n for n in _all_zone_nodes if n not in _drop_both and n not in G]
            if nodes_to_add:
                G.add_nodes_from(nodes_to_add)

            nodes = list(G.nodes())


            # Compute centralities (same logic as module version)
            if len(nodes) == 0:
                central = pd.DataFrame(columns=[
                    "node","in_strength","out_strength","degree_strength",
                    "closeness","betweenness","eigenvector","pagerank"
                ])
            else:
                in_strength = dict(G.in_degree(weight="weight"))
                out_strength = dict(G.out_degree(weight="weight"))
                degree_strength = {u: in_strength.get(u, 0.0) + out_strength.get(u, 0.0) for u in nodes}

                closeness = nx.closeness_centrality(G, distance="length")
                betweenness = nx.betweenness_centrality(G, weight="length", normalized=True)
                eigen = nx.eigenvector_centrality(G, max_iter=5000, tol=1e-6, weight="weight")
                pagerank = nx.pagerank(G, alpha=0.85, weight="weight")


                central = pd.DataFrame({
                    "node": nodes,
                    "in_strength": [float(in_strength.get(u, 0.0)) for u in nodes],
                    "out_strength": [float(out_strength.get(u, 0.0)) for u in nodes],
                    "degree_strength": [float(degree_strength.get(u, 0.0)) for u in nodes],
                    "closeness": [float(closeness.get(u, 0.0)) for u in nodes],
                    "betweenness": [float(betweenness.get(u, 0.0)) for u in nodes],
                    "eigenvector": [float(eigen.get(u, 0.0)) for u in nodes],
                    "pagerank": [float(pagerank.get(u, 0.0)) for u in nodes],
                })
            # Apply drop-node filtering
            if not central.empty:
                central = central[~central["node"].astype(str).isin(_drop_set_orig) & ~central["node"].astype(str).isin(_drop_set_5)]
                
            central = pd.merge(
                central,
                SGG_map[['SIGUNGU_CD','SIGUNGU_NM']],
                left_on='node', right_on='SIGUNGU_CD', how='left'
            )
            central['SIDO_CD'] = central['node'].str[:2].str.zfill(2)
            central['SIDO_NM'] = central['SIDO_CD'].map(SIDO_NM_map)
            central = central.drop(columns=['SIGUNGU_CD', 'SIDO_CD'])

            out_csv = os.path.join(subdir, f"centrality_{y}.csv")
            central.to_csv(out_csv, index=False, encoding="utf-8-sig")


▶ Network 'all' — 7 years, 359,925 raw edges
  - [all/거래관계/2016] Already processed — skipped
  - [all/거래관계/2017] Already processed — skipped
  - [all/거래관계/2018] Already processed — skipped
  - [all/거래관계/2019] Already processed — skipped
  - [all/거래관계/2020] Already processed — skipped
  - [all/거래관계/2021] Already processed — skipped
  - [all/거래관계/2022] Already processed — skipped
▶ Network 'man' — 7 years, 284,976 raw edges
  - [man/거래관계/2016] Already processed — skipped
  - [man/거래관계/2017] Already processed — skipped
  - [man/거래관계/2018] Already processed — skipped
  - [man/거래관계/2019] Already processed — skipped
  - [man/거래관계/2020] Already processed — skipped
  - [man/거래관계/2021] Already processed — skipped
  - [man/거래관계/2022] Already processed — skipped
▶ Network 'innovation' — 7 years, 85,148 raw edges
  - [innovation/거래관계/2016] Already processed — skipped
  - [innovation/거래관계/2017] Already processed — skipped
  - [innovation/거래관계/2018] Already processed — skipped
  - [innovation/거래관계/2

# 3. network centrality map

In [4]:
# === User-defined parameters (edit only if needed) ===
BASE_DIR = "outputs/tables/centralities"   # Root directory for centrality CSV files
ORIGIN_DIR = "outputs/figures/centrality/centralities_origin"        # Output root for raw maps
QUANT_DIR  = "outputs/figures/centrality/centralities_percentile"    # Output root for percentile maps
CRS_PROJ = "EPSG:5179"                       # Recommended: meter-based projection (for scale accuracy)
NETS_FILTER    = None                        # Example: ["all_masked","innovation_masked"] (None = auto)
WEIGHTS_FILTER = None                        # Example: ["거래액"] (None = auto)
YEARS_FILTER   = None                        # Example: ["2020","2021"] or [2020,2021] (None = all)
METRICS_FILTER = None                        # None = default metrics + strength
CTPRVN = gpd.read_file("data/raw/bnd_sido_00_2024_2Q/bnd_sido_00_2024_2Q.shp").to_crs(epsg=5179)  # Provincial boundaries (optional)

# === Load and preprocess SGG_map ===
SGG_map_path = "data/processed/map/SGG_map_zone.gpkg" 
SGG_map = gpd.read_file(SGG_map_path)
SGG_map["SIGUNGU_CD"] = SGG_map["SIGUNGU_CD"].astype(str).str.zfill(5)
if CRS_PROJ:
    SGG_map = SGG_map.to_crs(CRS_PROJ)

# === Auto-detect network folders (apply filter if specified) ===
if NETS_FILTER is None:
    NETS = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d))]
    NETS.sort()
else:
    NETS = NETS_FILTER

# === Main loop (procedural, no functions) ===
for net in NETS:
    net_dir = os.path.join(BASE_DIR, net)
    if not os.path.isdir(net_dir):
        continue

    # Auto-detect weight folders (apply filter if specified)
    if WEIGHTS_FILTER is None:
        W_LIST = [d for d in os.listdir(net_dir) if os.path.isdir(os.path.join(net_dir, d))]
        W_LIST.sort()
    else:
        W_LIST = WEIGHTS_FILTER

    for w in W_LIST:
        w_dir = os.path.join(net_dir, w)
        if not os.path.isdir(w_dir):
            continue

        # Build list of metrics to plot
        base_metrics = ["pagerank", "eigenvector", "in_strength", "out_strength", "degree_strength"]
        METRIC_LIST = METRICS_FILTER or (base_metrics)

        # Iterate over centrality_*.csv files
        for fname in sorted(os.listdir(w_dir)):
            if not fname.startswith("centrality_") or not fname.endswith(".csv"):
                continue

            year_label = fname.replace("centrality_", "").replace(".csv", "")
            # Apply year filter
            if YEARS_FILTER is not None and str(year_label) not in set(map(str, YEARS_FILTER)):
                continue

            csv_path = os.path.join(w_dir, fname)
            central = pd.read_csv(csv_path)
            if central.empty or "node" not in central.columns:
                continue
            central["node"] = central["node"].astype(str).str.zfill(5)

            # Join with SGG_map
            g = SGG_map.merge(central, left_on="SIGUNGU_CD", right_on="node", how="left")

            # Prepare output folders
            raw_out_dir = os.path.join(ORIGIN_DIR, net)     # ← Root for raw maps
            quant_out_dir = os.path.join(QUANT_DIR, net)    # ← Root for percentile maps
            os.makedirs(raw_out_dir, exist_ok=True)
            os.makedirs(quant_out_dir, exist_ok=True)

            for metric in METRIC_LIST:
                if metric not in g.columns:
                    continue

                safe_metric = metric.replace("/", "_").replace("\\", "_")
                raw_png   = os.path.join(raw_out_dir,   f"map_{safe_metric}_{year_label}.png")
                quant_png = os.path.join(quant_out_dir, f"map_{safe_metric}_{year_label}.png")

                # ---- RAW value map ----
                if os.path.exists(raw_png):
                    print(f"[OK] {raw_png} already exists")
                    continue

                fig, ax = plt.subplots(figsize=(10, 9), dpi=100)
                g.plot(
                    column=metric,
                    cmap="coolwarm",
                    linewidth=0.3,
                    edgecolor="white",
                    legend=True,
                    ax=ax,
                    missing_kwds={"color": "#f0f0f0", "label": "No data"},
                )
                CTPRVN.plot(ax=ax, color="none", edgecolor="black", linewidth=1)
                ax.set_title(f"[{net}] {w} — {metric} ({year_label})", fontsize=14)
                add_north_arrow(ax, x=0.92, y=0.92, arrow_file=ARROW_FILE, zoom=0.2)
                add_scale_bar(ax, length=100000, location=(0.1, 0.02))
                ax.axis("off")
                plt.savefig(raw_png, bbox_inches="tight")
                #plt.show()
                plt.close()

 
                # ---- Percentile map (fixed at 0–100) ----
                if os.path.exists(quant_png):
                    print(f"[OK] {quant_png} already exists")
                    continue

                # Compute percentiles while preserving NaN
                pct = g[metric].rank(pct=True) * 100
                fig, ax = plt.subplots(figsize=(10, 9), dpi=100)
                g.assign(_pct=pct).plot(
                    column="_pct",
                    cmap="coolwarm",
                    linewidth=0.3,
                    edgecolor="white",
                    legend=True,
                    legend_kwds={"label": f"{metric} percentile (%)", "orientation": "vertical"},
                    ax=ax,
                    missing_kwds={"color": "#f0f0f0", "label": "No data"},
                    vmin=0, vmax=100
                )
                CTPRVN.plot(ax=ax, color="none", edgecolor="black", linewidth=1)
                ax.set_title(f"[{net}] {w} — {metric} ({year_label})", fontsize=14)
                add_north_arrow(ax, x=0.92, y=0.92, arrow_file=ARROW_FILE, zoom=0.2)
                add_scale_bar(ax, length=100000, location=(0.1, 0.02))
                ax.axis("off")
                plt.savefig(quant_png, bbox_inches="tight")
                #plt.show()
                plt.close()

                print(f"[INFO] {net}/{w}/{year_label} - {metric} map saved")


[OK] outputs/figures/centrality/centralities_origin/all/map_pagerank_2016.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_eigenvector_2016.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_in_strength_2016.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_out_strength_2016.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_degree_strength_2016.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_pagerank_2017.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_eigenvector_2017.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_in_strength_2017.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_out_strength_2017.png already exists
[OK] outputs/figures/centrality/centralities_origin/all/map_degree_strength_2017.png already exists
[OK] outputs/figures/centrality/cent