In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import hdbscan

# -----------------------------
# Reload master dataset
# -----------------------------
# Start from the CSV on disk rather than from any in-memory state.
master_df = pd.read_csv("master_pds_dataset.csv")

# -----------------------------
# Shop Level Aggregation
# -----------------------------
# Collapse the master table to one row per shop. The grouping key is the
# full set of identifying columns, so they all survive into the result.
shop_key_columns = ["distCode", "shopNo", "distName", "latitude", "longitude"]

# Behavioural metrics that are averaged over all rows of each shop.
shop_metric_columns = [
    "utilization_ratio",
    "portability_ratio",
    "rice_wheat_ratio",
    "yearly_transaction_volatility",
]

# NOTE(review): groupby() discards rows whose key contains NaN by default
# (dropna=True), so shops without coordinates would not appear in shop_df.
# Confirm that this is intended.
shops_grouped = master_df.groupby(shop_key_columns)
shop_df = shops_grouped[shop_metric_columns].mean().reset_index()

# -----------------------------
# Feature Engineering
# -----------------------------
# log1p-transformed copies of two of the averaged metrics
# (new column -> source column).
log_feature_sources = {
    "log_volatility": "yearly_transaction_volatility",
    "log_rice_wheat": "rice_wheat_ratio",
}
for log_column, source_column in log_feature_sources.items():
    shop_df[log_column] = np.log1p(shop_df[source_column])

# Columns fed to the clustering models below.
features = [
    "utilization_ratio",
    "portability_ratio",
    "log_volatility",
    "log_rice_wheat"
]

# Build the model matrix on a separate frame so shop_df keeps its raw values:
# infinities become missing, then every gap is filled with the column median
# (the median is computed after the infinities have been masked).
X = shop_df[features].replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

# Standardise each feature column with StandardScaler.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# -----------------------------
# KMeans
# -----------------------------
# Four-cluster segmentation of the scaled shop features; the seed is fixed
# and 20 initialisations are tried.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=20)
kmeans.fit(X_scaled)
shop_df["cluster"] = kmeans.labels_

# Human-readable persona for each cluster id (position in the tuple = id).
# NOTE(review): KMeans cluster ids carry no inherent meaning, so this mapping
# is presumably only valid for the data/seed it was derived from. Re-check it
# against the cluster centres whenever the input data or library version changes.
persona_names = (
    "Stable Rural Mainstream",
    "Urban Mobility-Driven",
    "Low-Variability Controlled",
    "High-Portability Transit Hubs",
)
persona_map = dict(enumerate(persona_names))

shop_df["persona"] = shop_df["cluster"].map(persona_map)

# -----------------------------
# HDBSCAN
# -----------------------------
# Density-based clustering on the same scaled features, with a minimum
# cluster size of 100.
clusterer = hdbscan.HDBSCAN(min_cluster_size=100)
clusterer.fit(X_scaled)
labels_hdb = clusterer.labels_
shop_df["hdb_label"] = labels_hdb

# Simple HDB persona mapping: label -1 is reported as noise/anomaly, every
# other label is folded into a single "core" bucket.
is_hdb_noise = shop_df["hdb_label"].eq(-1)
shop_df["hdb_persona"] = np.where(is_hdb_noise, "Noise / Anomaly", "Core Cluster")

# -----------------------------
# Behavioral Intensity Index
# -----------------------------
# Weighted blend of three per-shop signals:
#   0.4 * portability_ratio
# + 0.3 * utilization_ratio
# + 0.3 * log_volatility rescaled by its maximum over all shops
#
# shop_df["log_volatility"] is the raw log1p output, so it may hold +/-inf or
# NaN. Infinities are masked before the maximum is taken; otherwise a single
# infinite value would zero out the volatility term for every other shop.
shop_volatility = shop_df["log_volatility"].replace([np.inf, -np.inf], np.nan)
volatility_peak = shop_volatility.max()

if pd.notna(volatility_peak) and volatility_peak > 0:
    volatility_share = shop_volatility / volatility_peak
else:
    # No usable positive maximum (e.g. all zeros or all missing): avoid a 0/0
    # division. Defined values contribute 0, missing values stay missing.
    volatility_share = shop_volatility * 0.0

shop_df["behavioral_intensity_index"] = (
    0.4 * shop_df["portability_ratio"] +
    0.3 * shop_df["utilization_ratio"] +
    0.3 * volatility_share
)

# -----------------------------
# Keep Only Needed Columns
# -----------------------------
# Columns exported for the dashboard: shop identity and location, the four
# model features, both persona labels and the intensity index. The raw
# cluster ids ("cluster", "hdb_label") are intentionally not selected.
final_dashboard_df = shop_df[[
    "distCode",
    "shopNo",
    "distName",
    "latitude",
    "longitude",
    "utilization_ratio",
    "portability_ratio",
    "log_volatility",
    "log_rice_wheat",
    "persona",
    "hdb_persona",
    "behavioral_intensity_index"
]].copy()

# Reduce memory footprint
for col in final_dashboard_df.select_dtypes(include=["float64"]).columns:
    final_dashboard_df[col] = final_dashboard_df[col].astype("float32")

# Only narrow an integer column when every value fits in int32. A plain
# astype("int32") wraps out-of-range values silently, which would corrupt
# identifier columns such as shop or district codes.
int32_limits = np.iinfo(np.int32)
for col in final_dashboard_df.select_dtypes(include=["int64"]).columns:
    column_values = final_dashboard_df[col]
    if column_values.between(int32_limits.min, int32_limits.max).all():
        final_dashboard_df[col] = column_values.astype("int32")

# Save smaller dataset
final_dashboard_df.to_csv("dashboard_dataset.csv", index=False)

print("Saved dashboard_dataset.csv")
print("Shape:", final_dashboard_df.shape)

Saved dashboard_dataset.csv
Shape: (17164, 12)
