In [None]:
# ===============================
# Notebook 04 — Clustering & Visualization
# Project Antyodaya (UIDAI)
# ===============================

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Lift pandas' column-display limit (None = no limit) so wide frames are
# shown with all of their columns instead of being truncated.
pd.set_option("display.max_columns", None)


In [None]:
# Load the processed RaR financial-impact table that the rest of the
# notebook clusters and plots.
input_file = "../data/processed/rar_financial_impact.csv"
df = pd.read_csv(filepath_or_buffer=input_file)

# Quick sanity check: dimensions first, then the first five rows.
frame_shape = df.shape
print("Shape:", frame_shape)
print(df.head(n=5))


In [None]:
# Columns fed to the clustering step; missing values are replaced with 0.
feature_columns = ["RaR_score", "blocked_value_cr", "pincode_count"]
features = df[feature_columns].fillna(0)


In [None]:
# Standardise the feature matrix: learn the scaling from `features`, then
# apply it to the same data. `scaler` is kept so the fitted parameters
# remain available afterwards.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)


In [None]:
# DBSCAN parameters tuned for district-level data.
dbscan = DBSCAN(eps=0.8, min_samples=5)

# Fit on the standardised features and store each row's cluster ID.
# Per the scikit-learn docs, DBSCAN assigns -1 to points it treats as noise.
dbscan.fit(features_scaled)
df["cluster"] = dbscan.labels_

# Number of rows per cluster ID.
cluster_sizes = df["cluster"].value_counts()
print("Cluster distribution:")
print(cluster_sizes)


In [None]:
def label_cluster(x):
    """Translate a cluster ID into a human-readable zone name.

    Parameters
    ----------
    x : cluster ID, compared by equality against -1, 0 and 1.

    Returns
    -------
    str
        "Administrative Desert" for -1, "High Impact Zone" for 0,
        "Medium Impact Zone" for 1, and "Low Impact Zone" for any
        other value.

    NOTE(review): the ID -> name mapping is fixed. Nothing in this
    function checks that cluster 0 / 1 actually correspond to high /
    medium impact in the data -- confirm against the clustering output.
    """
    named_clusters = (
        (-1, "Administrative Desert"),
        (0, "High Impact Zone"),
        (1, "Medium Impact Zone"),
    )
    # Checked in order; the first equal ID wins.
    for cluster_id, zone_name in named_clusters:
        if x == cluster_id:
            return zone_name
    # Every remaining ID falls into the catch-all zone.
    return "Low Impact Zone"

# Attach the readable zone name for every row's cluster ID.
zone_names = df["cluster"].apply(label_cluster)
df["cluster_label"] = zone_names

# Cell output: how many rows ended up in each zone.
df["cluster_label"].value_counts()


In [None]:
# Persist the frame (now including the cluster columns) without the index.
output_file = "../data/processed/rar_final_dashboard.csv"
df.to_csv(path_or_buf=output_file, index=False)

print("Saved Power BI-ready dataset to:", output_file)

# List the folder contents so the new file can be confirmed by eye.
processed_files = os.listdir("../data/processed")
print("Files now in processed folder:", processed_files)


In [None]:
# Unclustered view: RaR score against blocked benefit, one point per row.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df["RaR_score"], df["blocked_value_cr"], alpha=0.6)
ax.set_xlabel("RaR Score (Risk)")
ax.set_ylabel("Blocked Benefit (₹ Crores)")
ax.set_title("District Risk vs Financial Impact")
ax.grid(True)
plt.show()


In [None]:
# Same scatter as the unclustered view, with each point coloured by its
# DBSCAN cluster ID.
plt.figure(figsize=(10,6))

# Keep a handle on the scatter so the colorbar is tied to it explicitly
# rather than to whatever mappable happens to be current.
cluster_scatter = plt.scatter(
    df["RaR_score"],
    df["blocked_value_cr"],
    c=df["cluster"],
    cmap="tab10",
    alpha=0.7
)

plt.xlabel("RaR Score")
plt.ylabel("Blocked Benefit (₹ Crores)")
plt.title("DBSCAN Clustering of Districts")

# Cluster IDs are integers: pin the ticks to the IDs actually present so the
# colorbar does not advertise fractional IDs such as 0.5.
cluster_ids = sorted(df["cluster"].unique())
plt.colorbar(cluster_scatter, label="Cluster ID", ticks=cluster_ids)
plt.show()
