# 02 – Clustering Cities by EV Density

Objective: Cluster cities on a log-transformed EV density proxy, compare KMeans and DBSCAN candidates, and keep the clustering with the highest silhouette score.

Inputs: `data/processed/city_ev_agg.csv`
Outputs: `data/processed/city_clustering.csv` (City, EV_Count_Total, Unique_Zips, EV_Density_Proxy, Cluster, Silhouette, Method)


In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from pathlib import Path


In [3]:
# Work out the project root from the current working directory.
# When the kernel was started inside notebooks/, the root is one level up;
# in every other case the working directory itself is treated as the root.
CWD = Path.cwd()
BASE_DIR = CWD.parent if CWD.name == 'notebooks' else CWD

# Input produced upstream and the file this notebook writes.
IN_PATH = BASE_DIR / 'data/processed/city_ev_agg.csv'
OUT_PATH = BASE_DIR / 'data/processed/city_clustering.csv'

# Fail fast with the resolved path in the message if the input is missing.
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}")

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"Current directory: {CWD}")
print(f"Base directory: {BASE_DIR}")
print(f"Input path: {IN_PATH}")
print(f"Output path: {OUT_PATH}")
print(f"Input exists: {IN_PATH.exists()}")
print(f"Output directory exists: {OUT_PATH.parent.exists()}")


Current directory: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\notebooks
Base directory: e:\FDM\PROJECT\NEW\Smart-Charge-Locator
Input path: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\data\processed\city_ev_agg.csv
Output path: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\data\processed\city_clustering.csv
Input exists: True
Output directory exists: True


In [4]:
# Read the per-city aggregates and keep only rows that have a density value,
# since the density proxy is the sole clustering feature.
feature_cols = ['EV_Density_Proxy']
city = pd.read_csv(IN_PATH).dropna(subset=feature_cols)

# Feature matrix: log(1 + x) of the density proxy, kept 2-D (n_rows, 1)
# by selecting with a list of columns rather than a single label.
X = np.log1p(city[feature_cols].values)

print(f"Data shape: {city.shape}")
print(f"Features shape: {X.shape}")
print("Sample data:")
city.head()


Data shape: (821, 4)
Features shape: (821, 1)
Sample data:


Unnamed: 0,City,EV_Count_Total,Unique_Zips,EV_Density_Proxy
0,Aberdeen,236,1,236.0
1,Aberdeen Proving Ground,1,1,1.0
2,Acme,11,1,11.0
3,Addy,6,1,6.0
4,Airway Heights,42,1,42.0


In [5]:
# Running record of the best candidate seen so far. The score starts at -1.0,
# so only a candidate scoring strictly higher replaces it.
best = dict(method=None, labels=None, score=-1.0)

print("Testing KMeans clustering...")
# Sweep k = 3, 4, 5, 6 with a fixed seed so the run is repeatable.
for k in (3, 4, 5, 6):
    try:
        km = KMeans(n_clusters=k, n_init='auto', random_state=42)
        labels = km.fit_predict(X)
        # Only score partitions with more than one distinct label;
        # otherwise fall back to -1.
        if len(np.unique(labels)) > 1:
            score = silhouette_score(X, labels)
        else:
            score = -1
        print(f"  KMeans k={k}: Silhouette = {score:.3f}")
        # Strict comparison: on a tie the earlier candidate is kept.
        if score > best['score']:
            best = {'method': f'KMeans(k={k})', 'labels': labels, 'score': score}
    except Exception as e:
        # Best-effort sweep: report the failure and move on to the next k.
        print(f'  KMeans k={k} failed:', e)


Testing KMeans clustering...
  KMeans k=3: Silhouette = 0.721
  KMeans k=4: Silhouette = 0.690
  KMeans k=5: Silhouette = 0.671
  KMeans k=6: Silhouette = 0.717


In [6]:
print("Testing DBSCAN clustering...")
# Sweep a small grid of neighbourhood radii and compete against the current `best`.
for eps in (0.1, 0.2, 0.3):
    try:
        db = DBSCAN(eps=eps, min_samples=3)
        labels = db.fit_predict(X)
        # Require more than one distinct label AND a maximum label id of at least 1.
        # Presumably this means two real clusters (ids 0 and 1) exist, assuming
        # DBSCAN numbers clusters consecutively from 0 - TODO confirm.
        usable = len(np.unique(labels)) > 1 and np.max(labels) >= 1
        if not usable:
            print(f"  DBSCAN eps={eps}: No valid clusters found")
            continue
        # NOTE(review): labels are passed to silhouette_score unfiltered, so any
        # -1 (noise) points are scored as if they formed one more cluster -
        # confirm this is intended.
        score = silhouette_score(X, labels)
        print(f"  DBSCAN eps={eps}: Silhouette = {score:.3f}")
        # Strict comparison: a DBSCAN result must beat, not tie, the incumbent.
        if score > best['score']:
            best = {'method': f'DBSCAN(eps={eps})', 'labels': labels, 'score': score}
    except Exception as e:
        # Best-effort sweep: report the failure and move on to the next eps.
        print(f'  DBSCAN eps={eps} failed:', e)


Testing DBSCAN clustering...
  DBSCAN eps=0.1: Silhouette = 0.442
  DBSCAN eps=0.2: No valid clusters found
  DBSCAN eps=0.3: No valid clusters found


In [7]:
# Attach the winning clustering to the city table.
# If no candidate produced labels, every city falls back to cluster 0.
cluster_labels = 0 if best['labels'] is None else best['labels']
city['Cluster'] = cluster_labels
# Score and method are single values describing the whole clustering,
# so they are broadcast to every row.
city['Silhouette'] = best['score']
city['Method'] = best['method']

print(f"Best clustering method: {best['method']}")
print(f"Best silhouette score: {best['score']:.3f}")
print("Cluster distribution:")
cluster_sizes = city['Cluster'].value_counts().sort_index()
print(cluster_sizes)


Best clustering method: KMeans(k=3)
Best silhouette score: 0.721
Cluster distribution:
Cluster
0    498
1    147
2    176
Name: count, dtype: int64


In [8]:
# Persist the clustered table without the DataFrame index column.
city.to_csv(OUT_PATH, index=False)

print(f"Results saved to: {OUT_PATH}")
print(f"Final data shape: {city.shape}")
print("Sample clustered data:")
# Preview the first ten rows, limited to the columns that matter for a quick check.
preview_cols = ['City', 'EV_Density_Proxy', 'Cluster']
city[preview_cols].head(10)


Results saved to: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\data\processed\city_clustering.csv
Final data shape: (821, 7)
Sample clustered data:


Unnamed: 0,City,EV_Density_Proxy,Cluster
0,Aberdeen,236.0,1
1,Aberdeen Proving Ground,1.0,0
2,Acme,11.0,2
3,Addy,6.0,0
4,Airway Heights,42.0,2
5,Alameda,3.0,0
6,Alderdale,1.0,0
7,Alexandria,1.5,0
8,Algona,48.0,2
9,Aliso Viejo,1.0,0
