# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# K-Means clustering of rows by electricity ('TOTAL KWH') and gas
# ('TOTAL THERMS') usage.
# Pipeline: load CSV -> median-impute -> standardize -> elbow plot on a random
# sample -> fit the final model on every row -> scatter plot of the clusters.

# Load the dataset
file_path = 'Energy_Usage_2010_20240405.csv'  # Update this with your actual file path
data = pd.read_csv(file_path)

# Selecting relevant features for clustering
features = data[['TOTAL KWH', 'TOTAL THERMS']].copy()

# Handling missing values by replacing them with the median of the respective column
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Scaling the features so both columns contribute comparably to the
# Euclidean distances K-Means minimizes
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Use a sample of the dataset for the Elbow Method to speed up the process.
# The sample size is capped at the number of rows: sampling without
# replacement raises ValueError when more items are requested than exist.
# A seeded generator keeps the elbow curve reproducible between runs.
n_rows = features_scaled.shape[0]
sample_size = min(5000, n_rows)
rng = np.random.default_rng(42)
sample_indices = rng.choice(n_rows, size=sample_size, replace=False)
features_sample = features_scaled[sample_indices]

# Determining the optimal number of clusters using the Elbow Method.
# k can never exceed the number of sampled rows, so the upper bound is capped.
inertia = []
range_clusters = range(1, min(10, sample_size) + 1)  # Adjust based on your specific needs
for k in range_clusters:
    # n_init is pinned explicitly because the library default changed
    # between scikit-learn releases.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(features_sample)
    inertia.append(kmeans.inertia_)

# Plotting the Elbow Curve
plt.figure(figsize=(10, 6))
plt.plot(range_clusters, inertia, marker='o')
plt.title('Elbow Method For Optimal k (Sample)')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.xticks(range_clusters)
plt.grid(True)
plt.show()

# Assuming you've determined an optimal k (e.g., k=3), apply K-Means to the entire dataset
optimal_k = 3  # Update this based on the Elbow curve result
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_scaled)

# Add cluster labels to the original data (row order is unchanged by the
# imputer and scaler, so labels align positionally with `data`)
data['Cluster'] = clusters

# Analyzing clusters (optional, example).
# NOTE: the plot uses the raw columns of `data`, whereas the clustering used
# the imputed and scaled values.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='TOTAL KWH', y='TOTAL THERMS', hue='Cluster', palette='viridis')
plt.title('Clusters of Energy Usage')
plt.show()
