We use the Faiss (Facebook AI Similarity Search) library to accelerate K-means while searching for the best number of clusters, which we choose with the elbow method and silhouette analysis.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import faiss

In [None]:
# Read the training and test tables from the uploaded CSV files.
train_path, test_path = 'merged_train.csv', 'test_data.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# Remove the 'Horodate' column from both tables.
# NOTE: its values are not kept anywhere; save train['Horodate'] before this
# step if it is needed later (e.g. as the index of the submission file).
train = train.drop('Horodate', axis='columns')
test = test.drop('Horodate', axis='columns')

In [None]:
# Cluster consumers rather than timestamps: after transposing, every row of
# `df` is one original column of `train` and every column is one original row.
df = train.transpose()

# StandardScaler rescales each column of the transposed matrix to zero mean
# and unit variance; the data is cast to float32 first.
scaler = StandardScaler()
X = scaler.fit_transform(df.to_numpy().astype(np.float32))

In [None]:
def compute_elbow(X, k_range):
    """Compute the K-means inertia for every k in ``k_range`` (Elbow Method).

    FAISS GPU K-means is tried first; if it fails for any reason (no GPU
    build, k larger than the number of samples, ...) the function falls back
    to scikit-learn's CPU ``KMeans`` for that value of k.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        k_range: Iterable of cluster counts to evaluate.

    Returns:
        A list with one inertia value (sum of squared distances of each
        sample to its nearest centroid) per entry of ``k_range``.

    Raises:
        Whatever scikit-learn's ``KMeans`` raises if the CPU fallback also
        fails (e.g. ``ValueError`` when k exceeds the number of samples).
    """
    import warnings

    # FAISS only accepts C-contiguous float32 matrices; convert once up front
    # instead of relying on the caller (no copy is made if X already complies).
    X32 = np.ascontiguousarray(X, dtype=np.float32)

    inertia_values = []

    for k in k_range:
        try:
            # FAISS GPU-based KMeans
            kmeans = faiss.Kmeans(d=X32.shape[1], k=k, gpu=True)
            kmeans.train(X32)
            # The index returns squared L2 distances to the nearest centroid,
            # so their sum is the inertia. Accumulate in float64 to avoid
            # float32 round-off on large inputs.
            D, _ = kmeans.index.search(X32, 1)
            inertia = float(np.sum(D, dtype=np.float64))
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # propagate; surface the reason instead of hiding it.
            warnings.warn(
                f"FAISS GPU k-means failed ({exc!r}); "
                "falling back to scikit-learn KMeans.",
                RuntimeWarning,
                stacklevel=2,
            )
            # Fallback to scikit-learn CPU KMeans
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(X)
            inertia = kmeans.inertia_

        inertia_values.append(inertia)

    return inertia_values

def compute_silhouette(X, k_range):
    """Compute the silhouette score of a K-means clustering for each k.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        k_range: Iterable of cluster counts to evaluate.

    Returns:
        A list with one entry per element of ``k_range``:
        ``-1`` for ``k == 1`` (the silhouette is undefined for one cluster),
        the silhouette score when clustering and scoring succeed, and
        ``nan`` when either step raises ``ValueError`` (for example an
        invalid k, or a labelling with fewer than 2 distinct clusters).
    """
    import warnings

    silhouette_scores = []

    for k in k_range:
        if k == 1:
            silhouette_scores.append(-1)  # Undefined for k=1
            continue

        try:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(X)
            # silhouette_score itself raises ValueError unless the number of
            # distinct labels is between 2 and n_samples - 1, so it must sit
            # inside the try as well.
            score = silhouette_score(X, labels)
        except ValueError as exc:
            # The previous fallback (all-zero labels) always made
            # silhouette_score raise; record NaN so the sweep can continue
            # and the plot simply shows a gap at this k.
            warnings.warn(
                f"Silhouette score unavailable for k={k}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
            score = float("nan")

        silhouette_scores.append(score)

    return silhouette_scores

In [None]:
# Candidate cluster counts: range(2, 50) covers k = 2 .. 49 (example choice).
k_values = range(2, 50)

# Run both diagnostics over the same set of k values.
inertia_values = compute_elbow(X, k_values)
silhouette_scores = compute_silhouette(X, k_values)

# One figure, two side-by-side panels: elbow curve (left), silhouette (right).
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax[0], inertia_values,
     'Elbow Method for Electricity Consumers', 'Inertia'),
    (ax[1], silhouette_scores,
     'Silhouette Analysis for Electricity Consumers', 'Silhouette Score'),
)

# Both panels share the same x-axis and line style; only data and labels differ.
for axis, series, title, y_label in panels:
    axis.plot(k_values, series, marker='o', linestyle='-')
    axis.set_title(title)
    axis.set_xlabel('Number of Clusters (k)')
    axis.set_ylabel(y_label)
    axis.grid()

plt.show()