In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_olivetti_faces
from sklearn.metrics import silhouette_score
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Load the Olivetti faces dataset. With return_X_y=True the loader returns a
# (data, target) tuple, unpacked here as X and y.
# Each row of X is treated as a flattened 64x64 image later in this notebook,
# and y is used as the stratification label for the train/val/test split.
X, y = fetch_olivetti_faces(return_X_y=True)

In [None]:
# Sanity check: render one row of X as a 64x64 grayscale image.
# An explicit Axes is used (instead of the implicit pyplot state) so the figure
# can be titled, and plt.show() keeps the AxesImage repr out of the cell output.
fig, ax = plt.subplots()
ax.imshow(X[1].reshape(64, 64), cmap='binary_r')
ax.set_title("Sample face (row 1 of X)")
ax.axis("off")
plt.show()

In [None]:
# Stratified hold-out: 20% of the data becomes the test set, then 20% of the
# remaining rows become the validation set. Both splits are stratified on y.
# A fixed random_state makes the partition identical after a kernel restart,
# so every downstream score and figure is reproducible.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# First split: these indices refer to rows of the full X / y.
train_idxs, test_idxs = next(splitter.split(X, y))

X_test, y_test = X[test_idxs, :], y[test_idxs]
X_train, y_train = X[train_idxs, :], y[train_idxs]

# Second split: from here on train_idxs / val_idxs refer to rows of the
# reduced X_train / y_train created just above, NOT to rows of the full X.
train_idxs, val_idxs = next(splitter.split(X_train, y_train))

# Take the validation rows first; X_train is overwritten on the next line.
X_val, y_val = X_train[val_idxs, :], y_train[val_idxs]
X_train, y_train = X_train[train_idxs, :], y_train[train_idxs]

In [None]:
# Fit one K-Means model per candidate cluster count and record its silhouette
# score on the training data. Both dicts are keyed by the cluster count k.
models = {}
scores = {}
for k in range(2, 30):
    # K-Means initialisation is random; seeding it keeps the fitted clusters,
    # and therefore the scores and the selected k, stable between runs.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train)

    models[k] = kmeans
    # labels_ holds the cluster index assigned to each row of X_train.
    scores[k] = silhouette_score(X_train, kmeans.labels_)

In [None]:
# Silhouette score as a function of the number of clusters.
# The dict views are materialised as lists so the plot does not depend on
# matplotlib's handling of dict_keys / dict_values objects.
fig, ax = plt.subplots()
ax.plot(list(scores.keys()), list(scores.values()), marker="o")
ax.set_xlabel("Number of clusters k")
ax.set_ylabel("Silhouette score (training set)")
ax.set_title("K-Means silhouette score vs. k")
plt.show()

In [None]:
# Silhouette scores range from -1 (worst) to +1 (best), so the best cluster
# count is the one with the HIGHEST score. Ties resolve to the smallest k,
# because scores was filled in increasing-k order and max() keeps the first hit.
k_best = max(scores, key=scores.get)
model_best = models[k_best]

In [None]:
# Show up to n_samples_per_cluster training images from each cluster of the
# selected model: one row of the grid per cluster.
n_samples_per_cluster = 3
fig, axs = plt.subplots(
    nrows=k_best,
    ncols=n_samples_per_cluster,
    figsize=(4 * n_samples_per_cluster, 4 * k_best),
    squeeze=False,  # always a 2-D array of Axes, whatever the grid shape
)

for k in range(model_best.n_clusters):
    # Select rows by the model's cluster assignment. labels_ is aligned with
    # X_train (the data the model was fitted on), so the mask must be applied
    # to X_train rather than to the full X or to the ground-truth labels y.
    X_in_cluster = X_train[model_best.labels_ == k]
    X_in_cluster_sample = X_in_cluster[:n_samples_per_cluster]

    for i, ax in enumerate(axs[k]):
        # Hide ticks/frames everywhere so unused cells stay blank when a
        # cluster has fewer than n_samples_per_cluster members.
        ax.axis("off")
        if i < X_in_cluster_sample.shape[0]:
            ax.imshow(X_in_cluster_sample[i].reshape(64, 64), cmap='binary_r')
            ax.set_title(f"cluster {k}")

plt.show()