In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

In [4]:
def compare_algorithms(X, max_clusters):
  results = []
  cluster_range = range(2, max_clusters + 1)

  for n_clusters in cluster_range:
    # KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
    kmeans_clusters = kmeans.fit_predict(X)
    kmeans_silhouette_avg = silhouette_score(X, kmeans_clusters)

    results.append(("KMeans", n_clusters, kmeans_silhouette_avg))

    # AgglomerativeClustering
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
    agglomerative_clusters = agglomerative.fit_predict(X)
    agglomerative_silhouette_avg = silhouette_score(X, agglomerative_clusters)

    results.append(("Agglomerative", n_clusters, agglomerative_silhouette_avg))

  # DBSCAN
  eps_range = np.arange(0.1, 0.9, 0.1)
  for eps in eps_range:
    dbscan = DBSCAN(eps=eps, min_samples=5)
    dbscan_clusters = dbscan.fit_predict(X)
    if len(set(dbscan_clusters)) > 1:
      dbscan_silhouette_avg = silhouette_score(X, dbscan_clusters)
      results.append(("DBSCAN", eps, dbscan_silhouette_avg))

  return results

In [5]:
iris = datasets.load_iris()
scaler = StandardScaler()
scaled_data = scaler.fit_transform(iris.data)

results = compare_algorithms(scaled_data, 10)
results

[('KMeans', 2, 0.5817500491982808),
 ('Agglomerative', 2, 0.5770346019475988),
 ('KMeans', 3, 0.4630420362927048),
 ('Agglomerative', 3, 0.4466890410285909),
 ('KMeans', 4, 0.41511334907493763),
 ('Agglomerative', 4, 0.4006363159855973),
 ('KMeans', 5, 0.39124662510551134),
 ('Agglomerative', 5, 0.33058726295230545),
 ('KMeans', 6, 0.3284778205371097),
 ('Agglomerative', 6, 0.31485480100512825),
 ('KMeans', 7, 0.3276942451516975),
 ('Agglomerative', 7, 0.316969830299128),
 ('KMeans', 8, 0.3355790471626717),
 ('Agglomerative', 8, 0.310946529007258),
 ('KMeans', 9, 0.35257457165711953),
 ('Agglomerative', 9, 0.31143422475471655),
 ('KMeans', 10, 0.36140458542683984),
 ('Agglomerative', 10, 0.3161120375980681),
 ('DBSCAN', 0.30000000000000004, -0.1941947686285083),
 ('DBSCAN', 0.4, 0.027670001694674724),
 ('DBSCAN', 0.5, 0.35651648142700726),
 ('DBSCAN', 0.6, 0.4027617471797909),
 ('DBSCAN', 0.7000000000000001, 0.523400848673573),
 ('DBSCAN', 0.8, 0.5216965052515835)]

In [8]:
df = pd.DataFrame(results, columns=["groupper", "n_clusters", "score"])
df

Unnamed: 0,groupper,n_clusters,score
0,KMeans,2.0,0.58175
1,Agglomerative,2.0,0.577035
2,KMeans,3.0,0.463042
3,Agglomerative,3.0,0.446689
4,KMeans,4.0,0.415113
5,Agglomerative,4.0,0.400636
6,KMeans,5.0,0.391247
7,Agglomerative,5.0,0.330587
8,KMeans,6.0,0.328478
9,Agglomerative,6.0,0.314855


In [9]:
max_score_index = df['score'].idxmax()
print(df.loc[max_score_index])

groupper       KMeans
n_clusters        2.0
score         0.58175
Name: 0, dtype: object
