In [23]:
# Imports
import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform
import sys
sys.path.append("..")

from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score

from utils import get_model_checkpoint_filepaths

In [2]:
# Locate the checkpoint files of the enwiki SGNS word2vec training run and
# load the embedding matrix stored in the last listed weights file.
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    model_name="word2vec_sgns",
    dataset_name="enwiki",
    output_dir="../output/word2vec_training/03-Oct-2020_15-00-00",
)

# The last entry of the list is used -- presumably the most recent checkpoint
# (TODO confirm the ordering returned by get_model_checkpoint_filepaths).
last_embedding_weights_filepath = checkpoint_filepaths_dict[
    "intermediate_embedding_weight_filepaths"
][-1]

# The file is opened memory-mapped (read-only); astype() then returns a new
# float64 array rather than a view of the file.
last_embedding_weights = np.load(
    last_embedding_weights_filepath,
    mmap_mode="r",
).astype(np.float64)

In [16]:
def dunn_index(X, labels):
    """
    Computes the Dunn index of a clustering: the smallest distance between
    two points of different clusters divided by the largest distance between
    two points of the same cluster. Higher values indicate more compact and
    better separated clusters.

    Every distinct value in `labels` is treated as a cluster, including any
    label a clusterer uses to mark noise (e.g. -1).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points.
    labels : array-like of shape (n_samples,)
        Cluster label of each data point.

    Returns
    -------
    dunn : float
        Minimum inter-cluster distance divided by maximum cluster diameter.
        If every cluster has zero diameter the result is `inf` (or `nan` when
        the minimum inter-cluster distance is zero as well).

    Raises
    ------
    ValueError
        If `labels` contains fewer than two distinct values.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    # Slice the data once per cluster instead of re-masking for every pair
    clusters = [X[labels == lab] for lab in np.unique(labels)]
    if len(clusters) < 2:
        raise ValueError(
            f"Dunn index requires at least 2 clusters, got {len(clusters)}"
        )

    # pdist yields an empty array for a single-point cluster, which has no
    # maximum; such a cluster has diameter 0.
    max_diameter = max(
        pdist(points).max() if len(points) > 1 else 0.0 for points in clusters
    )

    # Closest pair of points over every unordered pair of distinct clusters
    min_separation = min(
        cdist(clusters[i], clusters[j]).min()
        for i in range(len(clusters))
        for j in range(i)
    )

    if max_diameter == 0:
        # Avoid a division-by-zero warning; keep the IEEE result explicit
        return np.inf if min_separation > 0 else np.nan
    return min_separation / max_diameter

def sd_validity_index(X, labels, alpha=1.0):
    """
    Computes the SD validity index of a clustering (Halkidi, Vazirgiannis &
    Batistakis, 2000): `alpha * Scat + Dis`. Per that paper, lower values
    indicate a better clustering.

    - Scat (average scattering): mean over clusters of the norm of the
      per-feature variance vector of the cluster, divided by the norm of the
      per-feature variance vector of all of `X`.
    - Dis (total separation): `(D_max / D_min) * sum_k 1 / sum_z ||v_k - v_z||`
      where `v_k` are the cluster means and `D_max` / `D_min` are the largest /
      smallest distances between two cluster means.

    Every distinct value in `labels` is treated as a cluster. If two cluster
    means coincide, `D_min` is zero and the result is not finite.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points.
    labels : array-like of shape (n_samples,)
        Cluster label of each data point.
    alpha : float, optional
        Weight of the scattering term (defaults to 1.0, i.e. `Scat + Dis`).

    Returns
    -------
    sd : float
        SD validity index.

    Raises
    ------
    ValueError
        If `labels` contains fewer than two distinct values.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    # Slice the data once per cluster; reused for variances and means
    clusters = [X[labels == lab] for lab in np.unique(labels)]
    if len(clusters) < 2:
        raise ValueError(
            f"SD validity index requires at least 2 clusters, got {len(clusters)}"
        )

    # Average scattering within clusters, relative to the whole data set
    cluster_variance_norms = [
        np.linalg.norm(np.var(points, axis=0)) for points in clusters
    ]
    scat = np.mean(cluster_variance_norms) / np.linalg.norm(np.var(X, axis=0))

    # Total separation between cluster means
    centers = np.array([np.mean(points, axis=0) for points in clusters])
    center_dists = pdist(centers)
    # Column sums of the square distance matrix: each center's summed
    # distance to all other centers
    summed_center_dists = np.sum(squareform(center_dists), axis=0)
    dis = (
        np.sum(1 / summed_center_dists)
        * np.max(center_dists)
        / np.min(center_dists)
    )
    return alpha * scat + dis

In [35]:
def _score_clustering(X, cluster_labels, cluster_metrics):
    """
    Evaluates every metric on a single clustering, printing each score.

    Returns a dict mapping metric name to the score returned by its function.
    """
    metric_scores = {}
    for metric_name, metric_func in cluster_metrics:
        metric_score = metric_func(X, cluster_labels)
        print(f"{metric_name}: {metric_score:.3f}")
        metric_scores[metric_name] = metric_score
    return metric_scores


def evaluate_cluster_methods(
    word_embeddings: np.ndarray,
    vocab_size: int,
    cluster_classes: list,
    cluster_metrics: list,
    cluster_numbers: list,
) -> dict:
    """
    Fits several clustering algorithms on the first `vocab_size` word
    embeddings and scores each resulting clustering with every metric.
    Progress and scores are printed as they are computed.

    Parameters
    ----------
    word_embeddings : np.ndarray
        Word embedding matrix; only the first `vocab_size` rows are clustered.
    vocab_size : int
        Number of leading rows of `word_embeddings` to cluster.
    cluster_classes : list
        List of (name, class) tuples. Every class except `HDBSCAN` is
        instantiated as `cls(n_clusters=k)` for each `k` in `cluster_numbers`;
        `HDBSCAN` is instantiated once, without arguments. Instances must
        provide `fit_predict(X)`.
    cluster_metrics : list
        List of (name, function) tuples; each function is called as
        `func(X, labels)` and must return a number.
    cluster_numbers : list
        Numbers of clusters to evaluate for the classes taking `n_clusters`.

    Returns
    -------
    clusterer_results : dict
        Maps each clusterer name to its results. For `HDBSCAN`:
        `{"labels": ..., "metrics": {metric_name: score}}`. For every other
        class: `{"cluster_numbers": {str(k): {"labels": ..., "metrics":
        {metric_name: score}}}}`.
    """
    X = word_embeddings[:vocab_size]
    clusterer_results = {}
    for cluster_name, cluster_cls in cluster_classes:
        print(f"--- Evaluating {cluster_name}... ---")

        if cluster_cls is HDBSCAN:
            # HDBSCAN takes no n_clusters argument, so it is fitted once
            cluster_labels = cluster_cls().fit_predict(X)
            clusterer_results[cluster_name] = {
                "labels": cluster_labels,
                "metrics": _score_clustering(X, cluster_labels, cluster_metrics),
            }
            continue

        results_per_k = {}
        # Iterate the argument, not a notebook-level global
        for k in cluster_numbers:
            print(f"- Fitting and predicting for k={k}...")
            cluster_labels = cluster_cls(n_clusters=k).fit_predict(X)
            results_per_k[str(k)] = {
                "labels": cluster_labels,
                "metrics": _score_clustering(X, cluster_labels, cluster_metrics),
            }
        clusterer_results[cluster_name] = {"cluster_numbers": results_per_k}

    return clusterer_results

In [36]:
# Candidate numbers of clusters: every integer from 2 up to and including
# max_cluster_num.
max_cluster_num = 20
ks = [*range(2, max_cluster_num + 1)]

# Internal validity metrics as (display name, function) pairs
cluster_metrics = [
    ("Average silhouette score", silhouette_score),
    ("Davies-Bouldin score", davies_bouldin_score),
    ("Dunn index", dunn_index),
    ("SD validity index", sd_validity_index),
]

# Clustering algorithms to compare as (display name, class) pairs
cluster_classes = [
    ("K-means clustering", KMeans),
    ("Spectral clustering", SpectralClustering),
    ("Agglomerative clustering", AgglomerativeClustering),
    ("HDBSCAN", HDBSCAN),
]

In [37]:
# Cluster the first 1000 embedding rows with every configured algorithm and
# score each clustering with every configured metric. Scores are printed by
# the function itself; the trailing semicolon keeps the notebook from echoing
# the call's return value.
evaluate_cluster_methods(
    cluster_classes=cluster_classes,
    cluster_metrics=cluster_metrics,
    cluster_numbers=ks,
    word_embeddings=last_embedding_weights,
    vocab_size=1000,
);

--- Evaluating K-means clustering... ---
- Fitting and predicting for k=2...
Average silhouette score: 0.075
Davies-Bouldin score: 3.951
Dunn index: 0.080
SD validity index: 2.172
- Fitting and predicting for k=3...
Average silhouette score: 0.064
Davies-Bouldin score: 3.307
Dunn index: 0.091
SD validity index: 2.054
- Fitting and predicting for k=4...
Average silhouette score: 0.019
Davies-Bouldin score: 3.158
Dunn index: 0.190
SD validity index: 1.881
- Fitting and predicting for k=5...
Average silhouette score: 0.028
Davies-Bouldin score: 3.282
Dunn index: 0.191
SD validity index: 1.923
- Fitting and predicting for k=6...
Average silhouette score: 0.036
Davies-Bouldin score: 3.165
Dunn index: 0.191
SD validity index: 1.964
- Fitting and predicting for k=7...
Average silhouette score: 0.033
Davies-Bouldin score: 3.128
Dunn index: 0.135
SD validity index: 2.156
- Fitting and predicting for k=8...
Average silhouette score: 0.032
Davies-Bouldin score: 3.091
Dunn index: 0.135
SD validity