In [1]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [2]:
# Newline-separated topic names, one per line; multi-word topics keep their
# internal spaces and the original capitalisation.
raw_topics = """space
languages
geography
easter
cryptocurrency
chemistry
Internet
social networks
ancient Rome
ships
electricity
ancient world
geology
religion
travel
memes
education
vaccinations
computers
flu
google
browsers
games
easter eggs
search engines
English
ancient Greece
health
smartphones
psychology
psychotherapy"""

# One list entry per line of the block above.
topics = raw_topics.split('\n')

topics

['space',
 'languages',
 'geography',
 'easter',
 'cryptocurrency',
 'chemistry',
 'Internet',
 'social networks',
 'ancient Rome',
 'ships',
 'electricity',
 'ancient world',
 'geology',
 'religion',
 'travel',
 'memes',
 'education',
 'vaccinations',
 'computers',
 'flu',
 'google',
 'browsers',
 'games',
 'easter eggs',
 'search engines',
 'English',
 'ancient Greece',
 'health',
 'smartphones',
 'psychology',
 'psychotherapy']

In [5]:
# Load a previously saved gensim Word2Vec model. The path is relative, so the
# notebook has to be run from the directory that contains `model/`.
word2vec = Word2Vec.load('model/word2vec.bin')

In [6]:
# Represent every topic by the element-wise mean of its words' vectors, so a
# multi-word topic such as "social networks" still maps to a single vector.
per_topic_vectors = []
for topic in topics:
    # Words are lower-cased before lookup: the word2vec model is apparently caseless.
    token_vectors = [word2vec.wv[token.lower()] for token in topic.split(' ')]
    per_topic_vectors.append(np.mean(token_vectors, axis=0))

# Stack into one array with one row per topic, in the same order as `topics`.
topic_embeddings = np.array(per_topic_vectors)

In [7]:
# Scratch check of the flattening idiom: sum(list_of_lists, []) concatenates
# the inner lists, so two one-element lists of array pairs become one flat
# list of two pairs (the arrays inside the pairs may have different lengths).
sum([[(np.zeros(2), np.ones(2))], [(np.ones(3), np.zeros(3))]], [])

[(array([0., 0.]), array([1., 1.])),
 (array([1., 1., 1.]), array([0., 0., 0.]))]

In [8]:
def dunn_index_max(x, labels, cluster_centers, min_diameter=1e-6):
    """Score a clustering with a centroid-based variant of the Dunn index.

    The score is the smallest Euclidean distance between any two cluster
    centers divided by the largest Euclidean distance between any two points
    that were assigned the same label (the largest cluster diameter).
    Well-separated, compact clusters therefore get a higher score.

    Parameters
    ----------
    x : array-like
        Samples, one per entry along the first axis.
    labels : array-like
        Cluster label of each sample, aligned with the first axis of ``x``.
    cluster_centers : array-like
        Cluster centers, one per entry along the first axis.
    min_diameter : float, default 1e-6
        Lower bound used for the largest diameter. It keeps the denominator
        positive when every cluster holds fewer than two samples (or only
        coincident samples).

    Returns
    -------
    float
        ``min center-to-center distance / max within-cluster distance``.

    Raises
    ------
    ValueError
        If fewer than two cluster centers are given, because no
        center-to-center distance exists in that case.
    """
    # Accept plain lists as well as arrays; boolean-mask indexing below needs ndarrays.
    x = np.asarray(x)
    labels = np.asarray(labels)
    cluster_centers = np.asarray(cluster_centers)

    if len(cluster_centers) < 2:
        raise ValueError(
            "dunn_index_max needs at least two cluster centers, got %d"
            % len(cluster_centers)
        )

    max_distance_within_cluster = min_diameter
    for label in np.unique(labels):
        x_label = x[labels == label]
        # A cluster with a single sample has no pairwise distance.
        if len(x_label) < 2:
            continue
        # combinations() yields every unordered pair (i < j) exactly once.
        distances = [np.linalg.norm(a - b) for a, b in combinations(x_label, 2)]
        max_distance_label = np.max(distances)
        if max_distance_label > max_distance_within_cluster:
            max_distance_within_cluster = max_distance_label

    cluster_distances = [
        np.linalg.norm(a - b) for a, b in combinations(cluster_centers, 2)
    ]
    min_distance_centers = np.min(cluster_distances)

    return min_distance_centers / max_distance_within_cluster
    

In [10]:
# Model selection: fit k-means for every candidate cluster count and keep the
# centers of the fit with the highest Dunn-style score.
best_centers = None
# Start below every possible score so that the first fit is always accepted,
# even if its score is exactly 0.
best_score = float('-inf')
for n_clusters in range(5, 11):  # candidate cluster counts 5..10
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases (10 in older versions, 'auto' in newer ones), which would make
    # the selected clustering depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(topic_embeddings)
    dunn_index = dunn_index_max(topic_embeddings, kmeans.labels_, kmeans.cluster_centers_)
    if dunn_index > best_score:
        best_score = dunn_index
        best_centers = kmeans.cluster_centers_

# Name each winning cluster after the topic whose embedding lies closest
# (Euclidean distance) to the cluster center.
for i, center in enumerate(best_centers):
    nearest_neighbour_idx = np.argmin(np.linalg.norm(topic_embeddings - center, axis=1))
    print(i + 1, topics[nearest_neighbour_idx])

1 games
2 memes
3 ancient world
4 cryptocurrency
5 vaccinations
6 Internet
