# 1. Word Similarity Task

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr

In [5]:
def get_pearson_spearman(filename):
    # get dictionary of word:vector
    words_dict = {}
    with open(filename, "r") as f:
        for line in f:
            word = line.split()
            words_dict[word[0]] = word[1:] 
    # get similarities of every word pair in simlex
    simlex = []
    model_simlex = []
    with open("SimLex-999.txt", "r") as f:
        next(f)
        for line in f:
            line_list = line.split()
            word1 = line_list[0]
            word2 = line_list[1]
            if word1 in words_dict and  word2 in words_dict:
                simlex.append(float(line_list[3]))
                sim = cosine_similarity([words_dict[word1]], [words_dict[word2]])
                model_simlex.append(float(sim))
    # calculate pearson and spearman with simlex
    pearson_simlex = pearsonr(model_simlex, simlex)
    spearman_simlex = spearmanr(model_simlex, simlex)
    
    # get similarities of wordpairs in MEN
    MEN = []
    model_MEN = []
    with open("MEN_dataset_natural_form_full", "r") as f:
        for line in f:
            line_list = line.split()
            word1 = line_list[0]
            word2 = line_list[1]
            if word1 in words_dict and  word2 in words_dict:
                MEN.append(float(line_list[2]))
                sim = cosine_similarity([words_dict[word1]], [words_dict[word2]])
                model_MEN.append(float(sim))
    # calculate pearson and spearman with MEN
    pearson_MEN = pearsonr(model_MEN, MEN)
    spearman_MEN = spearmanr(model_MEN, MEN)
    return[pearson_simlex, spearman_simlex, pearson_MEN, spearman_MEN]

In [6]:
[p_lex, s_lex, p_men, s_men] = get_pearson_spearman("bow2.words")
print('simlex correlations')
print(p_lex)
print(s_lex)
print('MEN correlations')
print(p_men)
print(s_men)

simlex correlations
(0.4284586618164498, 7.992996652236848e-46)
SpearmanrResult(correlation=0.41414576777339385, pvalue=1.226812869252797e-42)
MEN correlations
(0.6776982244699229, 0.0)
SpearmanrResult(correlation=0.699904755830819, pvalue=0.0)


In [7]:
[p_lex, s_lex, p_men, s_men] = get_pearson_spearman("bow5.words")
print('simlex correlations')
print(p_lex)
print(s_lex)
print('MEN correlations')
print(p_men)
print(s_men)

simlex correlations
(0.37560059706687154, 8.607410572298394e-35)
SpearmanrResult(correlation=0.36739613669787896, pvalue=2.9775781067162087e-33)
MEN correlations
(0.708236248047157, 0.0)
SpearmanrResult(correlation=0.7231686561368845, pvalue=0.0)


In [8]:
[p_lex, s_lex, p_men, s_men] = get_pearson_spearman("deps.words")
print('simlex correlations')
print(p_lex)
print(s_lex)
print('MEN correlations')
print(p_men)
print(s_men)

simlex correlations
(0.4619013442972357, 6.838935662413988e-54)
SpearmanrResult(correlation=0.44564093493303847, pvalue=7.414295711467646e-50)
MEN correlations
(0.5974016044666723, 1.0183783114142845e-289)
SpearmanrResult(correlation=0.6178227900207052, pvalue=2.37026036e-315)


# 2. Word Analogy Task

In [190]:
MODEL = 'deps'

In [191]:
from sklearn.preprocessing import normalize
import numpy as np
import time

#Load trained embeddings

words_dict = {}
inverse_words_dict = {}
with open('%s.words' % MODEL, "r") as f:
    for line in f:
        word = line.split()
        normalized = normalize([word[1:]]).flatten()
        words_dict[word[0]] = normalized

In [145]:
a = []
a_ = []
b = []
b_ = []
set_words = set(words_dict.keys())

with open('questions-words.txt', "r") as f:
        for line in f:
            curr = line.split()
            if(len(curr)==4):
                #if set(curr) <= set_words
                curr = [w.lower() for w in curr]
                if all(word in set_words for word in curr):
                    a.append(curr[0])
                    a_.append(curr[1])
                    b.append(curr[2])
                    b_.append(curr[3])
N = len(a)
a_vec = np.zeros(shape=[N, 300], dtype=np.float32)
a__vec = np.zeros(shape=[N, 300], dtype=np.float32)
b_vec = np.zeros(shape=[N, 300], dtype=np.float32)
b__vec_true = np.zeros(shape=[N, 300], dtype=np.float32)

for i in range(N):
    a_vec[i,:]=words_dict[a[i]]
    a__vec[i,:]=words_dict[a_[i]]
    b_vec[i,:]=words_dict[b[i]]
    b__vec_true[i,:]=words_dict[b_[i]]
    
    
#normalize
v_vec = np.subtract(a__vec, a_vec)
b__vec = b_vec + v_vec
b__vec = normalize(b__vec)

In [146]:
#Create query search matrix and index-to-word dictionaries for fast search in the ranking

indices = []
keys = []
values = []

i = 0
for k, v in words_dict.iteritems():
    keys.append(k) 
    values.append(v)
    indices.append(i)
    i+=1
    
search_space = np.asarray(values, dtype=np.float32)
search_words_to_idx = dict(zip(keys, indices))
search_idx_to_words = dict(zip(indices, keys))

In [148]:
#Split distcane computation in 2 parts due to memory contstraints (~10k queries each part)

start = time.time()
y = 1- np.dot(b__vec[:10000,:], np.transpose(search_space))
end = time.time()
print(end - start)

294.786999941


In [149]:
y.shape

(10000L, 174015L)

In [150]:
d1 = y.shape[0]
d2 = y.shape[1]

argmin_indices = np.zeros(shape=[d1, 2])
rank = np.zeros(shape=[d1], dtype=np.int32)

start = time.time()

for i in range(d1):
    if i%1000==0:
        print(i)
    sorted_args = list(np.argsort(y[i,:]))
    sorted_args.remove(search_words_to_idx[b[i]])

    for j in range(d2-1):
        if sorted_args[j] == search_words_to_idx[b_[i]]:
            rank[i] = j
            break
    argmin_indices[i,:] = sorted_args[0:2]
end = time.time()
np.save('%s_indices_first10k' % MODEL, argmin_indices)
np.save('%s_rank_first10k' % MODEL, rank)
print(end - start)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
378.212000132


In [151]:
start = time.time()
y = 1- np.dot(b__vec[10000:,:], np.transpose(search_space.astype(np.float32)))
end = time.time()
print(end - start)

172.269999981


In [152]:
y.shape

(9208L, 174015L)

In [153]:
d1 = y.shape[0]
d2 = y.shape[1]

argmin_indices = np.zeros(shape=[d1, 2])
rank = np.zeros(shape=[d1], dtype=np.int32)

start = time.time()

for i in range(d1):
    if i%1000==0:
        print(i)
    sorted_args = list(np.argsort(y[i,:]))
    sorted_args.remove(search_words_to_idx[b[i]])
    for j in range(d2-1):
        if sorted_args[j] == search_words_to_idx[b_[i+10000]]:
            rank[i] = j
            #print(j)
            break
    argmin_indices[i,:] = sorted_args[0:2]
end = time.time()
np.save('%s_indices_last10k' % MODEL, argmin_indices)
np.save('%s_rank_last10k' % MODEL, rank)
print(end - start)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
488.600000143


In [154]:
#Make results file 

first_idx = np.load('%s_indices_first10k.npy' % MODEL)
last_idx = np.load('%s_indices_last10k.npy' % MODEL)
indices = np.vstack([first_idx, last_idx])

output = []

f = open('%s_analogies_predictions' % MODEL, 'w')

for i in range(indices.shape[0]):
    #print(int(indices[i,0]), search_idx_to_words[indices[i,0]])
    #print(search_space[])
    found = search_idx_to_words[int(indices[i,0])]
    #####UNCOMMENT/COMMENT THIS DEPENDING ON TASK (include or exclude b word)
    if(found == b[i]):
        found = search_idx_to_words[indices[i,1]]
    f.write("%s %s %s %s %s\n" % (a[i], a_[i], b[i], b_[i], found))

In [194]:
#Save ranking indices in one file
rank_first10k = np.load('%s_rank_first10k.npy' % MODEL)
rank_last10k = np.load('%s_rank_last10k.npy' % MODEL)
rank = np.concatenate([rank_first10k, rank_last10k])
np.save('%s_rank.npy' % MODEL, rank)

#### Accuracy

In [186]:
def compute_accuracy(filename):
    gt, pred = [],[]
    with open(filename, "r") as f:
        for line in f:
            gt.append(line.split()[-2])
            pred.append(line.split()[-1])
    pred = np.asarray(pred)
    gt = np.asarray(gt)
    return sum(pred==gt)/float(gt.shape[0])

print('Accuracy - bow2 (incl. word origin): %s' % compute_accuracy('bow2_analogies_predictions_'))
print('Accuracy - bow2 (excl. word origin): %s' % compute_accuracy('bow2_analogies_predictions')) 
print('Accuracy - bow5 (incl. word origin): %s' % compute_accuracy('bow5_analogies_predictions_'))
print('Accuracy - bow5 (excl. word origin): %s' % compute_accuracy('bow5_analogies_predictions'))
print('Accuracy - deps (incl. word origin): %s' % compute_accuracy('deps_analogies_predictions_'))
print('Accuracy - deps (excl. word origin): %s' % compute_accuracy('deps_analogies_predictions'))

Accuracy - bow2 (incl. word origin): 0.08869041437324748
Accuracy - bow2 (excl. word origin): 0.4403884100114238
Accuracy - bow5 (incl. word origin): 0.10338560598192958
Accuracy - bow5 (excl. word origin): 0.5510956485616367
Accuracy - deps (incl. word origin): 0.026863806747188673
Accuracy - deps (excl. word origin): 0.27925864223240315


#### MRR

In [195]:
def compute_MRR(filename):
    rank = np.load(filename)
    return np.sum(1/(rank+1.0))/len(rank)
print('MRR - bow2 (incl. word origin): %s' % compute_MRR('bow2_rank_.npy'))
print('MRR - bow2 (excl. word origin): %s' % compute_MRR('bow2_rank.npy'))
print('MRR - bow5 (incl. word origin): %s' % compute_MRR('bow5_rank_.npy'))
print('MRR - bow5 (excl. word origin): %s' % compute_MRR('bow5_rank.npy'))
print('MRR - deps (incl. word origin): %s' % compute_MRR('deps_rank_.npy'))
print('MRR - deps (excl. word origin): %s' % compute_MRR('deps_rank.npy'))

MRR - bow2 (incl. word origin): 0.3729270407292133
MRR - bow2 (excl. word origin): 0.46102701566772697
MRR - bow5 (incl. word origin): 0.41304149587365585
MRR - bow5 (excl. word origin): 0.5443513025785637
MRR - deps (incl. word origin): 0.24132450803395314
MRR - deps (excl. word origin): 0.2667586110632994


# 3. Clustering Word Vectors

In [1]:
import string
from itertools import compress

def filter_cluster_embeddings(model_file):

    #Load 2000 nouns
    with open("2000_nouns_sorted.txt", "r") as f:
        nouns = f.read().splitlines()

    nouns = [noun.strip() for noun in nouns]
    
    #Filter punctuation from noun words   
    new_nouns = []
    for noun in nouns:
        for c in string.punctuation:
            noun=noun.replace(c,"")
        new_nouns.append(noun)
    nouns = new_nouns

    #Load trained embeddings
    words_dict = {}
    with open(model_file, "r") as f:
        for line in f:
            word = line.split()
            words_dict[word[0]] = word[1:] 
            
    #Filter 2000 words
    words_dict_2000 = { noun: words_dict[noun] for noun in nouns}
    
    #Get values by sorted keys    
    words = sorted(words_dict_2000.keys())
    embeddings = [words_dict_2000[key] for key in sorted(words_dict_2000.keys())]
    
    return words, embeddings

In [2]:
words_bow2, embeddings_bow2 = filter_cluster_embeddings("bow2.words")

In [3]:
words_bow5, embeddings_bow5 = filter_cluster_embeddings("bow5.words")

In [4]:
words_deps, embeddings_deps = filter_cluster_embeddings("deps.words")

### Visualization

In [5]:
# PCA
from sklearn.decomposition import PCA

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.palettes import d3
from bokeh.io import output_notebook
output_notebook()

def pca_visualization(num_components, embeddings, words, title):
    pca = PCA(n_components=num_components)
    pca_result = pca.fit_transform(embeddings)
    print('Variation per principal component: {}'.format(pca.explained_variance_ratio_))
    p = figure(tools="pan,wheel_zoom,reset,save",
               toolbar_location="above",
               title=title)
    source = ColumnDataSource(data=dict(x1=pca_result[:,0],
                                        x2=pca_result[:,1],
                                        names=words))
    
    p.scatter(x="x1", y="x2", size=8, source=source)
    labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)
    show(p)

In [6]:
pca_visualization(2, embeddings_bow2, words_bow2, "PCA-bow2")
pca_visualization(2, embeddings_bow5, words_bow5, "PCA-bow5")
pca_visualization(2, embeddings_deps, words_deps, "PCA-deps")

Variation per principal component: [0.03162729 0.02683353]


Variation per principal component: [0.04026231 0.03309658]


Variation per principal component: [0.03549472 0.03201278]


In [7]:
#t-SNE
from sklearn.manifold import TSNE

def tsne_visualization(words, embeddings, p, num_iter, title):
    tsne = TSNE(n_components=2, verbose=1, perplexity=p, n_iter=num_iter)
    tsne_results = tsne.fit_transform(embeddings)
    p = figure(tools="pan,wheel_zoom,reset,save",
               toolbar_location="above",
               title=title)
    source = ColumnDataSource(data=dict(x1=tsne_results[:,0],
                                        x2=tsne_results[:,1],
                                        names=words))
    
    p.scatter(x="x1", y="x2", size=8, source=source)
    labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)
    show(p)

In [8]:
tsne_visualization(words_bow2, embeddings_bow2, 10, 2000, "t-SNE bow2")
tsne_visualization(words_bow5, embeddings_bow5, 10, 2000, "t-SNE bow5")
tsne_visualization(words_deps, embeddings_deps, 10, 2000, "t-SNE deps")

[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1996 samples in 0.023s...
[t-SNE] Computed neighbors for 1996 samples in 3.184s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1996
[t-SNE] Computed conditional probabilities for sample 1996 / 1996
[t-SNE] Mean sigma: 0.268582
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.496704
[t-SNE] Error after 2000 iterations: 1.895542


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1996 samples in 0.021s...
[t-SNE] Computed neighbors for 1996 samples in 3.267s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1996
[t-SNE] Computed conditional probabilities for sample 1996 / 1996
[t-SNE] Mean sigma: 0.291273
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.339279
[t-SNE] Error after 2000 iterations: 1.781440


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1996 samples in 0.024s...
[t-SNE] Computed neighbors for 1996 samples in 3.009s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1996
[t-SNE] Computed conditional probabilities for sample 1996 / 1996
[t-SNE] Mean sigma: 0.264414
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.925812
[t-SNE] Error after 2000 iterations: 1.695150


### KMeans

In [45]:
# from sklearn.cluster import KMeans


# def kmeans_clustering(words, embeddings, num_clusters):
#     kmeans_model = KMeans(n_clusters=num_clusters, random_state=0)
#     kmeans_fit = kmeans_model.fit(embeddings)
#     kmeans_labels = kmeans_fit.labels_
#     for i in range(num_clusters):
#         print('Cluster %s: %s' % (i, list(compress(words, kmeans_labels==i))))

In [78]:
#kmeans_clustering(words_bow2, embeddings_bow2, 20)

In [79]:
#kmeans_clustering(words_bow5, embeddings_bow5, 20)

In [80]:
#kmeans_clustering(words_deps, embeddings_deps, 20)

### Agglomerative Clustering

In [5]:
from sklearn.cluster import AgglomerativeClustering


def agglomerative_clustering(words, embeddings, num_clusters):
    agglomerative_model = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    agglomerative_fit = agglomerative_model.fit(embeddings)
    agglomerative_labels = agglomerative_fit.labels_
    for i in range(num_clusters):
        print('Cluster %s: %s' % (i, list(compress(words, agglomerative_labels==i))))

In [9]:
agglomerative_clustering(words_bow2, embeddings_bow2, 100)

Cluster 0: ['april', 'autumn', 'february', 'june', 'march', 'spring', 'summer', 'winter']
Cluster 1: ['advertisement', 'advertising', 'cable', 'channel', 'insider', 'interview', 'journal', 'magazine', 'media', 'network', 'news', 'newspaper', 'paper', 'papers', 'press', 'print', 'publication', 'publicity', 'radio', 'television']
Cluster 2: ['arrow', 'ball', 'barrel', 'basket', 'blade', 'blanket', 'bottle', 'brush', 'bucket', 'bullet', 'candle', 'chain', 'cigarette', 'fist', 'flame', 'gear', 'glove', 'grass', 'grip', 'heel', 'hole', 'hook', 'kick', 'knife', 'knot', 'lamp', 'lock', 'mask', 'needle', 'pillow', 'pint', 'pipe', 'pitch', 'punch', 'ring', 'rope', 'shaft', 'shot', 'stair', 'stick', 'stitch', 'strand', 'sword', 'torch', 'towel', 'trap', 'tray', 'trick', 'tube', 'wheel', 'wire']
Cluster 3: ['angle', 'branch', 'bridge', 'canal', 'corridor', 'curve', 'direction', 'distance', 'flow', 'line', 'pass', 'passage', 'path', 'rail', 'railway', 'river', 'road', 'route', 'slope', 'stream', '

In [10]:
agglomerative_clustering(words_bow5, embeddings_bow5, 100)

Cluster 0: ['architecture', 'audience', 'beauty', 'crowd', 'design', 'drawing', 'fashion', 'glimpse', 'insight', 'manner', 'mirror', 'observation', 'observer', 'painting', 'perspective', 'reader', 'reflection', 'style', 'tone', 'view', 'viewer']
Cluster 1: ['accountant', 'adviser', 'analyst', 'assistant', 'auditor', 'chair', 'chairman', 'chancellor', 'chief', 'clerk', 'coach', 'commentator', 'commissioner', 'constable', 'consultant', 'deputy', 'director', 'executive', 'governor', 'head', 'inspector', 'leader', 'manager', 'member', 'organiser', 'participant', 'president', 'secretary', 'solicitor', 'spokesman', 'supporter', 'trustee']
Cluster 2: ['admission', 'certificate', 'clinic', 'curriculum', 'education', 'examination', 'expert', 'hospital', 'inspection', 'instruction', 'leadership', 'medicine', 'membership', 'nurse', 'practice', 'practitioner', 'preparation', 'profession', 'qualification', 'specialist', 'surgery', 'teaching', 'training', 'youth']
Cluster 3: ['adam', 'benjamin', 'bi

In [11]:
agglomerative_clustering(words_deps, embeddings_deps, 100)

Cluster 0: ['agenda', 'approach', 'attention', 'attitude', 'background', 'bias', 'concern', 'coverage', 'criticism', 'effect', 'emphasis', 'experience', 'expertise', 'exposure', 'focus', 'impact', 'influence', 'insight', 'knowledge', 'news', 'perspective', 'popularity', 'priority', 'publicity', 'reception', 'reputation', 'stance', 'success', 'view']
Cluster 1: ['allowance', 'budget', 'cash', 'coin', 'compensation', 'credit', 'currency', 'debt', 'dividend', 'earnings', 'expenditure', 'fortune', 'gold', 'income', 'insurance', 'investment', 'loan', 'money', 'mortgage', 'payment', 'pension', 'premium', 'price', 'rent', 'revenue', 'salary', 'savings', 'share', 'spending', 'stake', 'stock', 'subsidy', 'wages', 'wealth']
Cluster 2: ['account', 'address', 'balance', 'cast', 'cause', 'charge', 'chart', 'command', 'control', 'cook', 'copy', 'count', 'counter', 'cross', 'date', 'deal', 'deposit', 'display', 'document', 'draft', 'exercise', 'file', 'filter', 'flag', 'grave', 'grip', 'handle', 'hol

### Density-based Clustering (e.g: DBSCAN)

In [55]:
from sklearn.cluster import DBSCAN

def dbscan_clustering(words, embeddings, min_samples, epsilon):
    dbscan_model = DBSCAN(eps=epsilon, min_samples=min_samples)
    dbscan_fit = dbscan_model.fit(embeddings)
    dbscan_labels = dbscan_fit.labels_
    num_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
    print('Num clusters: %s' % num_clusters)
    for i in range(num_clusters):
        print('Cluster %s: %s' % (i, list(compress(words, dbscan_labels==i))))

In [56]:
dbscan_clustering(words_bow2, embeddings_bow2, 4, 0.9)

Num clusters: 26
Cluster 0: ['accountant', 'adviser', 'analyst', 'artist', 'auditor', 'author', 'clerk', 'commentator', 'constable', 'consultant', 'councillor', 'critic', 'historian', 'journalist', 'judge', 'lawyer', 'magistrate', 'photographer', 'poet', 'politician', 'publisher', 'reporter', 'researcher', 'scholar', 'scientist', 'solicitor', 'writer']
Cluster 1: ['afternoon', 'evening', 'morning', 'night']
Cluster 2: ['alice', 'anna', 'anne', 'caroline', 'diana', 'elizabeth', 'emily', 'helen', 'isabel', 'jane', 'kate', 'laura', 'lucy', 'maggie', 'marie', 'mary', 'rachel', 'robyn', 'ruth', 'sarah', 'susan']
Cluster 3: ['analysis', 'assessment', 'calculation', 'evaluation', 'inspection', 'measurement']
Cluster 4: ['anger', 'anxiety', 'confusion', 'contempt', 'disappointment', 'embarrassment', 'emotion', 'enthusiasm', 'excitement', 'frustration', 'hostility', 'laughter', 'pain', 'passion', 'respect', 'sympathy', 'uncertainty', 'warmth']
Cluster 5: ['ankle', 'breast', 'brow', 'cheek', 'ch

In [65]:
dbscan_clustering(words_bow5, embeddings_bow5, 4, 0.9)

Num clusters: 35
Cluster 0: ['accountant', 'actor', 'adviser', 'analyst', 'architect', 'artist', 'assistant', 'auditor', 'author', 'cabinet', 'captain', 'chair', 'chairman', 'chief', 'clerk', 'command', 'commander', 'commentator', 'commissioner', 'constable', 'consultant', 'court', 'critic', 'dancer', 'deputy', 'designer', 'director', 'engineer', 'executive', 'farmer', 'head', 'historian', 'inspector', 'journalist', 'judge', 'justice', 'lawyer', 'magistrate', 'manager', 'minister', 'ministry', 'officer', 'photographer', 'poet', 'politician', 'president', 'prime', 'producer', 'publisher', 'reporter', 'researcher', 'scholar', 'scientist', 'secretary', 'sergeant', 'solicitor', 'trustee', 'writer']
Cluster 1: ['adam', 'alice', 'anna', 'anne', 'baker', 'benjamin', 'blanche', 'brother', 'caroline', 'charles', 'charlie', 'clarke', 'colleague', 'corbett', 'cousin', 'daughter', 'david', 'diana', 'edward', 'elizabeth', 'emily', 'father', 'francis', 'friend', 'george', 'graham', 'harry', 'helen',

In [69]:
dbscan_clustering(words_deps, embeddings_deps, 4, 0.9)

Num clusters: 34
Cluster 0: ['ability', 'acceptance', 'accountant', 'accuracy', 'achievement', 'acquisition', 'action', 'actor', 'admission', 'adoption', 'advice', 'adviser', 'affair', 'agenda', 'agreement', 'aircraft', 'album', 'allegation', 'allocation', 'allowance', 'ambassador', 'ambition', 'amendment', 'amount', 'analysis', 'analyst', 'anger', 'animal', 'ankle', 'announcement', 'answer', 'anxiety', 'apartment', 'applicant', 'application', 'approval', 'architect', 'argument', 'arrangement', 'artist', 'assessment', 'asset', 'assistance', 'assistant', 'assumption', 'attitude', 'auditor', 'author', 'awareness', 'bacterium', 'barrel', 'basket', 'bathroom', 'beauty', 'bedroom', 'beer', 'behaviour', 'belief', 'benefit', 'bike', 'bill', 'bird', 'blade', 'blanket', 'boat', 'bomb', 'bone', 'book', 'boot', 'bottle', 'brain', 'bread', 'breakfast', 'breast', 'breeze', 'brow', 'bucket', 'budget', 'bulk', 'buyer', 'cable', 'cake', 'calculation', 'cancer', 'candle', 'capability', 'capacity', 'cap