# 3. Word Analogy Task

# 4. Clustering Word Vectors

In [1]:
import string
from itertools import compress

def filter_cluster_embeddings(model_file, nouns_file="2000_nouns_sorted.txt"):
    """Load the embeddings of the nouns in ``nouns_file`` from ``model_file``.

    Args:
        model_file: Path to a whitespace-separated text file with one entry
            per line: the word followed by the components of its vector.
        nouns_file: Path to a text file with one noun per line. Surrounding
            whitespace and every ``string.punctuation`` character are removed
            from each noun; lines that end up empty are ignored.

    Returns:
        A ``(words, embeddings)`` tuple. ``words`` is the alphabetically
        sorted list of distinct nouns and ``embeddings[i]`` is the vector of
        ``words[i]`` as a list of floats.

    Raises:
        KeyError: If at least one noun has no entry in ``model_file``.
        ValueError: If a vector component of a selected noun is not a number.
    """
    # Load the nouns; a set drops duplicates, exactly as the dict keys did.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    with open(nouns_file, "r") as f:
        nouns = {line.strip().translate(strip_punctuation) for line in f}
    nouns.discard("")  # blank / punctuation-only lines carry no noun

    # Load the trained embeddings, keeping only the requested nouns so the
    # full vocabulary never has to be held in memory.
    vectors = {}
    with open(model_file, "r") as f:
        for line in f:
            fields = line.split()
            # Skip blank lines and words that were not asked for.
            if not fields or fields[0] not in nouns:
                continue
            # Convert to float here: the file stores text, and the numeric
            # consumers downstream should not have to parse strings.
            vectors[fields[0]] = [float(value) for value in fields[1:]]

    missing = sorted(nouns.difference(vectors))
    if missing:
        raise KeyError("no embedding found in %s for: %s"
                       % (model_file, ", ".join(missing)))

    # Return the vectors in the order of the sorted words.
    words = sorted(vectors)
    embeddings = [vectors[word] for word in words]

    return words, embeddings

In [2]:
# Noun vectors from the "bow2.words" embedding file.
words_bow2, embeddings_bow2 = filter_cluster_embeddings(model_file="bow2.words")

In [3]:
# Noun vectors from the "bow5.words" embedding file.
words_bow5, embeddings_bow5 = filter_cluster_embeddings(model_file="bow5.words")

In [4]:
# Noun vectors from the "deps.words" embedding file.
words_deps, embeddings_deps = filter_cluster_embeddings(model_file="deps.words")

### Visualization

In [20]:
# PCA
from sklearn.decomposition import PCA

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.palettes import d3
from bokeh.io import output_notebook
# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

def pca_visualization(num_components, embeddings, words, title):
    """Plot the words along the first two principal components.

    Fits a PCA with ``num_components`` components on ``embeddings``, prints
    the explained-variance ratio of every component and shows a Bokeh scatter
    plot in which each point is labelled with the matching entry of ``words``.

    Args:
        num_components: Number of principal components to compute. Only the
            first two are plotted.
        embeddings: Word vectors, one row per word.
        words: Labels for the rows of ``embeddings``, in the same order.
        title: Title of the figure.
    """
    reducer = PCA(n_components=num_components)
    projected = reducer.fit_transform(embeddings)
    print('Variation per principal component: {}'.format(reducer.explained_variance_ratio_))

    plot = figure(
        tools="pan,wheel_zoom,reset,save",
        toolbar_location="above",
        title=title,
    )

    # A single data source feeds both the markers and their text labels.
    plot_data = dict(x1=projected[:, 0], x2=projected[:, 1], names=words)
    source = ColumnDataSource(data=plot_data)

    plot.scatter(x="x1", y="x2", size=8, source=source)
    word_labels = LabelSet(
        x="x1", y="x2", text="names", y_offset=6,
        text_font_size="8pt", text_color="#555555",
        source=source, text_align='center',
    )
    plot.add_layout(word_labels)
    show(plot)

In [21]:
# 2-component PCA plot for each of the three embedding sets.
pca_visualization(num_components=2, embeddings=embeddings_bow2, words=words_bow2, title="PCA-bow2")
pca_visualization(num_components=2, embeddings=embeddings_bow5, words=words_bow5, title="PCA-bow5")
pca_visualization(num_components=2, embeddings=embeddings_deps, words=words_deps, title="PCA-deps")

Variation per principal component: [0.03162729 0.02683354]


Variation per principal component: [0.04026231 0.03309658]


Variation per principal component: [0.03549473 0.03201278]


In [27]:
#t-SNE
from sklearn.manifold import TSNE

def tsne_visualization(words, embeddings, p, num_iter, title, random_state=None):
    """Plot a 2-D t-SNE projection of the word vectors.

    Fits a two-component t-SNE on ``embeddings`` (with verbose progress
    output) and shows a Bokeh scatter plot in which each point is labelled
    with the matching entry of ``words``.

    Args:
        words: Labels for the rows of ``embeddings``, in the same order.
        embeddings: Word vectors, one row per word.
        p: t-SNE perplexity.
        num_iter: Maximum number of optimisation iterations.
        title: Title of the figure.
        random_state: Optional seed forwarded to ``TSNE``. The default
            ``None`` leaves the projection unseeded; pass an int to obtain
            the same layout on every run.
    """
    import inspect

    # Newer scikit-learn releases renamed ``n_iter`` to ``max_iter``; use
    # whichever keyword the installed TSNE accepts.
    tsne_params = inspect.signature(TSNE).parameters
    iter_kwarg = "max_iter" if "max_iter" in tsne_params else "n_iter"

    tsne = TSNE(n_components=2, verbose=1, perplexity=p,
                random_state=random_state, **{iter_kwarg: num_iter})
    tsne_results = tsne.fit_transform(embeddings)

    # Use a dedicated name for the figure so the perplexity argument ``p``
    # is not overwritten.
    plot = figure(tools="pan,wheel_zoom,reset,save",
                  toolbar_location="above",
                  title=title)
    source = ColumnDataSource(data=dict(x1=tsne_results[:, 0],
                                        x2=tsne_results[:, 1],
                                        names=words))

    plot.scatter(x="x1", y="x2", size=8, source=source)
    labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    plot.add_layout(labels)
    show(plot)

In [29]:
# t-SNE plot (perplexity 10, 2000 iterations) for each embedding set.
tsne_visualization(words=words_bow2, embeddings=embeddings_bow2, p=10, num_iter=2000, title="t-SNE bow2")
tsne_visualization(words=words_bow5, embeddings=embeddings_bow5, p=10, num_iter=2000, title="t-SNE bow5")
tsne_visualization(words=words_deps, embeddings=embeddings_deps, p=10, num_iter=2000, title="t-SNE deps")

[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1996 samples in 0.030s...
[t-SNE] Computed neighbors for 1996 samples in 3.621s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1996
[t-SNE] Computed conditional probabilities for sample 1996 / 1996
[t-SNE] Mean sigma: 0.268582
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.225777
[t-SNE] Error after 2000 iterations: 1.835670


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1996 samples in 0.025s...
[t-SNE] Computed neighbors for 1996 samples in 3.136s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1996
[t-SNE] Computed conditional probabilities for sample 1996 / 1996
[t-SNE] Mean sigma: 0.291273
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.338448
[t-SNE] Error after 2000 iterations: 1.785992


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1996 samples in 0.020s...
[t-SNE] Computed neighbors for 1996 samples in 2.600s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1996
[t-SNE] Computed conditional probabilities for sample 1996 / 1996
[t-SNE] Mean sigma: 0.264414
[t-SNE] KL divergence after 250 iterations with early exaggeration: 89.543404
[t-SNE] Error after 2000 iterations: 1.688525


### KMeans

In [30]:
from sklearn.cluster import KMeans


def kmeans_clustering(words, embeddings, num_clusters):
    """Cluster the word vectors with k-means and print each cluster's words.

    Args:
        words: Labels for the rows of ``embeddings``, in the same order.
        embeddings: Word vectors, one row per word.
        num_clusters: Number of clusters to fit; ``random_state`` is fixed
            to 0 in the model.
    """
    labels = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings).labels_
    for cluster_id in range(num_clusters):
        # Words whose assigned label is the current cluster.
        members = [word for word, label in zip(words, labels) if label == cluster_id]
        print('Cluster %s: %s' % (cluster_id, members))

In [31]:
# k-means with 20 clusters on the bow2 vectors.
kmeans_clustering(words=words_bow2, embeddings=embeddings_bow2, num_clusters=20)

Cluster 0: ['acre', 'apartment', 'area', 'barrier', 'bath', 'beach', 'bedroom', 'bell', 'block', 'border', 'boundary', 'breeze', 'brick', 'bridge', 'building', 'camp', 'canal', 'castle', 'cathedral', 'chamber', 'chapel', 'church', 'circle', 'city', 'cliff', 'complex', 'corner', 'corridor', 'cottage', 'countryside', 'county', 'cross', 'district', 'doorway', 'downstairs', 'east', 'entrance', 'estate', 'factory', 'farm', 'floor', 'forest', 'gallery', 'garage', 'garden', 'gate', 'grave', 'ground', 'grounds', 'hall', 'hill', 'home', 'horizon', 'hospital', 'hotel', 'house', 'interior', 'island', 'kitchen', 'lake', 'landscape', 'lane', 'line', 'location', 'middle', 'mile', 'mill', 'mine', 'mountain', 'nest', 'north', 'ocean', 'palace', 'park', 'passage', 'pond', 'pool', 'port', 'prison', 'prospect', 'quarter', 'railway', 'restaurant', 'ridge', 'river', 'road', 'room', 'route', 'seat', 'section', 'settlement', 'shelf', 'shelter', 'shop', 'shore', 'side', 'site', 'slope', 'south', 'square', 'st

In [32]:
# k-means with 20 clusters on the bow5 vectors.
kmeans_clustering(words=words_bow5, embeddings=embeddings_bow5, num_clusters=20)

Cluster 0: ['agency', 'airline', 'association', 'authority', 'bank', 'board', 'branch', 'brand', 'business', 'capital', 'chain', 'chamber', 'chapter', 'charity', 'clinic', 'commission', 'committee', 'community', 'company', 'corp', 'corporation', 'council', 'department', 'division', 'entity', 'facility', 'factory', 'family', 'firm', 'foundation', 'group', 'headquarters', 'holding', 'hospital', 'industry', 'institute', 'institution', 'joint', 'journal', 'label', 'library', 'major', 'manufacturer', 'media', 'ministry', 'museum', 'network', 'newspaper', 'office', 'operation', 'operator', 'organisation', 'organization', 'parent', 'partnership', 'plant', 'press', 'public', 'sector', 'shareholder', 'society', 'stake', 'store', 'subsidiary', 'supplier', 'telephone', 'trust', 'unit', 'university', 'venture', 'workshop']
Cluster 1: ['access', 'accommodation', 'acquisition', 'administration', 'advance', 'advice', 'allocation', 'analysis', 'application', 'architecture', 'assessment', 'assistance',

In [33]:
# k-means with 20 clusters on the deps vectors.
kmeans_clustering(words=words_deps, embeddings=embeddings_deps, num_clusters=20)

Cluster 0: ['administration', 'agency', 'airline', 'alliance', 'army', 'assembly', 'association', 'authority', 'award', 'bank', 'board', 'branch', 'cabinet', 'campaign', 'ceremony', 'chamber', 'championship', 'chapter', 'charity', 'circuit', 'clinic', 'club', 'coalition', 'college', 'commission', 'committee', 'community', 'company', 'competition', 'concert', 'conference', 'congress', 'convention', 'corp', 'corporation', 'council', 'court', 'crew', 'delegation', 'department', 'division', 'embassy', 'enterprise', 'event', 'exhibition', 'expedition', 'faction', 'family', 'festival', 'firm', 'fleet', 'foundation', 'fund', 'gang', 'government', 'group', 'industry', 'initiative', 'institute', 'institution', 'jury', 'laboratory', 'league', 'medal', 'meeting', 'ministry', 'mission', 'movement', 'network', 'office', 'organisation', 'organization', 'panel', 'parliament', 'partnership', 'party', 'police', 'prize', 'program', 'programme', 'project', 'regime', 'school', 'sector', 'service', 'sessio

### Agglomerative Clustering

In [34]:
from sklearn.cluster import AgglomerativeClustering


def agglomerative_clustering(words, embeddings, num_clusters):
    """Cluster the word vectors agglomeratively and print each cluster's words.

    Uses ``AgglomerativeClustering`` with Ward linkage.

    Args:
        words: Labels for the rows of ``embeddings``, in the same order.
        embeddings: Word vectors, one row per word.
        num_clusters: Number of clusters to produce.
    """
    model = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    labels = model.fit(embeddings).labels_
    for cluster_id in range(num_clusters):
        # Words whose assigned label is the current cluster.
        members = [word for word, label in zip(words, labels) if label == cluster_id]
        print('Cluster %s: %s' % (cluster_id, members))

In [35]:
# Ward-linkage agglomerative clustering with 20 clusters on the bow2 vectors.
agglomerative_clustering(words=words_bow2, embeddings=embeddings_bow2, num_clusters=20)

Cluster 0: ['advertisement', 'advertising', 'allowance', 'applicant', 'asset', 'bank', 'budget', 'burden', 'business', 'buyer', 'cable', 'cash', 'channel', 'cheque', 'client', 'coin', 'collector', 'company', 'compensation', 'consumer', 'consumption', 'corp', 'corporation', 'cost', 'credit', 'creditor', 'currency', 'customer', 'dealer', 'debt', 'defendant', 'demand', 'dividend', 'dollar', 'earnings', 'electricity', 'employee', 'employer', 'employment', 'energy', 'enterprise', 'equipment', 'estate', 'expenditure', 'expense', 'export', 'finance', 'firm', 'fortune', 'gift', 'goods', 'import', 'income', 'industry', 'insider', 'insurance', 'interview', 'investment', 'investor', 'journal', 'land', 'landlord', 'load', 'loan', 'machinery', 'magazine', 'maker', 'manufacturer', 'market', 'media', 'money', 'mortgage', 'network', 'news', 'newspaper', 'offender', 'owner', 'paper', 'papers', 'patient', 'payment', 'penny', 'pension', 'people', 'person', 'plaintiff', 'portfolio', 'post', 'pound', 'powe

In [36]:
# Ward-linkage agglomerative clustering with 20 clusters on the bow5 vectors.
agglomerative_clustering(words=words_bow5, embeddings=embeddings_bow5, num_clusters=20)

Cluster 0: ['achievement', 'acre', 'addition', 'adult', 'animal', 'array', 'award', 'back', 'ball', 'basket', 'best', 'bike', 'blanket', 'blow', 'bowl', 'break', 'career', 'century', 'challenge', 'champion', 'championship', 'change', 'chart', 'check', 'citizen', 'class', 'club', 'community', 'competition', 'competitor', 'contest', 'contribution', 'count', 'counter', 'couple', 'crack', 'current', 'date', 'decade', 'defender', 'desk', 'distance', 'division', 'draft', 'draw', 'drive', 'enemy', 'entry', 'equivalent', 'event', 'face', 'female', 'field', 'final', 'football', 'format', 'forward', 'future', 'gain', 'gaze', 'glance', 'gold', 'grade', 'grin', 'ground', 'habit', 'half', 'handful', 'hold', 'holder', 'holding', 'honour', 'horse', 'hour', 'human', 'hundred', 'inch', 'individual', 'inhabitant', 'joke', 'journey', 'kick', 'kiss', 'laugh', 'league', 'left', 'level', 'living', 'look', 'major', 'majority', 'male', 'match', 'medal', 'merit', 'mess', 'metre', 'middle', 'mile', 'million', '

In [37]:
# Ward-linkage agglomerative clustering with 20 clusters on the deps vectors.
agglomerative_clustering(words=words_deps, embeddings=embeddings_deps, num_clusters=20)

Cluster 0: ['access', 'advertisement', 'advertising', 'album', 'alternative', 'application', 'arrangement', 'article', 'aspect', 'asset', 'basis', 'bible', 'book', 'brand', 'calculation', 'catalogue', 'category', 'chance', 'chapter', 'characteristic', 'choice', 'circumstance', 'class', 'code', 'collection', 'combination', 'companion', 'component', 'composition', 'condition', 'content', 'contents', 'database', 'datum', 'defect', 'determination', 'diary', 'diet', 'drama', 'element', 'equation', 'equivalent', 'essay', 'estimate', 'factor', 'feature', 'film', 'formula', 'fragment', 'framework', 'function', 'grade', 'ground', 'grounds', 'guide', 'habit', 'headline', 'hierarchy', 'history', 'horror', 'host', 'image', 'implementation', 'impulse', 'incentive', 'index', 'information', 'ingredient', 'input', 'interface', 'link', 'links', 'list', 'location', 'material', 'measure', 'measurement', 'mechanism', 'medium', 'method', 'mixture', 'mode', 'model', 'movie', 'mystery', 'name', 'norm', 'nove

### Density-based Clustering (e.g., DBSCAN)

In [38]:
from sklearn.cluster import DBSCAN

def dbscan_clustering(words, embeddings, min_samples, epsilon):
    """Cluster the word vectors with DBSCAN and print each cluster's words.

    Prints the number of clusters found, then the words of clusters
    ``0 .. num_clusters - 1``. Points labelled ``-1`` are neither counted as
    a cluster nor printed.

    Args:
        words: Labels for the rows of ``embeddings``, in the same order.
        embeddings: Word vectors, one row per word.
        min_samples: Passed to ``DBSCAN`` as ``min_samples``.
        epsilon: Passed to ``DBSCAN`` as ``eps``.
    """
    labels = DBSCAN(eps=epsilon, min_samples=min_samples).fit(embeddings).labels_

    # Count the distinct labels, leaving out the special label -1.
    num_clusters = sum(1 for label in set(labels) if label != -1)
    print('Num clusters: %s' % num_clusters)

    for cluster_id in range(num_clusters):
        members = [word for word, label in zip(words, labels) if label == cluster_id]
        print('Cluster %s: %s' % (cluster_id, members))

In [39]:
# DBSCAN (min_samples=4, eps=0.9) on the bow2 vectors.
dbscan_clustering(words=words_bow2, embeddings=embeddings_bow2, min_samples=4, epsilon=0.9)

Num clusters: 26
Cluster 0: ['accountant', 'adviser', 'analyst', 'artist', 'auditor', 'author', 'clerk', 'commentator', 'constable', 'consultant', 'councillor', 'critic', 'historian', 'journalist', 'judge', 'lawyer', 'magistrate', 'photographer', 'poet', 'politician', 'publisher', 'reporter', 'researcher', 'scholar', 'scientist', 'solicitor', 'writer']
Cluster 1: ['afternoon', 'evening', 'morning', 'night']
Cluster 2: ['alice', 'anna', 'anne', 'caroline', 'diana', 'elizabeth', 'emily', 'helen', 'isabel', 'jane', 'kate', 'laura', 'lucy', 'maggie', 'marie', 'mary', 'rachel', 'robyn', 'ruth', 'sarah', 'susan']
Cluster 3: ['analysis', 'assessment', 'calculation', 'evaluation', 'inspection', 'measurement']
Cluster 4: ['anger', 'anxiety', 'confusion', 'contempt', 'disappointment', 'embarrassment', 'emotion', 'enthusiasm', 'excitement', 'frustration', 'hostility', 'laughter', 'pain', 'passion', 'respect', 'sympathy', 'uncertainty', 'warmth']
Cluster 5: ['ankle', 'breast', 'brow', 'cheek', 'ch

In [40]:
# DBSCAN (min_samples=4, eps=0.9) on the bow5 vectors.
dbscan_clustering(words=words_bow5, embeddings=embeddings_bow5, min_samples=4, epsilon=0.9)

Num clusters: 35
Cluster 0: ['accountant', 'actor', 'adviser', 'analyst', 'architect', 'artist', 'assistant', 'auditor', 'author', 'cabinet', 'captain', 'chair', 'chairman', 'chief', 'clerk', 'command', 'commander', 'commentator', 'commissioner', 'constable', 'consultant', 'court', 'critic', 'dancer', 'deputy', 'designer', 'director', 'engineer', 'executive', 'farmer', 'head', 'historian', 'inspector', 'journalist', 'judge', 'justice', 'lawyer', 'magistrate', 'manager', 'minister', 'ministry', 'officer', 'photographer', 'poet', 'politician', 'president', 'prime', 'producer', 'publisher', 'reporter', 'researcher', 'scholar', 'scientist', 'secretary', 'sergeant', 'solicitor', 'trustee', 'writer']
Cluster 1: ['adam', 'alice', 'anna', 'anne', 'baker', 'benjamin', 'blanche', 'brother', 'caroline', 'charles', 'charlie', 'clarke', 'colleague', 'corbett', 'cousin', 'daughter', 'david', 'diana', 'edward', 'elizabeth', 'emily', 'father', 'francis', 'friend', 'george', 'graham', 'harry', 'helen',

In [41]:
# DBSCAN (min_samples=4, eps=0.9) on the deps vectors.
dbscan_clustering(words=words_deps, embeddings=embeddings_deps, min_samples=4, epsilon=0.9)

Num clusters: 34
Cluster 0: ['ability', 'acceptance', 'accountant', 'accuracy', 'achievement', 'acquisition', 'action', 'actor', 'admission', 'adoption', 'advice', 'adviser', 'affair', 'agenda', 'agreement', 'aircraft', 'album', 'allegation', 'allocation', 'allowance', 'ambassador', 'ambition', 'amendment', 'amount', 'analysis', 'analyst', 'anger', 'animal', 'ankle', 'announcement', 'answer', 'anxiety', 'apartment', 'applicant', 'application', 'approval', 'architect', 'argument', 'arrangement', 'artist', 'assessment', 'asset', 'assistance', 'assistant', 'assumption', 'attitude', 'auditor', 'author', 'awareness', 'bacterium', 'barrel', 'basket', 'bathroom', 'beauty', 'bedroom', 'beer', 'behaviour', 'belief', 'benefit', 'bike', 'bill', 'bird', 'blade', 'blanket', 'boat', 'bomb', 'bone', 'book', 'boot', 'bottle', 'brain', 'bread', 'breakfast', 'breast', 'breeze', 'brow', 'bucket', 'budget', 'bulk', 'buyer', 'cable', 'cake', 'calculation', 'cancer', 'candle', 'capability', 'capacity', 'cap