**The purpose of this section is to illustrate the idea of random projections preserving structure with the concrete example of word vectors.**

# 1)- Importing key modules

In [0]:
import pickle
import numpy as np
import re
import json

In [0]:
np.set_printoptions(precision=4, suppress=True)

# 2)- Loading data

In [3]:
! wget http://files.fast.ai/models/glove_50_glove_100.tgz 
! tar xvzf glove_50_glove_100.tgz

--2019-08-24 20:59:37--  http://files.fast.ai/models/glove_50_glove_100.tgz
Resolving files.fast.ai (files.fast.ai)... 67.205.15.147
Connecting to files.fast.ai (files.fast.ai)|67.205.15.147|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 225083583 (215M) [application/x-gtar-compressed]
Saving to: ‘glove_50_glove_100.tgz.1’


2019-08-24 20:59:39 (105 MB/s) - ‘glove_50_glove_100.tgz.1’ saved [225083583/225083583]

glove_vectors_100d.npy
glove_vectors_50d.npy
words.txt
wordsidx.txt


In [0]:
vecs = np.load("glove_vectors_100d.npy")

In [0]:
# Read the vocabulary file: one entry per line, surrounding whitespace stripped.
with open("words.txt") as f:
    content = list(f)
words = list(map(str.strip, content))

In [0]:
# Word -> row-index lookup. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("wordsidx.txt") as f:
    wordidx = json.load(f)

# 3)- Exploring data

In [7]:
len(words)

400000

In [8]:
vecs[wordidx['python']]

array([ 0.2493,  0.6832, -0.0447, -1.3842, -0.0073,  0.651 , -0.3396,
       -0.1979, -0.3392,  0.2669, -0.0331,  0.1592,  0.8955,  0.54  ,
       -0.5582,  0.4624,  0.3672,  0.1889,  0.8319,  0.8142, -0.1183,
       -0.5346,  0.2416, -0.0389,  1.1907,  0.7935, -0.1231,  0.6642,
       -0.7762, -0.4571, -1.054 , -0.2056, -0.133 ,  0.1224,  0.8846,
        1.024 ,  0.3229,  0.821 , -0.0694,  0.0242, -0.5142,  0.8727,
        0.2576,  0.9153, -0.6422,  0.0412, -0.6021,  0.5463,  0.6608,
        0.198 , -1.1393,  0.7951,  0.4597, -0.1846, -0.6413, -0.2493,
       -0.4019, -0.5079,  0.8058,  0.5336,  0.5273,  0.3925, -0.2988,
        0.0096,  0.9995, -0.0613,  0.7194,  0.329 , -0.0528,  0.6714,
       -0.8025, -0.2579,  0.4961,  0.4808, -0.684 , -0.0122,  0.0482,
        0.2946,  0.2061,  0.3356, -0.6417, -0.6471,  0.1338, -0.1257,
       -0.4638,  1.3878,  0.9564, -0.0679, -0.0017,  0.5296,  0.4567,
        0.6104, -0.1151,  0.4263,  0.1734, -0.7995, -0.245 , -0.6089,
       -0.3847, -0.4

In [9]:
words[:10]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]

In [10]:
words[600:610]

['together',
 'congress',
 'index',
 'australia',
 'results',
 'hard',
 'hours',
 'land',
 'action',
 'higher']

In [11]:
wordidx['python']

20019

In [12]:
words[20019]

'python'

# 4)-Words as vectors

In [13]:
vecs[wordidx['python']]

array([ 0.2493,  0.6832, -0.0447, -1.3842, -0.0073,  0.651 , -0.3396,
       -0.1979, -0.3392,  0.2669, -0.0331,  0.1592,  0.8955,  0.54  ,
       -0.5582,  0.4624,  0.3672,  0.1889,  0.8319,  0.8142, -0.1183,
       -0.5346,  0.2416, -0.0389,  1.1907,  0.7935, -0.1231,  0.6642,
       -0.7762, -0.4571, -1.054 , -0.2056, -0.133 ,  0.1224,  0.8846,
        1.024 ,  0.3229,  0.821 , -0.0694,  0.0242, -0.5142,  0.8727,
        0.2576,  0.9153, -0.6422,  0.0412, -0.6021,  0.5463,  0.6608,
        0.198 , -1.1393,  0.7951,  0.4597, -0.1846, -0.6413, -0.2493,
       -0.4019, -0.5079,  0.8058,  0.5336,  0.5273,  0.3925, -0.2988,
        0.0096,  0.9995, -0.0613,  0.7194,  0.329 , -0.0528,  0.6714,
       -0.8025, -0.2579,  0.4961,  0.4808, -0.684 , -0.0122,  0.0482,
        0.2946,  0.2061,  0.3356, -0.6417, -0.6471,  0.1338, -0.1257,
       -0.4638,  1.3878,  0.9564, -0.0679, -0.0017,  0.5296,  0.4567,
        0.6104, -0.1151,  0.4263,  0.1734, -0.7995, -0.245 , -0.6089,
       -0.3847, -0.4

In [0]:
from scipy.spatial.distance import cosine as dist

In [15]:
dist(vecs[wordidx["puppy"]], vecs[wordidx["dog"]])

0.27636247873306274

In [16]:
dist(vecs[wordidx["queen"]], vecs[wordidx["princess"]])

0.20527541637420654

In [17]:
dist(vecs[wordidx["celebrity"]], vecs[wordidx["dusty"]])

0.9883578838780522

In [18]:
dist(vecs[wordidx["avalanche"]], vecs[wordidx["antique"]])

0.9621107056736946

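### Bias

Word embeddings can encode social biases: in the two cells below, "genius" is closer (smaller cosine distance) to "man" than to "woman".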

In [19]:
dist(vecs[wordidx["man"]], vecs[wordidx["genius"]])

0.5098515152931213

In [20]:
dist(vecs[wordidx["woman"]], vecs[wordidx["genius"]])

0.689783364534378

# 5)-Visualizations

In [0]:
import plotly
import plotly.graph_objs as go    
from IPython.display import IFrame

In [0]:
def plotly_3d(Y, cat_labels, filename="temp-plot.html", words=None, group_size=5):
    """Write an interactive 3-D scatter plot of grouped points to an HTML file.

    Rows of ``Y`` are taken to be ordered by category: rows
    ``i*group_size .. (i+1)*group_size - 1`` are drawn as one trace named
    ``cat_labels[i]``.

    Args:
        Y: 2-D array indexed as ``Y[rows, col]``; columns 0, 1 and 2 are used
            as the x, y and z coordinates.
        cat_labels: iterable of category names, one per group of rows.
        filename: output path handed to ``plotly.offline.plot``.
        words: hover text, one entry per row of ``Y``. When omitted, the
            notebook-level ``my_words`` list is used (the original behaviour),
            so that list must exist before the function is called.
        group_size: number of consecutive rows per category (default 5).
    """
    if words is None:
        # Backward-compatible fallback to the notebook global.
        words = my_words

    traces = []
    for i, label in enumerate(cat_labels):
        rows = slice(i * group_size, (i + 1) * group_size)
        # Grey level of the marker outline; clamped so it stays a valid
        # 0-255 colour channel when there are many categories.
        shade = str(min(i * 40, 255))
        traces.append(go.Scatter3d(
            x=Y[rows, 0],
            y=Y[rows, 1],
            z=Y[rows, 2],
            mode='markers',
            marker=dict(
                size=8,
                line=dict(
                    color='rgba(' + shade + ',' + shade + ',' + shade + ', 0.14)',
                    width=0.5
                ),
                opacity=0.8
            ),
            text=words[rows],
            name=label
        ))

    # Zero margins so the plot fills the whole frame.
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0)
    )

    plotly.offline.plot({
        "data": traces,
        "layout": layout,
    }, filename=filename)

In [0]:
def plotly_2d(Y, cat_labels, filename="temp-plot.html", words=None, group_size=5):
    """Write an interactive 2-D scatter plot of grouped points to an HTML file.

    Rows of ``Y`` are taken to be ordered by category: rows
    ``i*group_size .. (i+1)*group_size - 1`` are drawn as one trace named
    ``cat_labels[i]``.

    Args:
        Y: 2-D array indexed as ``Y[rows, col]``; columns 0 and 1 are used as
            the x and y coordinates.
        cat_labels: iterable of category names, one per group of rows.
        filename: output path handed to ``plotly.offline.plot``.
        words: hover text, one entry per row of ``Y``. When omitted, the
            notebook-level ``my_words`` list is used (the original behaviour),
            so that list must exist before the function is called.
        group_size: number of consecutive rows per category (default 5).
    """
    if words is None:
        # Backward-compatible fallback to the notebook global.
        words = my_words

    traces = []
    for i, label in enumerate(cat_labels):
        rows = slice(i * group_size, (i + 1) * group_size)
        # Grey level of the marker outline; clamped so it stays a valid
        # 0-255 colour channel when there are many categories.
        shade = str(min(i * 40, 255))
        traces.append(go.Scatter(
            x=Y[rows, 0],
            y=Y[rows, 1],
            mode='markers',
            marker=dict(
                size=8,
                line=dict(
                    color='rgba(' + shade + ',' + shade + ',' + shade + ', 0.14)',
                    width=0.5
                ),
                opacity=0.8
            ),
            text=words[rows],
            name=label
        ))

    # Zero margins so the plot fills the whole frame.
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0)
    )

    plotly.offline.plot({
        "data": traces,
        "layout": layout
    }, filename=filename)

In [0]:
def get_components(data, categories, word_indices, num_components=30,
                   words_per_category=5, num_selected=3):
    """Return the principal components that best separate the word categories.

    PCA is fitted on ``data.T``, so every row of ``pca.components_`` holds one
    value per row of ``data``. The first
    ``len(categories) * words_per_category`` rows of ``data`` are treated as
    the labelled words, stored contiguously by category in the order of
    ``categories``.

    Each component gets a "clusterness" score: the summed absolute distance
    between category centroids divided by the summed absolute distance of the
    words to their own category centroid. The ``num_selected`` components with
    the highest score are returned.

    Args:
        data: 2-D array with one row per word.
        categories: sequence of category names.
        word_indices: unused; kept so existing callers keep working.
        num_components: number of principal components to fit (default 30).
        words_per_category: consecutive rows per category (default 5).
        num_selected: how many components to return (default 3).

    Returns:
        Array of shape ``(num_selected, data.shape[0])`` with the selected
        rows of ``pca.components_`` (in ``np.argpartition`` order, which is
        not sorted by score).

    Raises:
        ValueError: if ``categories`` is empty.

    Note:
        ``decomposition`` (``from sklearn import decomposition``) must be
        imported in the notebook before this function is called.
    """
    if len(categories) == 0:
        raise ValueError("categories must contain at least one category")

    pca = decomposition.PCA(n_components=num_components).fit(data.T)
    all_components = pca.components_
    print(all_components.shape)
    n_fitted = all_components.shape[0]

    centroids = {}
    # Accumulate the within-category spread over ALL categories. Previously
    # this value was overwritten on every pass, so only the last category
    # contributed to the score.
    dist_within_cats = np.zeros(n_fitted)
    for i, category in enumerate(categories):
        members = all_components[:, i * words_per_category:(i + 1) * words_per_category]
        cen = np.mean(members, axis=1)
        dist_within_cats += np.sum(np.abs(np.expand_dims(cen, axis=1) - members), axis=1)
        centroids[category] = cen

    # Sum of absolute centroid differences over all ordered category pairs
    # (a category paired with itself contributes zero).
    dist_btwn_cats = np.zeros(n_fitted)
    for averages1 in centroids.values():
        for averages2 in centroids.values():
            dist_btwn_cats += np.abs(averages1 - averages2)

    # High score = categories far apart and each category tightly grouped.
    clusterness = dist_btwn_cats / dist_within_cats
    comp_indices = np.argpartition(clusterness, -num_selected)[-num_selected:]
    return all_components[comp_indices]

In [0]:
# preparing data
my_words = [
            "maggot", "flea", "tarantula", "bedbug", "mosquito", 
            "violin", "cello", "flute", "harp", "mandolin",
            "joy", "love", "peace", "pleasure", "wonderful",
            "agony", "terrible", "horrible", "nasty", "failure", 
            "physics", "chemistry", "science", "technology", "engineering",
            "poetry", "art", "literature", "dance", "symphony",
           ]

In [0]:
categories = [
              "bugs", "music", 
              "pleasant", "unpleasant", 
              "science", "arts"
             ]

In [0]:
# Look up each hand-picked word in `wordidx`, keeping the order of `my_words`.
my_word_indices = np.array(list(map(wordidx.__getitem__, my_words)))

In [28]:
vecs[my_word_indices].shape

(30, 100)

In [29]:
# Hand-picked word vectors go first, followed by the first 10,000 rows of `vecs`,
# so rows 0..len(my_words)-1 of `embeddings` line up with `my_words`.
embeddings = np.vstack((vecs[my_word_indices], vecs[:10000]))
embeddings.shape

(10030, 100)

### 5.1)PCA 

In [0]:
from collections import defaultdict
from sklearn import decomposition

In [31]:
components = get_components(embeddings, categories, my_word_indices)
plotly_3d(components.T[:len(my_words),:], categories, "pca.html")

(30, 10030)


In [32]:
IFrame('pca.html', width=600, height=400)

### 5.2)- Random Projections

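- **Johnson–Lindenstrauss lemma:**

A small set of points in a high-dimensional space can be embedded into a space of much lower dimension in such a way that distances between the points are nearly preserved (the proof uses random projections). More precisely, for any $0 < \varepsilon < 1$ and any $n$ points in $\mathbb{R}^d$, there is a linear map $f: \mathbb{R}^d \to \mathbb{R}^k$ with $k = O(\log n / \varepsilon^2)$ such that $(1-\varepsilon)\,\|u-v\|^2 \le \|f(u)-f(v)\|^2 \le (1+\varepsilon)\,\|u-v\|^2$ for every pair of points $u, v$.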

It is useful to be able to reduce dimensionality of data in a way that preserves distances. The Johnson–Lindenstrauss lemma is a classic result of this type.

In [33]:
embeddings.shape

(10030, 100)

In [34]:
# Fixed seed so the projection, and every plot derived from it, is reproducible.
rng = np.random.RandomState(42)
proj_dim = 40
# Gaussian entries with standard deviation 1/sqrt(k): with this scaling the squared
# length of a projected vector equals the original squared length in expectation,
# which is the construction used for the Johnson-Lindenstrauss lemma.
rand_proj = embeddings @ rng.normal(scale=1 / np.sqrt(proj_dim),
                                    size=(embeddings.shape[1], proj_dim))
rand_proj.shape

(10030, 40)

In [35]:
# get_components fits its own PCA on rand_proj, so no separate fit is needed here
# (the earlier 3-component fit was immediately overwritten and never used).
components = get_components(rand_proj, categories, my_word_indices)
# Only the first len(my_words) entries of each component belong to the labelled words.
plotly_3d(components.T[:len(my_words),:], categories, "pca-rand-proj.html")

(30, 10030)


In [36]:
IFrame('pca-rand-proj.html', width=600, height=400)