# Word Embeddings

In [58]:
import spacy
from gensim.models import KeyedVectors

from scipy import spatial

The following may need to be run inside your terminal if linking fails (e.g. `python -m spacy download en_core_web_md`):

We'll load the word embeddings that we just downloaded

In [59]:
# Load the `en_core_web_md` spaCy pipeline; its vocabulary (`nlp.vocab`)
# supplies the word vectors used by the cells below.
nlp = spacy.load("en_core_web_md")

In [61]:
# Load GloVe vectors into gensim's KeyedVectors.
# NOTE(review): the file name suggests the GloVe 840B/300d vectors were
# converted to word2vec text format beforehand -- confirm the conversion step.
glove_vectors = KeyedVectors.load_word2vec_format('./vectors/glove_840B/word2vec.840B.300d.txt')

## Similar Words With spaCy _(Done By Hand)_

First, let's demonstrate the power of word embeddings by generating similar words:

In [45]:
def top_n_similar_words(target, n=10, reverse=False):
    """Return up to ``n`` distinct vocabulary words ranked by cosine similarity.

    Every lexeme in ``nlp.vocab`` that has a vector is scored against
    ``target`` with cosine similarity (``1 - cosine distance``). Words are
    lower-cased before de-duplication, so "Potato" and "potato" count once.

    Args:
        target: Either a word (``str``), whose vector is looked up in
            ``nlp.vocab``, or a vector to compare against directly.
        n: Maximum number of words to return. If the vocabulary holds fewer
            than ``n`` distinct lower-cased words, all of them are returned.
        reverse: When falsy (default) the most similar words come first;
            when truthy the least similar words come first.

    Returns:
        A list of at most ``n`` lower-cased words in ranked order.
    """
    if isinstance(target, str):
        target = nlp.vocab[target].vector  # the vector of our target word

    # (word text, similarity to target) for every lexeme that has a vector
    scored_words = []
    for lexeme in nlp.vocab:
        # Ignore words without vectors
        if not lexeme.has_vector:
            continue
        similarity = 1 - spatial.distance.cosine(target, lexeme.vector)
        scored_words.append((lexeme.text, similarity))

    # Negated key => descending similarity by default; `reverse` flips it to
    # ascending. The sort is stable, so ties keep vocabulary order.
    scored_words.sort(key=lambda item: -item[1], reverse=bool(reverse))

    results = []
    seen = set()  # O(1) duplicate check on lower-cased text
    for text, _similarity in scored_words:
        # Stop once we have enough; iterating the list (instead of indexing
        # until len == n) also avoids an IndexError when fewer than n
        # distinct words exist or n is negative.
        if len(results) >= n:
            break
        word = text.lower()  # case-insensitive to reduce duplicates
        if word not in seen:
            seen.add(word)
            results.append(word)

    return results

In [22]:
# Nearest vocabulary words to "potato" by cosine similarity (default n=10).
top_n_similar_words("potato")

['kumara',
 'tuber',
 'sweet-potato',
 'chickpea',
 'potato',
 'potatoe',
 'yuca',
 'tuberosum',
 'spud',
 'taro']

## Similar Words with Gensim

In [62]:
# Keep only the word from each (word, score) pair that gensim returns.
glove_words = [
    word
    for word, _score in glove_vectors.most_similar(positive=["potato"], topn=10)
]
glove_words

['potatoes',
 'carrot',
 'tomato',
 'onion',
 'salad',
 'mashed',
 'cabbage',
 'spinach',
 'rice',
 'carrots']

### spaCy Built-In Similarity

In [36]:
# Query spaCy's vector table directly; most_similar expects a 2-D batch of
# query vectors, so the single vector is reshaped to one row.
vector1 = nlp("potato").vector
most_similar = nlp("potato").vocab.vectors.most_similar(vector1.reshape(1, -1))
most_similar

(array([12974958259688887324], dtype=uint64),
 array([3596], dtype=int32),
 array([1.], dtype=float32))

In [33]:
# Resolve the hash key returned by most_similar back to its text.
nlp.vocab[12974958259688887324].text

'sweet-potato'

## Opposite (Least Similar) Word

With a simple reverse of the sorted similarity scores we can come up with the least similar words...

In [46]:
# reverse=True ranks by ascending similarity, i.e. least similar words first.
top_n_similar_words("potato", reverse=True)

['mason-dixon',
 '85-plus',
 'swd',
 'srd',
 'wotc',
 'coys',
 'stainers',
 'cargo-handling',
 'superpremium',
 '21-jewel']

## Simple Arithmetic

Now let's show how we can do simple arithmetic with word embeddings.

A king and a queen are the male and female versions of the same royal position. Let's see whether our word embeddings capture this relationship.

In [26]:
# Cosine similarity (1 - cosine distance) between two vectors.
# NOTE(review): not called in this cell; top_n_similar_words computes
# cosine similarity internally.
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = nlp.vocab["king"].vector
man = nlp.vocab["man"].vector
woman = nlp.vocab["woman"].vector
 
# Analogy arithmetic: take "king", subtract "man", add "woman".
maybe_queen = king - man + woman

# We now need to find the closest vectors in the vocabulary to the result of "king" - "man" + "woman"
top_n_similar_words(maybe_queen)

['king',
 'queen',
 'prince',
 'commoner',
 'highness',
 'sultan',
 'maharajas',
 'kings',
 'princes',
 'sultans']

## Simple Arithmetic with gensim

In [6]:
# Same analogy with gensim: "woman" and "king" contribute positively,
# "man" negatively; returns the top 10 (word, similarity) pairs.
glove_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('queen', 0.775162398815155),
 ('prince', 0.6123066544532776),
 ('princess', 0.6016970872879028),
 ('kings', 0.5996100902557373),
 ('queens', 0.565579891204834),
 ('royal', 0.5646308660507202),
 ('throne', 0.5580971240997314),
 ('Queen', 0.5569202899932861),
 ('monarch', 0.5499411821365356),
 ('empress', 0.5295248627662659)]

## Filtering

Let's say we wanted to get similar words for billiards, a.k.a. pool tables.

In [47]:
top_n_similar_words("pool")

['swimmingpool',
 'swimming-pool',
 'sauna',
 'pool',
 'chlorinators',
 'swim-up',
 'pools',
 'inground',
 'cabanas',
 'hottubs']

We see here that all of our words are coming back as swimming pools. What can we do?

We could:

- manually prune this list
- do string matching
- hire an intern (free labor)

Instead, let's use word embeddings!

In [65]:
# Cosine similarity (1 - cosine distance) between two vectors.
# NOTE(review): defined here but not called in this cell.
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

pool = nlp.vocab["pool"].vector
swimming = nlp.vocab["swimming"].vector
water = nlp.vocab["water"].vector
 
# Subtract both the "swimming" and "water" vectors from "pool" in an attempt
# to steer the result away from the swimming-pool sense.
billiards = (pool - swimming) - water

top_n_similar_words(billiards)

['com-plete',
 'synbiotics',
 'emazing',
 'selectives',
 '451.7',
 '120-million',
 'lenni',
 '518.6',
 '37lbs',
 'pilons']

In [11]:
# gensim version of the filter: "pool" with only "water" as a negative term.
glove_vectors.most_similar(positive=['pool'], negative=['water'], topn=10)

[('Pool', 0.4190593957901001),
 ('Poolside', 0.4075739085674286),
 ('jacuzzi', 0.40432262420654297),
 ('billiards', 0.3948186933994293),
 ('Jacuzzi', 0.38681161403656006),
 ('Penthouse', 0.37995702028274536),
 ('poolside', 0.37497371435165405),
 ('solarium', 0.3684753179550171),
 ('Billiards', 0.36510801315307617),
 ('Clubhouse', 0.36037442088127136)]

In [66]:
# Same query with both "water" and "swimming" as negative terms, mirroring
# the spaCy vector arithmetic above.
glove_vectors.most_similar(positive=['pool'], negative=['water', 'swimming'], topn=10)

[('AfricaLinking', 0.36948105692863464),
 ('TrainingResources', 0.33629927039146423),
 ('DrivenPrice', 0.33019590377807617),
 ('6/Pkg', 0.32633116841316223),
 ('MzAwNTo1OjEw', 0.32195737957954407),
 ('ShowId', 0.32156652212142944),
 ('Footnication', 0.3199766278266907),
 ('VERNIS', 0.3184490501880646),
 ('ThermalPrice', 0.3171261250972748),
 ('TTC-After-Loss', 0.31558042764663696)]