# Word arithmetic
In this practice notebook, we'll explore how to do arithmetic with word embeddings: adding and subtracting word vectors and looking up the words closest to the result.

If needed (e.g. on Google Colab), first install the necessary packages and download the correct model by uncommenting the following two cells:

In [None]:
# %pip install wordfreq spacy-transformers

In [None]:
# !python -m spacy download en_core_web_lg

In [None]:
import spacy
import spacy_transformers
import numpy as np
from wordfreq import zipf_frequency

Word vectors are only available in spaCy's larger models, so let's load one of those (`en_core_web_lg`) first:

In [None]:
# Load the large English pipeline; the `nlp` object is used by the cells below.
nlp = spacy.load('en_core_web_lg')

# Sample sentence containing contractions, punctuation, a currency amount,
# an emoticon and emoji; `doc` is the result of running the pipeline on it.
s = "It's not about the money (only $20.15), it's about sending a message :). 🚀💎🙌"
doc = nlp(s)


In [None]:
# Select the token at index 5 of the parsed sentence and display it.
token = doc[5]
token

In [None]:
# Rebind `token` to the token at index 1 and display its `vector` attribute.
token = doc[1]
token.vector

In [None]:
# Element count of the selected token's vector (presumably a 1-D NumPy array,
# in which case this is the embedding dimensionality - verify).
token.vector.size

In [None]:
def get_vocab_id(word: str):
    """Look up ``word`` in the loaded pipeline's string store and return the result."""
    string_store = nlp.vocab.strings
    return string_store[word]

def get_vector(word: str):
    """Return the entry of ``nlp.vocab.vectors`` stored under the vocab id of ``word``."""
    vector_table = nlp.vocab.vectors
    vocab_id = get_vocab_id(word)
    return vector_table[vocab_id]

def get_token(word: str):
    """Run ``word`` through the pipeline and return the first token of the result."""
    parsed = nlp(word)
    return parsed[0]
    

## Finding similarities

In [None]:
# Similarity score between the tokens obtained for "woman" and "queen".
get_token("woman").similarity(get_token("queen"))

In [None]:
tokens = nlp("cat lion pet")

# Print every ordered pair of tokens with their similarity, one
# "first, second, score" line per pair. The score is rounded to three
# decimals so the output lines up with the other pairwise comparison
# in this notebook.
for t1 in tokens:
    for t2 in tokens:
        print(f"{t1.text}, {t2.text}, {t1.similarity(t2):.3f}")

In [None]:
tokens = nlp('castle king student error')

# Compare each token with every token (itself included) and print
# "first, second, score" with the score rounded to three decimals.
for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(f"{left.text}, {right.text}, {score:.3f}")

## Find most similar words to a given vector
Documentation: https://spacy.io/api/vectors#most_similar

In [None]:
def find_most_similar(
    vec: np.ndarray,
    include_rare: bool = False,
    n_candidates: int = 100,
    top_k: int = 20,
    min_zipf: float = 3,
) -> list:
    """Return vocabulary words whose vectors are most similar to ``vec``.

    The ``n_candidates`` most similar entries of ``nlp.vocab.vectors`` are
    fetched, strings that are not purely alphabetic are dropped and, unless
    ``include_rare`` is true, so are words whose English Zipf frequency
    (wordfreq "small" wordlist) is not above ``min_zipf``.

    Args:
        vec: Query vector. It is wrapped into a one-row batch before the
            lookup, so it is presumably 1-D with the same width as the
            model's vectors.
        include_rare: If true, skip the frequency filter and keep rare words.
        n_candidates: Number of nearest entries requested before filtering.
        top_k: Maximum number of words returned after filtering.
        min_zipf: Zipf-frequency threshold a word must exceed to be kept
            when ``include_rare`` is false.

    Returns:
        A list of at most ``top_k`` words, in the order in which
        ``Vectors.most_similar`` returned them.
    """
    # `most_similar` takes a batch of queries; element [0] of its result holds
    # the keys, and row [0] of those belongs to our single query.
    keys_per_query = nlp.vocab.vectors.most_similar(
        np.asarray([vec]), n=n_candidates
    )[0]
    words = [nlp.vocab.strings[key] for key in keys_per_query[0]]

    def _keep(word: str) -> bool:
        # Check the whole string with str.isalpha() rather than running the
        # full pipeline per candidate and inspecting only its first token.
        if not word.isalpha():
            return False
        if include_rare:
            return True
        # Logical short-circuit: the frequency lookup only runs for words
        # that already passed the alphabetic check.
        return zipf_frequency(word, "en", wordlist='small', minimum=1) > min_zipf

    return [word for word in words if _keep(word)][:top_k]



In [None]:
# Words most similar to the stored vector for "doctor" (rare words filtered out by default).
find_most_similar(get_vector("doctor"))

In [None]:
# Word arithmetic: "king" - "man" + "girl", then look up the words closest to the result.
find_most_similar(get_vector("king") - get_vector("man") + get_vector("girl"))

In [None]:
# Word arithmetic: "doctor" - "man" + "woman", then look up the words closest to the result.
find_most_similar(get_vector("doctor") - get_vector("man") + get_vector("woman"))

In [None]:
# Word arithmetic: "castle" - "royalty" + "student", then look up the words closest to the result.
find_most_similar(get_vector("castle") - get_vector("royalty") + get_vector("student"))

In [None]:
# Word arithmetic: "Berlin" - "Germany" + "Japan"; `[0]` keeps only the first word of the result list.
find_most_similar(get_vector("Berlin") - get_vector("Germany") + get_vector("Japan"))[0]

In [None]:
# Word arithmetic: "bigger" - "big" + "cold", then look up the words closest to the result.
find_most_similar(get_vector("bigger") - get_vector("big") + get_vector("cold"))

In [None]:
# Word arithmetic: "sushi" - "Japan" + "France", then look up the words closest to the result.
find_most_similar(get_vector("sushi") - get_vector("Japan") + get_vector("France"))

In [None]:
# Word arithmetic: "Merkel" - "Germany" + "Canada", then look up the words closest to the result.
find_most_similar(get_vector("Merkel") - get_vector("Germany") + get_vector("Canada"))

## Further reading
- Library for embedding exploration: https://github.com/koaning/whatlies/