# Word arithmetic
In this practice notebook, we'll explore how you can calculate with word embeddings.

If needed (e.g. on Google Colab), first install the necessary packages and download the `en_core_web_lg` model by uncommenting and running the following two cells:

In [None]:
# !pip install wordfreq spacy-transformers

In [None]:
# !python -m spacy download en_core_web_lg

In [2]:
import spacy
import numpy as np
from wordfreq import zipf_frequency

Word vectors are only available in the larger spaCy models, so let's load one of those (`en_core_web_lg`) first:

In [3]:
# Load the large English pipeline; every later cell goes through this `nlp` object.
nlp = spacy.load('en_core_web_lg')

# Sample sentence mixing contractions, punctuation, a price, an emoticon and emoji.
s = "It's not about the money (only $20.15), it's about sending a message :). 🚀💎🙌"
# Calling the pipeline on a string returns the parsed document indexed in the next cells.
doc = nlp(s)


In [4]:
# Indexing the document yields a single token; index 5 is displayed below.
token = doc[5]
token

money

In [5]:
# A token exposes its embedding as a NumPy array through `.vector`.
# NOTE(review): `token` is rebound to index 1 here, not the token shown in the
# previous cell — confirm that this is the intended example.
token = doc[1]
token.vector

array([-6.8580e-02,  4.6470e-01,  1.3214e-01,  1.8599e-01, -3.7015e-02,
        3.2988e-01,  1.7865e-01, -2.5977e-01, -2.6022e-01,  2.5728e+00,
       -2.5867e-01, -6.6095e-01,  8.1984e-02,  1.0321e-02, -1.2223e-01,
        9.4609e-03, -8.8657e-02,  5.8367e-01, -1.7465e-02, -3.5569e-01,
       -1.0182e-01,  6.1941e-02, -1.4267e-01, -4.0544e-01,  2.9834e-01,
        1.0003e-01,  3.5899e-02,  2.2920e-01,  3.0278e-01, -1.8259e-01,
       -1.1042e-03,  2.5792e-01, -5.4132e-02,  1.5748e-01,  6.1311e-02,
       -3.0055e-01,  3.3732e-01,  4.0023e-01,  4.2472e-02, -3.0014e-01,
        6.2963e-02,  7.2134e-02,  6.0897e-02, -6.2527e-02,  2.7505e-01,
       -1.3527e-01, -2.1710e-01,  1.9315e-02,  3.8683e-02, -1.2361e-01,
       -7.7210e-02, -1.1320e-01, -9.3050e-02,  3.5217e-01,  1.9300e-01,
        4.8418e-02, -2.0489e-01,  9.6088e-02,  7.7817e-02, -3.7924e-01,
        1.1290e-01, -1.8285e-01, -5.6815e-02,  3.7091e-01,  3.2133e-01,
       -1.6343e-01, -3.0290e-01,  2.0258e-01, -1.9113e-01, -4.18

In [6]:
# Total number of components in this token's vector.
token.vector.size

300

In [7]:
def get_vocab_id(word: str):
    """Look `word` up in the loaded pipeline's string store and return the stored ID."""
    string_store = nlp.vocab.strings
    return string_store[word]

def get_vector(word: str):
    """Return the vector stored for `word` in the loaded pipeline's vector table.

    Args:
        word: Exact (case-sensitive) string to look up.

    Returns:
        The row of `nlp.vocab.vectors` associated with `word`.

    Raises:
        KeyError: If the vector table has no entry for `word`.
    """
    try:
        return nlp.vocab.vectors[get_vocab_id(word)]
    except KeyError:
        # The table is keyed by ID, so the underlying error only reports a
        # number; re-raise with the word itself so the failure is actionable.
        raise KeyError(
            f"No vector stored for {word!r} in the loaded vocabulary"
        ) from None

def get_token(word: str):
    """Run `word` through the pipeline and return the first token of the result."""
    parsed = nlp(word)
    first_token = parsed[0]
    return first_token
    

## Finding similarities

In [8]:
# Compare two single words through the similarity method of their tokens.
get_token("woman").similarity(get_token("queen"))

0.4066064953804016

In [9]:
# Print the similarity of every ordered pair of tokens, including each token
# paired with itself.
tokens = nlp(u'cat lion pet')

for t1 in tokens:
    for t2 in tokens:
        print(t1.text,t2.text,t1.similarity(t2))

cat cat 1.0
cat lion 0.5265437364578247
cat pet 0.7505456209182739
lion cat 0.5265437364578247
lion lion 1.0
lion pet 0.39923766255378723
pet cat 0.7505456209182739
pet lion 0.39923766255378723
pet pet 1.0


In [10]:
# Same pairwise comparison on a second word list, with scores formatted to
# three decimals for easier reading.
tokens = nlp(u'castle king student error')

for t1 in tokens:
    for t2 in tokens:
        print(f"{t1.text}, {t2.text}, {t1.similarity(t2):.3f}")

castle, castle, 1.000
castle, king, 0.437
castle, student, 0.087
castle, error, 0.027
king, castle, 0.437
king, king, 1.000
king, student, 0.119
king, error, 0.110
student, castle, 0.087
student, king, 0.119
student, student, 1.000
student, error, 0.162
error, castle, 0.027
error, king, 0.110
error, student, 0.162
error, error, 1.000


## Find most similar words to a given vector
Documentation for `Vectors.most_similar`: https://spacy.io/api/vectors#most_similar

In [13]:
def find_most_similar(vec: np.ndarray, include_rare = False,
                      n_candidates: int = 100, top_k: int = 20, min_zipf: float = 3):
    """Return the words whose stored vectors are closest to `vec`.

    The vector table is queried for `n_candidates` nearest keys. Each key is
    mapped back to its string, filtered, lower-cased and de-duplicated; the
    first `top_k` survivors are returned in the order the table ranked them.

    Args:
        vec: Query vector (e.g. the result of adding/subtracting word vectors).
        include_rare: If False (default), drop words whose Zipf frequency in
            wordfreq's small English word list is not above `min_zipf`.
        n_candidates: How many nearest keys to fetch before filtering.
        top_k: Maximum number of words to return.
        min_zipf: Frequency threshold used when `include_rare` is False.

    Returns:
        A list of at most `top_k` lower-cased, purely alphabetic words.
    """
    query = np.asarray([vec])  # most_similar expects a batch of query vectors
    nearest = nlp.vocab.vectors.most_similar(query, n=n_candidates)
    candidate_keys = nearest[0][0]  # keys for the single query in the batch

    results = []
    seen = set()
    for key in candidate_keys:
        word = nlp.vocab.strings[key]
        # Check the whole string via its lexeme: this avoids running the full
        # pipeline once per candidate and does not judge a word by only the
        # first piece the tokenizer would split it into.
        if not nlp.vocab[word].is_alpha:
            continue
        # Frequency lookup happens only for words that passed the alpha check.
        if not include_rare and not (zipf_frequency(word, "en", wordlist='small', minimum=1) > min_zipf):
            continue
        lowered = word.lower()
        # Keys are case-sensitive, so different casings of one word can both
        # be candidates; keep only the best-ranked occurrence.
        if lowered in seen:
            continue
        seen.add(lowered)
        results.append(lowered)
        if len(results) >= top_k:
            break
    return results[:top_k]



In [14]:
# Baseline: nearest neighbours of a single, unmodified word vector.
find_most_similar(get_vector("doctor"))

['doctor',
 'physician',
 'doctors',
 'pharmacist',
 'surgeon',
 'medical',
 'nurse',
 'medicine',
 'medication',
 'patient',
 'pediatrician',
 'psychiatrist',
 'clinic',
 'dentist',
 'medications',
 'meds',
 'hospital',
 'dermatologist',
 'neurologist',
 'surgery']

In [15]:
# Word arithmetic: subtract the "man" vector from "king", add "girl", and list
# the words closest to the resulting vector.
find_most_similar(get_vector("king") - get_vector("man") + get_vector("girl"))

['queen',
 'king',
 'princess',
 'prince',
 'kings',
 'girl',
 'queens',
 'royal',
 'princesses',
 'throne',
 'kingdom',
 'princes',
 'girls',
 'duke',
 'empress',
 'barbie',
 'angel',
 'fairy',
 'sister',
 'daughter']

In [16]:
# Same arithmetic applied to an occupation: doctor - man + woman. Compare the
# result with the plain "doctor" neighbours above.
find_most_similar(get_vector("doctor") - get_vector("man") + get_vector("woman"))

['doctor',
 'nurse',
 'doctors',
 'physician',
 'pregnant',
 'woman',
 'pharmacist',
 'midwife',
 'medical',
 'pediatrician',
 'patient',
 'pregnancy',
 'surgeon',
 'clinic',
 'medication',
 'medicine',
 'hospital',
 'nurses',
 'psychiatrist',
 'therapist']

In [17]:
# castle - royalty + student: which words are closest to the resulting vector?
find_most_similar(get_vector("castle") - get_vector("royalty") + get_vector("student"))

['student',
 'school',
 'castle',
 'university',
 'students',
 'campus',
 'college',
 'graduate',
 'teacher',
 'undergraduate',
 'classroom',
 'professor',
 'tutor',
 'faculty',
 'elementary',
 'dormitory',
 'schools',
 'teaching',
 'classmate',
 'academic']

In [18]:
# Capital-city analogy: Berlin - Germany + Japan; keep only the top-ranked word.
find_most_similar(get_vector("Berlin") - get_vector("Germany") + get_vector("Japan"))[0]

'tokyo'

In [None]:
# Exercise: a grammatical analogy — bigger - big + cold.
find_most_similar(get_vector("bigger") - get_vector("big") + get_vector("cold"))

In [None]:
# Exercise: sushi - Japan + France.
find_most_similar(get_vector("sushi") - get_vector("Japan") + get_vector("France"))

In [None]:
# Exercise: Merkel - Germany + Canada.
find_most_similar(get_vector("Merkel") - get_vector("Germany") + get_vector("Canada"))

## Further reading
- Library for embedding exploration: https://github.com/koaning/whatlies/