# Word arithmetic
In this practice notebook, we'll explore how you can calculate with word embeddings.

If needed (e.g. on Google Colab), first install the necessary packages and download the correct model by uncommenting the following two cells:

In [2]:
# %pip install wordfreq spacy-transformers

In [None]:
# !python -m spacy download en_core_web_lg

In [3]:
import spacy
import spacy_transformers
import numpy as np
from wordfreq import zipf_frequency

  from .autonotebook import tqdm as notebook_tqdm


Word vectors are only available in the larger spaCy models, so let's load one of those first:

In [4]:
# Load the en_core_web_lg pipeline (one of the larger models, which ship with vectors).
nlp = spacy.load('en_core_web_lg')

# Sample sentence containing contractions, punctuation, a price, an emoticon and emoji.
s = "It's not about the money (only $20.15), it's about sending a message :). 🚀💎🙌"
# Run the pipeline on the sentence; `doc` is indexed token by token afterwards.
doc = nlp(s)


In [5]:
# Select the token at index 5 of the processed sentence and display it.
token = doc[5]
token

money

In [6]:
# Rebind `token` to the token at index 1 and display its embedding vector.
token = doc[1]
token.vector

array([ 3.3163e+00,  9.7209e+00, -3.1254e+00, -5.1013e+00,  1.2248e+01,
        7.4676e-01, -2.2017e+00,  7.7449e+00,  7.7495e+00,  2.4786e+00,
        2.9003e+00, -8.2717e+00, -1.8624e+00, -2.5267e+00, -2.4342e+00,
        9.2120e+00,  3.2893e+00,  4.1051e+00,  8.4594e+00,  6.1904e+00,
        4.0433e+00, -8.7288e+00, -5.1290e-01, -4.1820e+00,  5.4291e-01,
        4.1663e+00, -4.1425e+00, -3.5223e+00,  5.6047e+00,  6.7570e-01,
       -8.4769e+00,  7.1013e+00,  3.6598e+00, -3.7614e+00, -6.9369e+00,
        2.2243e+00, -1.9601e+00,  7.1614e+00, -5.6895e+00,  2.1034e+00,
       -5.8237e+00,  2.7890e+00, -2.9914e+00, -4.0231e+00, -4.2667e+00,
       -7.5207e-01,  4.5451e-01, -8.6609e+00, -1.7533e+00,  1.0410e+01,
       -2.7430e+00,  3.3978e-01,  8.1811e+00,  4.2430e+00,  6.6655e+00,
       -9.3432e+00,  1.7088e+00, -5.9095e+00,  4.9347e-01,  2.3570e+00,
        3.8203e+00, -5.9410e+00,  1.2467e+00, -1.5427e+00,  4.3948e+00,
       -2.0103e+00, -6.6615e-01, -2.7626e+00,  5.2211e+00, -6.83

In [7]:
# Number of elements in the current token's vector.
token.vector.size

300

In [8]:
def get_vocab_id(word: str):
    """Look up `word` in the loaded pipeline's string store and return the stored value."""
    string_store = nlp.vocab.strings
    return string_store[word]

def get_vector(word: str):
    """Return the vector-table entry stored under the vocabulary ID of `word`."""
    vector_table = nlp.vocab.vectors
    key = get_vocab_id(word)
    return vector_table[key]

def get_token(word: str):
    """Run `word` through the pipeline and return the first token of the result."""
    parsed = nlp(word)
    first_token = parsed[0]
    return first_token
    

## Finding similarities

In [9]:
# Similarity score between the tokens for "woman" and "queen".
get_token("woman").similarity(get_token("queen"))

0.4756779074668884

In [10]:
# Process three words as one text so they can be compared token by token.
tokens = nlp(u'cat lion pet')

# Print the similarity for every ordered pair, including each token with itself.
for t1 in tokens:
    for t2 in tokens:
        print(t1.text,t2.text,t1.similarity(t2))

cat cat 1.0
cat lion 0.3854507803916931
cat pet 0.732966423034668
lion cat 0.3854507803916931
lion lion 1.0
lion pet 0.20031583309173584
pet cat 0.732966423034668
pet lion 0.20031583309173584
pet pet 1.0


In [11]:
# Rebind `tokens` to a second set of words to compare.
tokens = nlp(u'castle king student error')

# Same all-pairs comparison, with the score formatted to three decimals.
for t1 in tokens:
    for t2 in tokens:
        print(f"{t1.text}, {t2.text}, {t1.similarity(t2):.3f}")

castle, castle, 1.000
castle, king, 0.494
castle, student, -0.021
castle, error, 0.015
king, castle, 0.494
king, king, 1.000
king, student, 0.201
king, error, 0.155
student, castle, -0.021
student, king, 0.201
student, student, 1.000
student, error, 0.109
error, castle, 0.015
error, king, 0.155
error, student, 0.109
error, error, 1.000


## Find most similar words to a given vector
Documentation: <https://spacy.io/api/vectors#most_similar>

In [12]:
def find_most_similar(vec: np.ndarray, include_rare: bool = False, n_candidates: int = 100, top_k: int = 20, min_zipf: float = 3):
    """Return up to `top_k` alphabetic words whose vectors are closest to `vec`.

    The `n_candidates` nearest entries of the vector table are fetched first and
    filtered afterwards, so fewer than `top_k` words may be returned.

    Args:
        vec: Query vector; it is wrapped into a one-row batch for the lookup.
        include_rare: If true, the word-frequency filter is skipped.
        n_candidates: How many nearest neighbours to request before filtering.
        top_k: Maximum number of words to return.
        min_zipf: A word is kept only if its Zipf frequency in the small
            English word list is strictly greater than this value
            (ignored when `include_rare` is true).

    Returns:
        A list of words, in the order the neighbour lookup returned them.
    """
    # most_similar takes a batch of queries; element [0] of its result holds the
    # keys, and row [0] of those keys belongs to our single query.
    neighbour_keys = nlp.vocab.vectors.most_similar(np.asarray([vec]), n=n_candidates)[0][0]
    words = [nlp.vocab.strings[key] for key in neighbour_keys]

    def is_frequent(word: str) -> bool:
        return zipf_frequency(word, "en", wordlist='small', minimum=1) > min_zipf

    # Logical `and` (not bitwise `&`) short-circuits; the cheap frequency lookup
    # runs first so the pipeline call in get_token is skipped for rare words.
    return [
        w for w in words
        if (include_rare or is_frequent(w)) and get_token(w).is_alpha
    ][:top_k]



In [13]:
# Neighbours of the unmodified "doctor" vector (no arithmetic applied).
find_most_similar(get_vector("doctor"))

['doctor',
 'physician',
 'psychiatrist',
 'doctors',
 'dentist',
 'nurse',
 'pharmacist',
 'pediatrician',
 'surgeon',
 'proctor',
 'dermatologist',
 'veterinarian',
 'midwife',
 'psychiatrists',
 'therapist',
 'neurologist',
 'clinic',
 'medic',
 'Pediatrician',
 'physicians']

In [14]:
# Vector arithmetic: king - man + girl, then look up the nearest words.
find_most_similar(get_vector("king") - get_vector("man") + get_vector("girl"))

['king',
 'princess',
 'princesses',
 'princes',
 'prince',
 'kings',
 'queen',
 'consort',
 'Mcqueen',
 'mcqueen',
 'monarch',
 'ruler',
 'rulers',
 'Princesses',
 'thrones',
 'kingdom',
 'monarchs',
 'throne',
 'kingdoms',
 'royal']

In [15]:
# doctor - man + woman: the same arithmetic applied to a profession.
find_most_similar(get_vector("doctor") - get_vector("man") + get_vector("woman"))

['doctor',
 'pediatrician',
 'nurse',
 'midwife',
 'physician',
 'Pediatrician',
 'dermatologist',
 'dentist',
 'therapist',
 'Dermatologist',
 'clinic',
 'pharmacist',
 'doctors',
 'Midwife',
 'pediatrics',
 'psychiatrist',
 'veterinarian',
 'pediatric',
 'neurologist',
 'Physician']

In [16]:
# castle - royalty + student.
find_most_similar(get_vector("castle") - get_vector("royalty") + get_vector("student"))

['student',
 'campus',
 'school',
 'dormitory',
 'university',
 'castle',
 'students',
 'classmate',
 'pupil',
 'teacher',
 'classroom',
 'headmaster',
 'undergraduate',
 'college',
 'Dormitory',
 'undergraduates',
 'gymnasium',
 'schoolboy',
 'pupils',
 'highschool']

In [17]:
# Berlin - Germany + Japan; [0] keeps only the first word of the returned list.
find_most_similar(get_vector("Berlin") - get_vector("Germany") + get_vector("Japan"))[0]

'Tokyo'

In [18]:
# bigger - big + cold: the same pattern applied to a comparative form.
find_most_similar(get_vector("bigger") - get_vector("big") + get_vector("cold"))

['cold',
 'colder',
 'drier',
 'warmer',
 'cooler',
 'milder',
 'freezing',
 'temperatures',
 'dry',
 'frosty',
 'frost',
 'temperature',
 'temperate',
 'damp',
 'chilly',
 'humid',
 'warms',
 'warmed',
 'heat',
 'Colder']

In [19]:
# sushi - Japan + France. The result can be shorter than 20 words because
# candidates are filtered only after the nearest neighbours have been fetched.
find_most_similar(get_vector("sushi") - get_vector("Japan") + get_vector("France"))

['Toulouse', 'marseille', 'fries', 'Provence']

In [24]:
# Merkel - Germany + Canada.
find_most_similar(get_vector("Merkel") - get_vector("Germany") + get_vector("Canada"))

['Trudeau',
 'Merkel',
 'merkel',
 'Abbott',
 'Sturgeon',
 'Truss',
 'McConnell',
 'Canada',
 'Turnbull',
 'Thatcher',
 'Ontario',
 'Alberta',
 'NDP',
 'Johnson',
 'Manitoba',
 'Hancock',
 'Lawmaker',
 'McGill',
 'Ottawa',
 'Blair']

## Further reading
- Library for embedding exploration: https://github.com/koaning/whatlies/