In [1]:
!pip install wordfreq


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [44]:
import spacy
import spacy_transformers
import numpy as np
from wordfreq import zipf_frequency

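Word vectors are only available in the larger spaCy models, so we load the large English model `en_core_web_lg` (if it is not installed yet, download it first with `python -m spacy download en_core_web_lg`):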

In [51]:
# Load the 'en_core_web_lg' pipeline; `nlp` is used by the cells that follow.
nlp = spacy.load('en_core_web_lg')

# Sample text mixing contractions, punctuation, a dollar amount, an emoticon and emoji.
s = "It's not about the money (only $20.15), it's about sending a message :). 🚀💎🙌"
# Run the loaded pipeline on the sample text.
doc = nlp(s)


In [52]:
# Take the token at index 1 of the processed sample text and display it.
token = doc[1]
token

's

In [53]:
def get_vocab_id(word: str):
    """Look `word` up in the pipeline's string store and return the stored value.

    The result is whatever `nlp.vocab.strings[word]` yields (presumably
    spaCy's integer id for the string; confirm against the spaCy docs).
    """
    string_store = nlp.vocab.strings
    return string_store[word]

def get_vector(word: str):
    """Return the entry of the vocabulary's vector table for `word`.

    The table is indexed by the id obtained from `get_vocab_id(word)`.
    """
    vocab_id = get_vocab_id(word)
    vector_table = nlp.vocab.vectors
    return vector_table[vocab_id]

def get_token(word: str):
    """Run the pipeline on `word` and return the first token of the result."""
    parsed = nlp(word)
    first_token = parsed[0]
    return first_token
    

## Finding similarities

In [54]:
# Similarity between the first token of "man" and the first token of "king".
get_token("man").similarity(get_token("king"))

0.41661593317985535

In [55]:
# Process three words as one text so that each becomes a token.
tokens = nlp(u'cat lion pet')

# Print the similarity for every ordered pair of tokens,
# including each token paired with itself.
for t1 in tokens:
    for t2 in tokens:
        print(t1.text,t2.text,t1.similarity(t2))

cat cat 1.0
cat lion 0.3854507803916931
cat pet 0.732966423034668
lion cat 0.3854507803916931
lion lion 1.0
lion pet 0.20031583309173584
pet cat 0.732966423034668
pet lion 0.20031583309173584
pet pet 1.0


In [56]:
# Process four words as one text so that each becomes a token.
tokens = nlp(u'castle king student dormitory')

# Print the similarity for every ordered pair of tokens,
# formatted to three decimal places.
for t1 in tokens:
    for t2 in tokens:
        print(f"{t1.text}, {t2.text}, {t1.similarity(t2):.3f}")

castle, castle, 1.000
castle, king, 0.494
castle, student, -0.021
castle, dormitory, 0.370
king, castle, 0.494
king, king, 1.000
king, student, 0.201
king, dormitory, 0.224
student, castle, -0.021
student, king, 0.201
student, student, 1.000
student, dormitory, 0.470
dormitory, castle, 0.370
dormitory, king, 0.224
dormitory, student, 0.470
dormitory, dormitory, 1.000


## Find most similar words to a given vector
Documentation: https://spacy.io/api/vectors#most_similar

In [57]:
def find_most_similar(vec: np.ndarray, include_rare = False, n_candidates: int = 100, top_n: int = 20, min_zipf: float = 3):
    """Return the words whose vectors are most similar to `vec`.

    The `n_candidates` most similar keys of the vocabulary's vector table are
    fetched, mapped back to strings, filtered, and truncated to `top_n` words.

    Args:
        vec: Query vector. It is wrapped into a one-row batch because
            `Vectors.most_similar` is called with a batch of queries.
        include_rare: If True, keep every purely alphabetic candidate.
            If False (default), additionally require a Zipf frequency above
            `min_zipf` in wordfreq's 'small' English word list.
        n_candidates: Number of nearest keys requested before filtering.
        top_n: Maximum number of words returned after filtering.
        min_zipf: Frequency threshold applied when `include_rare` is False.

    Returns:
        A list of at most `top_n` words, in the order in which
        `most_similar` returned their keys.
    """
    nearest = nlp.vocab.vectors.most_similar(np.asarray([vec]), n=n_candidates)
    # First element of the result holds the keys; its first row belongs to
    # our single query.
    candidate_keys = nearest[0][0]

    def is_wanted(word: str) -> bool:
        # Check the whole string instead of re-running the pipeline on it:
        # that avoids one full `nlp` call per candidate and does not let a
        # string such as "e-mail" pass just because its first sub-token is
        # alphabetic.
        if not word.isalpha():
            return False
        if include_rare:
            return True
        return zipf_frequency(word, "en", wordlist='small', minimum=1) > min_zipf

    candidate_words = (nlp.vocab.strings[key] for key in candidate_keys)
    return [word for word in candidate_words if is_wanted(word)][:top_n]



In [58]:
# Words most similar to the stored vector for "doctor".
find_most_similar(get_vector("doctor"))

['doctor',
 'physician',
 'psychiatrist',
 'doctors',
 'dentist',
 'nurse',
 'pharmacist',
 'pediatrician',
 'surgeon',
 'proctor',
 'dermatologist',
 'veterinarian',
 'midwife',
 'psychiatrists',
 'therapist',
 'neurologist',
 'clinic',
 'medic',
 'Pediatrician',
 'physicians']

In [59]:
# Vector arithmetic query: "king" - "man" + "girl".
find_most_similar(get_vector("king") - get_vector("man") + get_vector("girl"))

['king',
 'princess',
 'princesses',
 'princes',
 'prince',
 'kings',
 'queen',
 'consort',
 'Mcqueen',
 'mcqueen',
 'monarch',
 'ruler',
 'rulers',
 'Princesses',
 'thrones',
 'kingdom',
 'monarchs',
 'throne',
 'kingdoms',
 'royal']

In [60]:
# Vector arithmetic query: "doctor" - "man" + "woman".
find_most_similar(get_vector("doctor") - get_vector("man") + get_vector("woman"))

['doctor',
 'pediatrician',
 'nurse',
 'midwife',
 'physician',
 'Pediatrician',
 'dermatologist',
 'dentist',
 'therapist',
 'Dermatologist',
 'clinic',
 'pharmacist',
 'doctors',
 'Midwife',
 'pediatrics',
 'psychiatrist',
 'veterinarian',
 'pediatric',
 'neurologist',
 'Physician']

In [61]:
# Vector arithmetic query: "castle" - "royalty" + "student".
find_most_similar(get_vector("castle") - get_vector("royalty") + get_vector("student"))

['student',
 'campus',
 'school',
 'dormitory',
 'university',
 'castle',
 'students',
 'classmate',
 'pupil',
 'teacher',
 'classroom',
 'headmaster',
 'undergraduate',
 'college',
 'Dormitory',
 'undergraduates',
 'gymnasium',
 'schoolboy',
 'pupils',
 'highschool']

In [62]:
# Vector arithmetic query: "Berlin" - "Germany" + "Japan"; [0] keeps only the first returned word.
find_most_similar(get_vector("Berlin") - get_vector("Germany") + get_vector("Japan"))[0]

'Tokyo'

In [70]:
# Vector arithmetic query: "bigger" - "big" + "cold".
find_most_similar(get_vector("bigger") - get_vector("big") + get_vector("cold"))

['cold',
 'colder',
 'drier',
 'warmer',
 'cooler',
 'milder',
 'freezing',
 'temperatures',
 'dry',
 'frosty',
 'frost',
 'temperature',
 'temperate',
 'damp',
 'chilly',
 'humid',
 'warms',
 'warmed',
 'heat',
 'Colder']

In [64]:
# Vector arithmetic query: "sushi" - "Japan" + "Belgium"; [0] keeps only the first returned word.
find_most_similar(get_vector("sushi") - get_vector("Japan") + get_vector("Belgium"))[0]

'waffles'

## Further reading
- Library for embedding exploration: https://github.com/koaning/whatlies/