## Word Vectors

In [13]:
# Word vectors take up a lot of space, so the small en_core_web_sm model does not include them.
# To get word vectors, install the medium (en_core_web_md) or large (en_core_web_lg) English model.
import spacy

In [6]:
# make sure you have run "python -m spacy download en_core_web_lg" to install large english model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
# Load the large English model installed above; it provides the word vectors used below.
nlp = spacy.load("en_core_web_lg")

In [15]:
doc = nlp("dog cat banana kem")

# For every token, report whether it has a vector and whether it is OOV (out of vocabulary).
for token in doc:
    print(f"{token.text} Vector: {token.has_vector} OOV: {token.is_oov}")

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
kem Vector: True OOV: False


In [16]:
# Dimensionality of the word vector of the first token ("dog").
doc[0].vector.shape

(300,)

In [17]:
# NOTE: despite its name, base_token holds the result of running the whole text "bread"
# through nlp (presumably a spaCy Doc rather than a single Token — confirm). Its .text and
# .vector are what the comparison cell below uses as the baseline.
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [20]:
doc = nlp("bread sandwich burger car tiger human wheat")

# Score each word in the list against the "bread" baseline created in the previous cell.
for token in doc:
    print(token.text, "<->", f"{base_token.text}:", token.similarity(base_token))

bread <-> bread: 1.0
sandwich <-> bread: 0.6874560117721558
burger <-> bread: 0.544037401676178
car <-> bread: 0.16441147029399872
tiger <-> bread: 0.14492356777191162
human <-> bread: 0.21103660762310028
wheat <-> bread: 0.6572456359863281


#### Compare similarity function

In [21]:
def print_similarity(base_word, words_to_compare):
    """Print how similar each token of ``words_to_compare`` is to ``base_word``.

    Both arguments are processed with the module-level ``nlp`` pipeline.

    Args:
        base_word: Text used as the reference for every comparison.
        words_to_compare: Text (e.g. several space-separated words); one line is
            printed for each token produced from it.

    Returns:
        None. Each result is written to stdout as ``<token> <-> <base>:  <score>``.
    """
    reference = nlp(base_word)
    for candidate in nlp(words_to_compare):
        score = candidate.similarity(reference)
        print(f"{candidate.text} <-> {reference.text}: ", score)

In [22]:
# Compare five words, including "iphone" itself, against "iphone".
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone:  0.6339781284332275
samsung <-> iphone:  0.6678677797317505
iphone <-> iphone:  1.0
dog <-> iphone:  0.1743103712797165
kitten <-> iphone:  0.1468581259250641


In [25]:
# Look up the vocabulary vectors for the four words of the analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# king - man + woman: the combined vector that is compared with "queen" in the next cell.
result = king - man + woman

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is wrapped in a list so that it is passed as a single-row 2-D input.
cosine_similarity([result], [queen])

array([[0.78808445]], dtype=float32)

#### Cosine similarity

In [27]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

Similar case: vectors that point in the same (or nearly the same) direction

In [29]:
# [6, 2] is [3, 1] scaled by 2: same direction, so the similarity is 1.
cosine_similarity([[3,1]],[[6,2]])

array([[1.]])

In [31]:
# Slightly different direction, so the similarity is just below 1.
cosine_similarity([[3,1]],[[3,2]])

array([[0.96476382]])

Non-similar case: perpendicular vectors

In [30]:
# The vectors share no non-zero component (their dot product is 0), so the similarity is 0.
cosine_similarity([[3,0]],[[0,8]])

array([[0.]])

Real Documents

In [33]:
# Four short passages about phones. The term-count table built in the following cell
# records how often "iphone" and "galaxy" occur in each passage (ignoring case).
doc1 = """
iphone sales contributed to 70% of revenue. iphone demand is increasing by 20% yoy.
the main competitor phone galaxy recorded 5% less growth compared to iphone
"""

doc2 = """
The upside pressure on volumes for the iPhone 12 series, historical outperformance
in the July-September time period heading into launch event, and further catalysts in relation
to outperformance for iPhone 13 volumes relative to lowered investor expectations implies a
very attractive set up for the shares.
"""

doc3 = """
samsung's flagship product galaxy is able to penetrate more into asian markets compared to
iphone. galaxy is redesigned with new look that appeals young demographics. 60% of samsung revenues
are coming from galaxy phone sales
"""

doc4 = """
Samsung Electronics unveils its Galaxy S21 flagship, with modest spec improvements
and a significantly lower price point. Galaxy S21 price is lower by ~20% (much like the iPhone 12A),
which highlights Samsung's focus on boosting shipments and regaining market share.
"""

In [34]:
import pandas as pd

# Term counts per document: one column per term, one row per document.
df = pd.DataFrame(
    {
        "iPhone": [3, 2, 1, 1],
        "galaxy": [1, 0, 3, 2],
    },
    index=["doc1", "doc2", "doc3", "doc4"],
)

In [35]:
# Show the full term-count table (rows: documents, columns: terms).
df

Unnamed: 0,iPhone,galaxy
doc1,3,1
doc2,2,0
doc3,1,3
doc4,1,2


In [37]:
# First row, selected with a label slice so the result stays a one-row DataFrame.
df.loc["doc1":"doc1"]

Unnamed: 0,iPhone,galaxy
doc1,3,1


In [38]:
# doc1 vs doc2: both mention iPhone more often than galaxy.
cosine_similarity(df.loc["doc1":"doc1"],df.loc["doc2":"doc2"])

array([[0.9486833]])

In [39]:
# doc1 (mostly iPhone) vs doc3 (mostly galaxy).
cosine_similarity(df.loc["doc1":"doc1"],df.loc["doc3":"doc3"])

array([[0.6]])

In [40]:
# doc3 vs doc4: both mention galaxy more often than iPhone.
cosine_similarity(df.loc["doc3":"doc3"],df.loc["doc4":"doc4"])

array([[0.98994949]])

In [41]:
# doc1 (mostly iPhone) vs doc4 (mostly galaxy).
cosine_similarity(df.loc["doc1":"doc1"],df.loc["doc4":"doc4"])

array([[0.70710678]])