In [4]:
# From a shell run: python -m spacy download en_core_web_lg  (inside a notebook cell, prefix the command with "!")

In [6]:
#!python -m spacy download en_core_web_lg --user

In [5]:
import spacy

# Word vectors take up a lot of space, so the small model (en_core_web_sm) does not include them.
# To get word vectors you need to install the medium or large English model. We will install the large one!
# Make sure you have run "python -m spacy download en_core_web_lg" to install the large English model.
# medium model (en_core_web_md) has: a reduced vector table (exact counts are listed in the spaCy model docs)
# large model has: 514k keys, 685k unique vectors (300 dimensions)
nlp = spacy.load("en_core_web_lg")

### comparing different word vectors

In [11]:
doc = nlp("dog cat banana akakak")

In [18]:
# The large model never saw the word "akakak" during training, so it has no vector (has_vector is False, is_oov is True).
# spaCy uses GloVe embedding vectors here, trained on a popular English dataset.
# Words get their vectors from that training.

# For every token, report whether it has a vector and whether it is out-of-vocabulary.
for token in doc:
    flags = ("Vector:", token.has_vector, "OOV:", token.is_oov)
    print(token, *flags)

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
akakak Vector: False OOV: True


In [19]:
doc[0].vector.shape

(300,)

In [16]:
doc[0].vector[:5] # first five components of the token's 300-dimensional vector

array([-0.40176 ,  0.37057 ,  0.021281, -0.34125 ,  0.049538],
      dtype=float32)

In [21]:
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [23]:
# Comparing "bread" with "sandwich": they are similar and get a high score.
# "car" and "bread" do not appear in similar contexts, but "wheat" and "bread" do.
# Similarity means the words appear in similar contexts, so "profit" and "loss" may be similar even though their meanings are opposite.
# Score each word of the sentence against `base_token` and print one line per word.
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    score = token.similarity(base_token)
    print(f"{token.text}<->{base_token.text}:", score)


bread<->bread: 1.0
sandwich<->bread: 0.6874560014053445
burger<->bread: 0.5440373883702087
car<->bread: 0.1644114584391833
tiger<->bread: 0.1449235625942581
human<->bread: 0.21103660928832707
wheat<->bread: 0.6572456428272563


In [24]:
def print_similarity(base_word, words_to_compare):
    """Print how similar each token of ``words_to_compare`` is to ``base_word``.

    Both arguments are run through the ``nlp`` pipeline defined elsewhere in
    the notebook. One line is printed per token of ``words_to_compare`` in the
    form ``<token text><-><base text>: <similarity score>``.

    Args:
        base_word: Text that every token is compared against.
        words_to_compare: Text whose tokens are scored one by one.

    Returns:
        None. The scores are only printed.
    """
    reference = nlp(base_word)
    reference_text = reference.text
    for candidate in nlp(words_to_compare):
        score = candidate.similarity(reference)
        print(f"{candidate.text}<->{reference_text}:", score)

In [25]:
print_similarity("iphone", "apple samsung iphone dog kitten")
# One might expect apple-iphone to score higher than samsung-iphone, but not necessarily: this model is trained on some Google/Wiki news articles.
# In news articles people compare Samsung and Apple more often than they mention Apple together with iPhone.
# In whatever text this model was trained on, "samsung" and "iphone" appear to have more similarity.
# Two words that appear in the same contexts will have a higher similarity.

apple<->iphone: 0.6339781147910419
samsung<->iphone: 0.6678678014329177
iphone<->iphone: 1.0
dog<->iphone: 0.17431037640553934
kitten<->iphone: 0.14685812907484028


In [27]:
nlp.vocab["king"].vector[:5]

array([ 0.31542, -0.35068,  0.42923, -0.53825, -0.1848 ], dtype=float32)

In [29]:
# Fetch the vocabulary vectors for the four words used in the analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# king - man + woman: take the "man" features out of the king vector, then add the "woman" factor.
result = (king - man) + woman

# Show the first five components; comparing `result` with another vector is done via cosine similarity.
result[:5]

array([ 0.514087  , -0.27846   ,  0.242767  ,  0.04548997, -0.259425  ],
      dtype=float32)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
cosine_similarity([result] , [queen])# not a perfect 1.0, but a score above 0.5 is considered good

array([[0.78808445]], dtype=float32)

### Cosine Similarity

In [16]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [34]:
cosine_similarity([[3,1]], [[6,2]]) # expects 2-dim array

array([[1.]])

In [35]:
cosine_distances([[3,1]], [[6,2]]) # very close to zero

array([[1.11022302e-16]])

In [36]:
cosine_similarity([[3,1]], [[3,2]])

array([[0.96476382]])

In [1]:
# Sample document 1: mentions "iphone" three times and "galaxy" once.
# (A stray double-quote at the end of the last sentence was removed.)
doc1 = """
iphone sales contributed to 70% of revenue. iphone demand is increasing by 20% yoy. 
the main competitor phone galaxy recorded 5% less growth compared to iphone
"""

doc2 = """
The upside pressure on volumes for the iPhone 12 series, historical outperformance 
in the July-September time period heading into launch event, and further catalysts in relation
to outperformance for iPhone 13 volumes relative to lowered investor expectations implies a 
very attractive set up for the shares.
"""

doc3 = """
samsung's flagship product galaxy is able to penetrate more into asian markets compared to
iphone. galaxy is redesigned with new look that appeals young demographics. 60% of samsung revenues
are coming from galaxy phone sales
"""

doc4 = """
Samsung Electronics unveils its Galaxy S21 flagship, with modest spec improvements 
and a significantly lower price point. Galaxy S21 price is lower by ~20% (much like the iPhone 12A), 
which highlights Samsung's focus on boosting shipments and regaining market share.
"""

In [2]:
import pandas as pd

In [14]:
# Keyword counts per document: rows are doc1..doc4, columns are "iphone" and "galaxy".
keyword_counts = {
    "iphone": [3, 2, 1, 1],
    "galaxy": [1, 0, 3, 2],
}
df = pd.DataFrame(keyword_counts, index=["doc1", "doc2", "doc3", "doc4"])

In [15]:
df

Unnamed: 0,iphone,galaxy
doc1,3,1
doc2,2,0
doc3,1,3
doc4,1,2


In [22]:
df.loc["doc1":"doc1"]

Unnamed: 0,iphone,galaxy
doc1,3,1


In [21]:
cosine_similarity(df.loc["doc1":"doc1"], df.loc["doc2":"doc2"])

array([[0.9486833]])

In [23]:
cosine_similarity(df.loc["doc1":"doc1"], df.loc["doc3":"doc3"])#some similarity

array([[0.6]])

$$\text{Cosine Similarity} = \frac {\sum \limits _{i=1} ^{n} A _{i} B _{i}}{\sqrt {\sum \limits _{i=1} ^{n} A _{i} ^{2}}\sqrt {\sum \limits _{i=1} ^{n} B _{i} ^{2}}}$$