**The Flair library includes Word2Vec and BERT models, which are employed for generating embeddings and conducting comparisons between them.**

In [1]:
!pip install flair

Collecting flair
  Downloading flair-0.12.2-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.1/373.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting segtok>=1.5.7 (from flair)
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting mpld3==0.3 (from flair)
  Downloading mpld3-0.3.tar.gz (788 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m788.5/788.5 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sqlitedict>=1.6.0 (from flair)
  Downloading sqlitedict-2.1.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deprecated>=1.2.4 (from flair)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting boto3 (from flair)
  Downloading boto3-1.28.23-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollect

In [2]:
import numpy as np
from flair.embeddings import WordEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence
from scipy.spatial import distance

# Glove Embeddings Example

In [3]:
glove_embedding = WordEmbeddings('glove')

2023-08-09 20:42:47,562 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpcvd6dxrh


100%|██████████| 153M/153M [00:11<00:00, 13.4MB/s]

2023-08-09 20:43:00,048 copying /tmp/tmpcvd6dxrh to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2023-08-09 20:43:01,035 removing temp file /tmp/tmpcvd6dxrh
2023-08-09 20:43:01,954 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmpulrrwdr7


100%|██████████| 20.5M/20.5M [00:01<00:00, 13.5MB/s]

2023-08-09 20:43:04,268 copying /tmp/tmpulrrwdr7 to cache at /root/.flair/embeddings/glove.gensim
2023-08-09 20:43:04,307 removing temp file /tmp/tmpulrrwdr7





## Sentence 1

In [4]:
sentence_1 = Sentence('apple released iphone 12 pro max in 2020')

In [5]:
glove_embedding.embed(sentence_1)

[Sentence[8]: "apple released iphone 12 pro max in 2020"]

In [6]:
for token in sentence_1:
    print(token)
    print(token.embedding)
    print("\n")

Token[0]: "apple"
tensor([-0.5985, -0.4632,  0.1300, -0.0196,  0.4603, -0.3018,  0.8977, -0.6563,
         0.6686, -0.4916,  0.0376, -0.0509,  0.6451, -0.5388, -0.3765, -0.0431,
         0.5138,  0.1778,  0.2860,  0.9206, -0.4935, -0.4858,  0.6132,  0.7821,
         0.1925,  0.9123, -0.0556, -0.1251, -0.6569,  0.0686,  0.5563,  1.6110,
        -0.0074, -0.4888,  0.4549,  0.9610, -0.0634,  0.1743,  0.9814, -1.3125,
        -0.1580, -0.5430, -0.1389, -0.2615, -0.3691,  0.2684, -0.2438, -0.1948,
         0.6258, -0.7377,  0.3835, -0.7500, -0.3905,  0.0915, -0.3659, -1.4715,
        -0.4523,  0.2256,  1.1412, -0.3853, -0.0672,  0.5729, -0.3919,  0.3130,
        -0.2923, -0.9616,  0.1515, -0.2166,  0.2510,  0.0970,  0.2843,  1.4296,
        -0.5056, -0.5137, -0.4722,  0.3204,  0.0231,  0.2262, -0.0972,  0.8213,
         0.9260, -1.0086, -0.3864,  0.8641, -1.2060, -0.2853,  0.2265, -0.3877,
         0.4088,  0.5930,  0.3077,  0.8380, -0.6366, -0.4464, -0.4341, -0.7936,
        -0.2867, -0.03

In [7]:
sentence_1[0]

Token[0]: "apple"

In [8]:
sentence_1[0].embedding

tensor([-0.5985, -0.4632,  0.1300, -0.0196,  0.4603, -0.3018,  0.8977, -0.6563,
         0.6686, -0.4916,  0.0376, -0.0509,  0.6451, -0.5388, -0.3765, -0.0431,
         0.5138,  0.1778,  0.2860,  0.9206, -0.4935, -0.4858,  0.6132,  0.7821,
         0.1925,  0.9123, -0.0556, -0.1251, -0.6569,  0.0686,  0.5563,  1.6110,
        -0.0074, -0.4888,  0.4549,  0.9610, -0.0634,  0.1743,  0.9814, -1.3125,
        -0.1580, -0.5430, -0.1389, -0.2615, -0.3691,  0.2684, -0.2438, -0.1948,
         0.6258, -0.7377,  0.3835, -0.7500, -0.3905,  0.0915, -0.3659, -1.4715,
        -0.4523,  0.2256,  1.1412, -0.3853, -0.0672,  0.5729, -0.3919,  0.3130,
        -0.2923, -0.9616,  0.1515, -0.2166,  0.2510,  0.0970,  0.2843,  1.4296,
        -0.5056, -0.5137, -0.4722,  0.3204,  0.0231,  0.2262, -0.0972,  0.8213,
         0.9260, -1.0086, -0.3864,  0.8641, -1.2060, -0.2853,  0.2265, -0.3877,
         0.4088,  0.5930,  0.3077,  0.8380, -0.6366, -0.4464, -0.4341, -0.7936,
        -0.2867, -0.0344,  1.3431,  0.34

In [9]:
sentence_1[0].embedding.shape

torch.Size([100])

## Sentence 2

In [10]:
sentence_2 = Sentence('an apple a day keeps the doctor away')

In [11]:
glove_embedding.embed(sentence_2)

[Sentence[8]: "an apple a day keeps the doctor away"]

In [12]:
for token in sentence_2:
    print(token)
    print(token.embedding)
    print("\n")

Token[0]: "an"
tensor([-0.4214, -0.1880,  0.4624, -0.1761,  0.3621,  0.3670,  0.2792,  0.1463,
        -0.0542,  0.4583,  0.0654, -0.3372,  0.0675, -0.3632,  0.5030, -0.0104,
         0.7283, -0.1756, -0.3400,  0.0729,  0.6448, -0.2391,  0.3838,  0.1386,
         1.0994, -0.2488, -0.1508, -0.4874, -0.2304,  0.0648, -0.7018,  0.8265,
         0.0613,  0.1853, -0.3016, -0.0222,  0.3430,  0.8033,  0.1714,  0.1546,
        -0.5076,  0.3957,  0.0543, -0.5308,  0.4825,  0.0862,  0.5958, -0.2238,
        -0.3955, -0.7304, -0.1028, -0.3917,  1.2290,  1.2129, -1.0365, -3.4971,
         0.1092, -1.0084,  1.9998,  0.7964,  0.3881,  0.4375,  0.0852,  0.3855,
         0.6199, -1.0320,  0.7012, -0.2246,  0.0794,  0.0913, -0.2120, -0.5543,
        -0.0534, -0.8020,  0.4680, -0.0501, -0.5742, -0.0848, -1.7227, -0.9429,
         0.9867,  0.3121, -0.3774,  0.0687, -0.7784, -0.2849,  0.8105,  0.4660,
        -0.1186, -0.9341,  0.3372,  0.0379, -0.1827, -0.0199,  0.2049, -0.4772,
        -0.4925, -0.5652,

In [13]:
sentence_2[1]

Token[1]: "apple"

In [14]:
sentence_2[1].embedding

tensor([-0.5985, -0.4632,  0.1300, -0.0196,  0.4603, -0.3018,  0.8977, -0.6563,
         0.6686, -0.4916,  0.0376, -0.0509,  0.6451, -0.5388, -0.3765, -0.0431,
         0.5138,  0.1778,  0.2860,  0.9206, -0.4935, -0.4858,  0.6132,  0.7821,
         0.1925,  0.9123, -0.0556, -0.1251, -0.6569,  0.0686,  0.5563,  1.6110,
        -0.0074, -0.4888,  0.4549,  0.9610, -0.0634,  0.1743,  0.9814, -1.3125,
        -0.1580, -0.5430, -0.1389, -0.2615, -0.3691,  0.2684, -0.2438, -0.1948,
         0.6258, -0.7377,  0.3835, -0.7500, -0.3905,  0.0915, -0.3659, -1.4715,
        -0.4523,  0.2256,  1.1412, -0.3853, -0.0672,  0.5729, -0.3919,  0.3130,
        -0.2923, -0.9616,  0.1515, -0.2166,  0.2510,  0.0970,  0.2843,  1.4296,
        -0.5056, -0.5137, -0.4722,  0.3204,  0.0231,  0.2262, -0.0972,  0.8213,
         0.9260, -1.0086, -0.3864,  0.8641, -1.2060, -0.2853,  0.2265, -0.3877,
         0.4088,  0.5930,  0.3077,  0.8380, -0.6366, -0.4464, -0.4341, -0.7936,
        -0.2867, -0.0344,  1.3431,  0.34

In [15]:
sentence_2[1].embedding.shape

torch.Size([100])

## Glove Distance between the same word

In [16]:
glove_dst = distance.euclidean(np.array(sentence_1[0].embedding),
                               np.array(sentence_2[1].embedding))

In [17]:
print("Distance between apple embeddings for Glove = {}".format(glove_dst))

Distance between apple embeddings for Glove = 0.0


# Bert Embeddings

In [18]:
bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [19]:
bert_embedding.embed(sentence_1)
for token in sentence_1:
    print(token)
    print(token.embedding)

Token[0]: "apple"
tensor([-5.9850e-01, -4.6321e-01,  1.3001e-01, -1.9576e-02,  4.6030e-01,
        -3.0180e-01,  8.9770e-01, -6.5634e-01,  6.6858e-01, -4.9164e-01,
         3.7557e-02, -5.0889e-02,  6.4510e-01, -5.3882e-01, -3.7650e-01,
        -4.3120e-02,  5.1384e-01,  1.7783e-01,  2.8596e-01,  9.2063e-01,
        -4.9349e-01, -4.8583e-01,  6.1321e-01,  7.8211e-01,  1.9254e-01,
         9.1228e-01, -5.5596e-02, -1.2512e-01, -6.5688e-01,  6.8557e-02,
         5.5629e-01,  1.6110e+00, -7.3642e-03, -4.8879e-01,  4.5493e-01,
         9.6105e-01, -6.3369e-02,  1.7432e-01,  9.8140e-01, -1.3125e+00,
        -1.5801e-01, -5.4301e-01, -1.3888e-01, -2.6146e-01, -3.6910e-01,
         2.6844e-01, -2.4375e-01, -1.9484e-01,  6.2583e-01, -7.3770e-01,
         3.8351e-01, -7.5004e-01, -3.9053e-01,  9.1498e-02, -3.6591e-01,
        -1.4715e+00, -4.5228e-01,  2.2560e-01,  1.1412e+00, -3.8526e-01,
        -6.7160e-02,  5.7288e-01, -3.9191e-01,  3.1302e-01, -2.9235e-01,
        -9.6157e-01,  1.5154e-01,

In [20]:
sentence_1[0]

Token[0]: "apple"

In [21]:
sentence_1[0].embedding

tensor([-5.9850e-01, -4.6321e-01,  1.3001e-01, -1.9576e-02,  4.6030e-01,
        -3.0180e-01,  8.9770e-01, -6.5634e-01,  6.6858e-01, -4.9164e-01,
         3.7557e-02, -5.0889e-02,  6.4510e-01, -5.3882e-01, -3.7650e-01,
        -4.3120e-02,  5.1384e-01,  1.7783e-01,  2.8596e-01,  9.2063e-01,
        -4.9349e-01, -4.8583e-01,  6.1321e-01,  7.8211e-01,  1.9254e-01,
         9.1228e-01, -5.5596e-02, -1.2512e-01, -6.5688e-01,  6.8557e-02,
         5.5629e-01,  1.6110e+00, -7.3642e-03, -4.8879e-01,  4.5493e-01,
         9.6105e-01, -6.3369e-02,  1.7432e-01,  9.8140e-01, -1.3125e+00,
        -1.5801e-01, -5.4301e-01, -1.3888e-01, -2.6146e-01, -3.6910e-01,
         2.6844e-01, -2.4375e-01, -1.9484e-01,  6.2583e-01, -7.3770e-01,
         3.8351e-01, -7.5004e-01, -3.9053e-01,  9.1498e-02, -3.6591e-01,
        -1.4715e+00, -4.5228e-01,  2.2560e-01,  1.1412e+00, -3.8526e-01,
        -6.7160e-02,  5.7288e-01, -3.9191e-01,  3.1302e-01, -2.9235e-01,
        -9.6157e-01,  1.5154e-01, -2.1659e-01,  2.5

In [22]:
sentence_1[0].embedding.shape

torch.Size([868])

In [23]:
bert_embedding.embed(sentence_2)

for token in sentence_2:
    print(token)
    print(token.embedding)

Token[0]: "an"
tensor([-4.2140e-01, -1.8797e-01,  4.6241e-01, -1.7605e-01,  3.6212e-01,
         3.6701e-01,  2.7924e-01,  1.4634e-01, -5.4227e-02,  4.5834e-01,
         6.5416e-02, -3.3725e-01,  6.7505e-02, -3.6316e-01,  5.0302e-01,
        -1.0361e-02,  7.2826e-01, -1.7564e-01, -3.3996e-01,  7.2864e-02,
         6.4481e-01, -2.3908e-01,  3.8383e-01,  1.3858e-01,  1.0994e+00,
        -2.4883e-01, -1.5078e-01, -4.8738e-01, -2.3042e-01,  6.4788e-02,
        -7.0183e-01,  8.2654e-01,  6.1280e-02,  1.8531e-01, -3.0162e-01,
        -2.2151e-02,  3.4302e-01,  8.0331e-01,  1.7135e-01,  1.5462e-01,
        -5.0759e-01,  3.9572e-01,  5.4291e-02, -5.3081e-01,  4.8252e-01,
         8.6205e-02,  5.9585e-01, -2.2377e-01, -3.9550e-01, -7.3036e-01,
        -1.0279e-01, -3.9166e-01,  1.2290e+00,  1.2129e+00, -1.0365e+00,
        -3.4971e+00,  1.0923e-01, -1.0084e+00,  1.9998e+00,  7.9640e-01,
         3.8810e-01,  4.3746e-01,  8.5194e-02,  3.8549e-01,  6.1993e-01,
        -1.0320e+00,  7.0119e-01, -2

In [24]:
sentence_2[1]

Token[1]: "apple"

In [25]:
sentence_2[1].embedding

tensor([-5.9850e-01, -4.6321e-01,  1.3001e-01, -1.9576e-02,  4.6030e-01,
        -3.0180e-01,  8.9770e-01, -6.5634e-01,  6.6858e-01, -4.9164e-01,
         3.7557e-02, -5.0889e-02,  6.4510e-01, -5.3882e-01, -3.7650e-01,
        -4.3120e-02,  5.1384e-01,  1.7783e-01,  2.8596e-01,  9.2063e-01,
        -4.9349e-01, -4.8583e-01,  6.1321e-01,  7.8211e-01,  1.9254e-01,
         9.1228e-01, -5.5596e-02, -1.2512e-01, -6.5688e-01,  6.8557e-02,
         5.5629e-01,  1.6110e+00, -7.3642e-03, -4.8879e-01,  4.5493e-01,
         9.6105e-01, -6.3369e-02,  1.7432e-01,  9.8140e-01, -1.3125e+00,
        -1.5801e-01, -5.4301e-01, -1.3888e-01, -2.6146e-01, -3.6910e-01,
         2.6844e-01, -2.4375e-01, -1.9484e-01,  6.2583e-01, -7.3770e-01,
         3.8351e-01, -7.5004e-01, -3.9053e-01,  9.1498e-02, -3.6591e-01,
        -1.4715e+00, -4.5228e-01,  2.2560e-01,  1.1412e+00, -3.8526e-01,
        -6.7160e-02,  5.7288e-01, -3.9191e-01,  3.1302e-01, -2.9235e-01,
        -9.6157e-01,  1.5154e-01, -2.1659e-01,  2.5

In [26]:
sentence_2[1].embedding.shape

torch.Size([868])

In [27]:
bert_dst = distance.euclidean(np.array(sentence_1[0].embedding),
                               np.array(sentence_2[1].embedding))

In [28]:
print("Distance between apple embeddings for Glove = {}".format(bert_dst))

Distance between apple embeddings for Glove = 15.98473072052002
