# Word Embedding

- Word2vec is a technique in natural language processing for obtaining vector representations of words.

- These vectors capture information about the meaning of the word based on the surrounding words.

- The word2vec algorithm estimates these representations by modeling text in a large corpus.

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# First toy corpus: three sentences describing word2vec, written one literal per
# sentence (adjacent literals are concatenated into a single string).
sample_txt = (
    "Word2vec is a technique in natural language processing for obtaining vector representations of words. "
    "These vectors capture information about the meaning of the word based on the surrounding words. "
    "The word2vec algorithm estimates these representations by modeling text in a large corpus."
)

# Tokenize into sentences and words

In [None]:
# Download the 'punkt_tab' tokenizer package into the local nltk_data directory.
# NOTE(review): presumably required by the sent_tokenize / word_tokenize calls
# that follow — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Split the sample paragraph into a list of sentence strings.
sentences = sent_tokenize(sample_txt)
# Bare name as the last expression so the notebook displays the list.
sentences

['Word2vec is a technique in natural language processing for obtaining vector representations of words.',
 'These vectors capture information about the meaning of the word based on the surrounding words.',
 'The word2vec algorithm estimates these representations by modeling text in a large corpus.']

In [None]:
# Number of sentences produced by sent_tokenize for sample_txt.
len(sentences)

3

In [None]:
# Lower-case each sentence, then split it into tokens; the result is one token
# list per sentence. Punctuation such as '.' is kept as its own token.
words = [word_tokenize(sentence.lower()) for sentence in sentences]
words

[['word2vec',
  'is',
  'a',
  'technique',
  'in',
  'natural',
  'language',
  'processing',
  'for',
  'obtaining',
  'vector',
  'representations',
  'of',
  'words',
  '.'],
 ['these',
  'vectors',
  'capture',
  'information',
  'about',
  'the',
  'meaning',
  'of',
  'the',
  'word',
  'based',
  'on',
  'the',
  'surrounding',
  'words',
  '.'],
 ['the',
  'word2vec',
  'algorithm',
  'estimates',
  'these',
  'representations',
  'by',
  'modeling',
  'text',
  'in',
  'a',
  'large',
  'corpus',
  '.']]

In [None]:
# Report the token count (punctuation included) of each of the three sentences.
for position, ordinal in enumerate(("first", "second", "third")):
    print(f"words in {ordinal} sentence: ", len(words[position]))

words in first sentence:  15
words in second sentence:  16
words in third sentence:  14


# Training the Word2Vec model

In [None]:
# Train a Word2Vec model on the tokenised sentences.
#   vector_size=100 -> dimensionality of each word vector
#   window=5        -> context window size
#   min_count=1     -> keep every token (the corpus is tiny)
#   sg=0            -> CBOW training (sg=1 would select skip-gram)
#   seed=1          -> gensim's default seed, stated explicitly
#   workers=1       -> single training thread, so thread scheduling cannot
#                      change the result between runs
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches — confirm whether that is needed here.
model = Word2Vec(
    words,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=1,
    workers=1,
)

In [None]:
# Display the model object; the repr only shows its type and memory address.
model

<gensim.models.word2vec.Word2Vec at 0x7b7ad1c76a50>

The model has been trained successfully.

# Access word vectors

In [None]:
# model.wv is the KeyedVectors object that maps each vocabulary word to its vector.
word_vectors = model.wv
word_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x7b7ad1c769c0>

In [None]:
# Look up the learned vector for 'word2vec' (the model was built with
# vector_size = 100). Keys are lower-case because the sentences were
# lower-cased before tokenisation.
word_vectors['word2vec']

array([-8.2428372e-03,  9.2984429e-03, -1.9998940e-04, -1.9658776e-03,
        4.6030493e-03, -4.0925085e-03,  2.7440235e-03,  6.9394079e-03,
        6.0658003e-03, -7.5076558e-03,  9.3803490e-03,  4.6709334e-03,
        3.9668530e-03, -6.2403916e-03,  8.4562693e-03, -2.1501984e-03,
        8.8250125e-03, -5.3604059e-03, -8.1277974e-03,  6.8243071e-03,
        1.6724035e-03, -2.1985143e-03,  9.5138298e-03,  9.4973259e-03,
       -9.7736456e-03,  2.5039108e-03,  6.1575030e-03,  3.8735415e-03,
        2.0208112e-03,  4.2975679e-04,  6.7557947e-04, -3.8203050e-03,
       -7.1384548e-03, -2.0905063e-03,  3.9231577e-03,  8.8186925e-03,
        9.2601432e-03, -5.9756017e-03, -9.4035622e-03,  9.7652040e-03,
        3.4301660e-03,  5.1662601e-03,  6.2797749e-03, -2.8037310e-03,
        7.3243217e-03,  2.8301962e-03,  2.8689492e-03, -2.3821539e-03,
       -3.1270056e-03, -2.3705910e-03,  4.2752884e-03,  7.6377728e-05,
       -9.5830159e-03, -9.6661020e-03, -6.1476883e-03, -1.2968633e-04,
      

In [None]:
# Look up the learned vector for 'natural' from the same model.
word_vectors['natural']

array([-4.2922487e-03, -9.3237376e-03, -1.8702546e-03, -3.7103512e-03,
        9.1020716e-03,  2.9348901e-03, -5.9832372e-03, -3.3542465e-03,
       -9.9078445e-03,  1.9436935e-03, -3.9885859e-03, -2.7072844e-03,
        5.1666521e-03,  7.5955731e-03,  4.3889638e-03, -6.8260129e-03,
        6.6934898e-03, -9.6287131e-03, -6.9871545e-03, -8.0079250e-03,
       -2.2674531e-03,  4.7694563e-04, -3.7715556e-03,  1.8417817e-03,
        9.2039034e-03,  6.6820821e-03, -1.0570971e-03,  9.0957126e-03,
        1.5556883e-03, -1.9757692e-03, -6.5976209e-03,  6.4712954e-03,
       -9.0731690e-03, -1.8934859e-03,  7.6980670e-03, -2.5881424e-03,
        1.6043333e-03, -7.3458739e-03,  4.9790028e-03,  7.3755234e-03,
       -9.3046091e-03, -3.2862977e-03,  7.8895260e-03,  2.9423626e-03,
        1.4131000e-05,  3.4133767e-04, -9.8615531e-03,  9.9383350e-03,
       -2.0010779e-03, -4.8948247e-03,  2.7553439e-03,  1.0292698e-03,
       -5.9254053e-03,  5.7957410e-03, -2.5074147e-03, -6.1775139e-03,
      

# Finding similar words

In [None]:
# Five vocabulary words whose vectors are most similar to 'natural',
# returned as (word, similarity score) pairs, highest score first.
similar_words = word_vectors.most_similar('natural', topn = 5)
similar_words

[('in', 0.17128004133701324),
 ('word', 0.13664701581001282),
 ('based', 0.13280262053012848),
 ('corpus', 0.10279709100723267),
 ('capture', 0.09614437818527222)]

# Another example:

In [None]:
# Second toy corpus: five short sentences about a garden, joined with single spaces.
sample_txt2 = " ".join(
    (
        "I own a big garden.",
        "There are many different kinds of trees there.",
        "The garden has several kinds of fruits and vegetables in the garden.",
        "They were fresh and organic.",
        "Numerous people work there to help me make this garden beautiful and profitable.",
    )
)

In [None]:
# Use a dedicated name for the second example so the first example's
# `sentences` is not overwritten; re-running earlier cells then still
# operates on the text they were written for.
sentences2 = sent_tokenize(sample_txt2)

# Lower-case each sentence, then split it into tokens (punctuation included).
tokens = [word_tokenize(sentence.lower()) for sentence in sentences2]
tokens

[['i', 'own', 'a', 'big', 'garden', '.'],
 ['there', 'are', 'many', 'different', 'kinds', 'of', 'trees', 'there', '.'],
 ['the',
  'garden',
  'has',
  'several',
  'kinds',
  'of',
  'fruits',
  'and',
  'vegetables',
  'in',
  'the',
  'garden',
  '.'],
 ['they', 'were', 'fresh', 'and', 'organic', '.'],
 ['numerous',
  'people',
  'work',
  'there',
  'to',
  'help',
  'me',
  'make',
  'this',
  'garden',
  'beautiful',
  'and',
  'profitable',
  '.']]

In [None]:
# Train a second Word2Vec model on the garden corpus.
#   vector_size=50 -> dimensionality of each word vector
#   window=5       -> context window size
#   min_count=1    -> keep every token (the corpus is tiny)
#   sg=0           -> CBOW training (sg=1 would select skip-gram)
#   seed=1         -> gensim's default seed, stated explicitly
#   workers=1      -> single training thread, so thread scheduling cannot
#                     change the result between runs
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches — confirm whether that is needed here.
model2 = Word2Vec(
    tokens,
    vector_size=50,
    window=5,
    min_count=1,
    sg=0,
    seed=1,
    workers=1,
)
model2

<gensim.models.word2vec.Word2Vec at 0x7b7ad189e840>

In [None]:
# Take the KeyedVectors from the second model and look up the vector for
# 'fruits' (model2 was built with vector_size = 50).
word_vectors2 = model2.wv
word_vectors2['fruits']

array([-0.01031249, -0.01333668, -0.01555367,  0.01662146, -0.00396468,
       -0.01370993, -0.00830878,  0.01028825, -0.00573827, -0.00749932,
        0.00324285, -0.00555259, -0.00316872,  0.00214898, -0.00595589,
        0.01703857,  0.00782187, -0.01991773,  0.0125192 , -0.01350851,
        0.00153887,  0.00880847, -0.01020675, -0.00422134,  0.01619096,
       -0.00848759, -0.01527252,  0.01851583, -0.00431099, -0.00943886,
        0.0171416 ,  0.00856668,  0.00864967,  0.01856903, -0.01690616,
        0.01051064,  0.0040787 ,  0.00837656,  0.0033958 ,  0.00892827,
        0.00897258,  0.01220904, -0.00640419, -0.00915146, -0.00085303,
        0.00506747, -0.00652633,  0.01211544,  0.00830826,  0.01552918],
      dtype=float32)

In [None]:
# Look up the learned vector for 'trees' from the second model.
word_vectors2['trees']

array([-3.8936534e-03, -1.0540222e-02,  1.8900611e-02, -1.8591881e-02,
        9.0011330e-03,  1.0807554e-02, -2.8076845e-03,  1.8007707e-02,
        1.9757302e-02, -1.0961521e-02, -1.2041060e-02, -1.3502317e-02,
       -1.5782792e-02, -6.1042244e-03, -1.1185234e-02, -1.6690802e-02,
        1.5602552e-03,  5.9878849e-03,  1.2817775e-02, -5.2702357e-03,
       -8.9121610e-03,  2.5099290e-03,  7.9417182e-04,  1.6235650e-02,
        3.6568262e-04,  1.4463658e-02, -1.6534857e-02,  1.6872678e-02,
       -3.7829380e-03,  1.7410109e-02, -1.5228156e-02,  3.5810855e-03,
        2.1101360e-03,  8.8391454e-05, -1.0207213e-02, -1.8502278e-02,
       -1.4527084e-02, -1.5901458e-02,  3.8224447e-03,  9.5161429e-04,
       -3.6229535e-03,  1.4242809e-02, -4.9607265e-03, -2.6958315e-03,
       -1.7797375e-02, -1.9858673e-02,  1.7902877e-02, -1.1511053e-02,
       -1.2747886e-02,  1.0401206e-02], dtype=float32)

In [None]:
# Five vocabulary words most similar to 'fruits' in the second model,
# returned as (word, similarity score) pairs.
similar_word2 = word_vectors2.most_similar('fruits', topn = 5)
similar_word2

[('.', 0.27068236470222473),
 ('make', 0.2674153745174408),
 ('they', 0.20419305562973022),
 ('and', 0.19622384011745453),
 ('different', 0.19218479096889496)]

In [None]:
# Five vocabulary words most similar to 'numerous', still using the second model:
# model2.wv is the same attribute that was bound to word_vectors2 earlier.
similar_word3 = model2.wv.most_similar('numerous', topn = 5)
similar_word3

[('are', 0.2576709985733032),
 ('beautiful', 0.23732392489910126),
 ('of', 0.16954173147678375),
 ('help', 0.16496333479881287),
 ('and', 0.12171069532632828)]

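NOTE: Words such as 'many' and 'several' did not appear among the most similar words for 'numerous'. The most likely reason is the size of the corpus: with only five short sentences, each word occurs once or twice, so training barely moves the vectors away from their small random initial values. The similarity scores above are therefore close to noise rather than a reflection of meaning. Word2vec needs a large corpus (or pretrained vectors) before semantically related words end up close together.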