Word embedding is a technique that converts words into dense numerical vectors in a continuous vector space, where similar words are mapped to nearby points. These vectors capture semantic relationships between words based on their context in large text corpora.

Key properties:

- Words with similar meanings have similar vector representations

- Vector arithmetic works meaningfully (e.g., "king" - "man" + "woman" ≈ "queen")

- Typical embedding dimensions range from 50 to 300

- Common algorithms include Word2Vec, GloVe, and FastText

Word embeddings are fundamental to modern NLP, enabling machines to process text by working with these numerical representations rather than raw strings.

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
# Toy corpus: seven short sentences used throughout the notebook.
sents = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [4]:
# Vocabulary size: upper bound for the integer word indices and the number
# of rows in the embedding table created further down.
voc_sz = 10_000

In [5]:
# Encode every sentence as a list of integer word indices (one per word).
# Despite its name, one_hot() yields indices rather than one-hot vectors; in
# the output below a repeated word ('the', 'of', 'good', ...) always maps to
# the same index and every index is below voc_sz.
ohe_rep = list(map(lambda sentence: one_hot(sentence, voc_sz), sents))
ohe_rep

[[483, 8414, 6099, 8585],
 [483, 8414, 6099, 8337],
 [483, 6760, 6099, 9761],
 [4828, 5175, 9733, 6791, 9070],
 [4828, 5175, 9733, 6791, 9749],
 [7556, 483, 4749, 6099, 9350],
 [2821, 683, 7762, 6791]]

In [7]:
#Word Embedding Representation
from tensorflow.keras.layers import Embedding
# (legacy import path: tensorflow.keras.preprocessing.sequence.pad_sequences)
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [9]:
# Bring every index list to a fixed length of sen_len by zero-padding.
# padding='pre'  -> zeros are added at the beginning (as in the output below)
# padding='post' -> zeros would be added at the end
sen_len = 10
embedded_sent = pad_sequences(
    ohe_rep,
    maxlen=sen_len,
    padding='pre',
)
embedded_sent

array([[   0,    0,    0,    0,    0,    0,  483, 8414, 6099, 8585],
       [   0,    0,    0,    0,    0,    0,  483, 8414, 6099, 8337],
       [   0,    0,    0,    0,    0,    0,  483, 6760, 6099, 9761],
       [   0,    0,    0,    0,    0, 4828, 5175, 9733, 6791, 9070],
       [   0,    0,    0,    0,    0, 4828, 5175, 9733, 6791, 9749],
       [   0,    0,    0,    0,    0, 7556,  483, 4749, 6099, 9350],
       [   0,    0,    0,    0,    0,    0, 2821,  683, 7762, 6791]])

In [10]:
# Feature representation: length of the dense vector the Embedding layer
# produces for each word index (passed as its second argument below).
dim = 10

In [13]:
# One-layer model: an embedding table with voc_sz rows and dim columns that
# turns each integer word index into a dense vector.
model = Sequential()
# `input_length` is deprecated in Keras 3 (it is ignored with a warning), so
# the layer is created without it and the sequence length is supplied through
# an explicit build() call. This also lets model.summary() report real output
# shapes and parameter counts before the first predict().
model.add(Embedding(input_dim=voc_sz, output_dim=dim))
model.build(input_shape=(None, sen_len))
# No fit() follows in this notebook; compile() just finalises the configuration.
model.compile(optimizer='adam', loss='mse')



In [14]:
# Print Keras' textual summary of the model's layers.
model.summary()

In [15]:
# Look up the embedding vector for every position of every padded sentence.
# No fit() call appears above, so the values come from the layer's initial
# weights; the identical leading rows in the output belong to padding index 0.
model.predict(embedded_sent)

1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 763ms/step


array([[[-0.00728278,  0.04882381, -0.03211794,  0.0439091 ,
          0.03470316, -0.00919534, -0.0325755 ,  0.01293823,
          0.04332728,  0.01891196],
        [-0.00728278,  0.04882381, -0.03211794,  0.0439091 ,
          0.03470316, -0.00919534, -0.0325755 ,  0.01293823,
          0.04332728,  0.01891196],
        [-0.00728278,  0.04882381, -0.03211794,  0.0439091 ,
          0.03470316, -0.00919534, -0.0325755 ,  0.01293823,
          0.04332728,  0.01891196],
        [-0.00728278,  0.04882381, -0.03211794,  0.0439091 ,
          0.03470316, -0.00919534, -0.0325755 ,  0.01293823,
          0.04332728,  0.01891196],
        [-0.00728278,  0.04882381, -0.03211794,  0.0439091 ,
          0.03470316, -0.00919534, -0.0325755 ,  0.01293823,
          0.04332728,  0.01891196],
        [-0.00728278,  0.04882381, -0.03211794,  0.0439091 ,
          0.03470316, -0.00919534, -0.0325755 ,  0.01293823,
          0.04332728,  0.01891196],
        [ 0.01987777, -0.01599363, -0.04178799, -0.0

In [16]:
# Padded index sequence of the first sentence ('the glass of milk').
embedded_sent[0]

array([   0,    0,    0,    0,    0,    0,  483, 8414, 6099, 8585])