In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
sent=['the glass of milk',
      'the glass of juice',
      'the cup of tea',
      'I am a good boy',
      'I am a good developer',
      'understand the meaning of words',
      'your videos are good']


In [3]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
#Define the vocabulary size
voc_size=10000

In [5]:
#One Hot representation
one_hot_repr=[one_hot(words,voc_size) for words in sent]
one_hot_repr

[[8433, 1111, 954, 6636],
 [8433, 1111, 954, 3153],
 [8433, 1437, 954, 6355],
 [1774, 7719, 786, 6420, 2570],
 [1774, 7719, 786, 6420, 5818],
 [7402, 8433, 5451, 954, 7236],
 [2329, 6625, 9435, 6420]]

In [6]:
#word embedding representation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [7]:
import numpy as np
#we need to make all sentences of equal size or we would not be able to train it in RNN.
#all the words going in RNN would be going for a fixed number of timestamp and based on sentence size
#so using pad_sequences we try to set a maximum sentence length

In [8]:
sent_length=8 #for shorter sentences it will add the remaining number of 0's before or after
embedded_docs=pad_sequences(one_hot_repr,padding='pre',maxlen=sent_length) #pre means that 0's will be added at the beginning of the sentence

In [9]:
print(embedded_docs)

[[   0    0    0    0 8433 1111  954 6636]
 [   0    0    0    0 8433 1111  954 3153]
 [   0    0    0    0 8433 1437  954 6355]
 [   0    0    0 1774 7719  786 6420 2570]
 [   0    0    0 1774 7719  786 6420 5818]
 [   0    0    0 7402 8433 5451  954 7236]
 [   0    0    0    0 2329 6625 9435 6420]]


In [10]:
#feature representation
dim=10 #dimensions


In [11]:
model=Sequential()
model.add(Embedding(voc_size,dim,input_length=sent_length))
model.compile('adam','mse')



In [12]:
model.summary()

In [14]:
model.predict(embedded_docs) #every word represented in 10 dimensions

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


array([[[ 0.0028423 ,  0.04546655,  0.02169183,  0.03107656,
         -0.04478921,  0.04552071,  0.00804485, -0.00147621,
         -0.04398173, -0.02940161],
        [ 0.0028423 ,  0.04546655,  0.02169183,  0.03107656,
         -0.04478921,  0.04552071,  0.00804485, -0.00147621,
         -0.04398173, -0.02940161],
        [ 0.0028423 ,  0.04546655,  0.02169183,  0.03107656,
         -0.04478921,  0.04552071,  0.00804485, -0.00147621,
         -0.04398173, -0.02940161],
        [ 0.0028423 ,  0.04546655,  0.02169183,  0.03107656,
         -0.04478921,  0.04552071,  0.00804485, -0.00147621,
         -0.04398173, -0.02940161],
        [-0.01424663,  0.04666208,  0.02873259, -0.03101235,
          0.00652445,  0.03379171,  0.01943352, -0.01731427,
          0.03090117,  0.00306433],
        [ 0.00139553,  0.0324671 , -0.02090414,  0.02538896,
         -0.00372642,  0.01466418,  0.03894671, -0.01240788,
         -0.01768075, -0.01916767],
        [-0.02943122, -0.01284153,  0.0111632 , -0.0

In [15]:
embedded_docs[0]

array([   0,    0,    0,    0, 8433, 1111,  954, 6636], dtype=int32)