In [None]:
!pip install tensorflow



In [None]:
# Import TensorFlow and print its version so the environment used for this run is recorded.
import tensorflow as tf
print(tf.__version__)

2.19.0


In [None]:
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Toy corpus: seven short sentences that are encoded and embedded below.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [None]:
# The vocabulary size has to be fixed before the sentences are converted to
# integer indices: it is passed to one_hot() and to the Embedding layer below.
vocab_size = 10_000

In [None]:
# Integer ("one hot" index) representation: every sentence becomes a list with
# one integer per word, each integer being that word's index in a vocabulary of
# size vocab_size.
# NOTE(review): one_hot() presumably derives the index by hashing the word, so
# the exact numbers may differ between Python sessions -- confirm before
# relying on the specific values shown in the output.
onehot_repr = [one_hot(sentence, vocab_size) for sentence in sent]
onehot_repr

[[5073, 8192, 2930, 6172],
 [5073, 8192, 2930, 5018],
 [5073, 9521, 2930, 9838],
 [1481, 9680, 276, 1062, 2051],
 [1481, 9680, 276, 1062, 6012],
 [8304, 5073, 9681, 2930, 6298],
 [8981, 5151, 4170, 1062]]

In [None]:
#Embedding Layer Representation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [None]:
# The encoded sentences have different lengths, but the model needs equally
# sized rows. Padding fixes that; it can be added in front ('pre') or at the
# end ('post'). Here every sequence is left-padded with zeros to sent_length.
sent_length = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
embedded_docs

array([[   0,    0,    0,    0, 5073, 8192, 2930, 6172],
       [   0,    0,    0,    0, 5073, 8192, 2930, 5018],
       [   0,    0,    0,    0, 5073, 9521, 2930, 9838],
       [   0,    0,    0, 1481, 9680,  276, 1062, 2051],
       [   0,    0,    0, 1481, 9680,  276, 1062, 6012],
       [   0,    0,    0, 8304, 5073, 9681, 2930, 6298],
       [   0,    0,    0,    0, 8981, 5151, 4170, 1062]], dtype=int32)

In [None]:
# So far each word is only an integer index into the vocabulary.
# Each word still has to be converted into a feature representation (a dense
# vector); `dim` is the number of dimensions chosen for that vector.
dim = 10

In [None]:
# Minimal model made of a single Embedding layer, which looks up a trainable
# vector for every integer word index it receives.
model = Sequential()
# Use `dim` (defined above) for the embedding width instead of repeating the
# literal 10, so the dimensionality is controlled from a single place.
model.add(Embedding(vocab_size, dim))
# compile() is given an optimizer and a loss, but no training happens in the
# cells shown here; the model is only used for a forward pass.
model.compile('adam', 'mse')
model.summary()

In [None]:
# Let's look at the first sentence as its padded sequence of word indices,
# before it is passed through the embedding layer.
embedded_docs[0]

array([   0,    0,    0,    0, 5073, 8192, 2930, 6172], dtype=int32)

In [None]:
# predict() expects the first axis to be the batch axis. Passing the 1-D row
# embedded_docs[0] would make Keras treat the 8 tokens as 8 separate samples.
# Slicing with [:1] keeps a batch that contains only the first sentence, and
# the trailing [0] removes the batch axis again, leaving one embedding vector
# per token of that sentence.
model.predict(embedded_docs[:1])[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 835ms/step


array([[-0.04717533,  0.02854638,  0.0309144 , -0.01540024, -0.02333039,
        -0.01229785,  0.00540304, -0.04702029, -0.02531394,  0.01190367],
       [-0.04717533,  0.02854638,  0.0309144 , -0.01540024, -0.02333039,
        -0.01229785,  0.00540304, -0.04702029, -0.02531394,  0.01190367],
       [-0.04717533,  0.02854638,  0.0309144 , -0.01540024, -0.02333039,
        -0.01229785,  0.00540304, -0.04702029, -0.02531394,  0.01190367],
       [-0.04717533,  0.02854638,  0.0309144 , -0.01540024, -0.02333039,
        -0.01229785,  0.00540304, -0.04702029, -0.02531394,  0.01190367],
       [ 0.0412447 ,  0.01240664, -0.00849717,  0.00483059,  0.04107204,
        -0.04207269, -0.01366787,  0.03880553, -0.03919999, -0.02411729],
       [ 0.00324317,  0.00621642, -0.03041345,  0.0028775 ,  0.04864969,
        -0.03176065,  0.04498279,  0.01525717,  0.0376134 ,  0.01035155],
       [ 0.02053889,  0.00499719,  0.01737097,  0.00024152, -0.04647787,
        -0.00108894, -0.01013676,  0.03032419

In [None]:
# Here, each word is now represented by a 10-dimensional vector.
# The Embedding layer converts every integer word index (the "one hot" index representation built earlier) into a vector of
# 10 randomly initialised numbers; when the model is trained, backpropagation updates these weights accordingly.
# At this stage it has only transformed the indices into vectors that contain their initial values.
# But it won't train by itself.
# Based on our requirements, we now need to add more layers and train the model accordingly.