In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming the world.",
    "ChatGPT is a powerful language model developed by OpenAI.",
    "Data science involves statistics, programming, and domain expertise.",
    "The weather today is sunny with a chance of rain in the evening.",
    "He bought a new laptop for his machine learning project.",
    "Natural language processing enables machines to understand human language.",
    "The stock market showed a significant drop yesterday.",
    "Python is a versatile programming language used in many fields.",
    "She traveled to Japan to experience the cherry blossom season."
]


In [3]:
voc_size = 10000  # Size of the vocabulary

In [4]:
## One-Hot Representation

In [5]:
## one hot representation
one_hot_repre = [ one_hot(words, voc_size) for words in sentences ]

In [6]:
one_hot_repre

[[3928, 1546, 7082, 6349, 6509, 3794, 3928, 2840, 1358],
 [2962, 6788, 2916, 8324, 3928, 4807],
 [2706, 2916, 3380, 5101, 9351, 783, 4880, 8271, 4697],
 [4414, 7999, 2931, 5017, 9644, 5583, 7240, 5010],
 [3928, 1024, 143, 2916, 1840, 4823, 3380, 7537, 9084, 8109, 6438, 3928, 434],
 [4358, 4723, 3380, 5305, 1927, 4954, 7451, 1716, 7589, 7256],
 [2172, 9351, 2165, 5950, 3003, 3969, 3153, 5536, 9351],
 [3928, 7031, 2764, 587, 3380, 4178, 1161, 8423],
 [6926, 2916, 3380, 9015, 9644, 9351, 4569, 6438, 3664, 7585],
 [4756, 7611, 3969, 499, 3969, 8363, 3928, 579, 9724, 1683]]

In [7]:
# Word embedding representation
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential


In [8]:
import numpy as np

In [9]:
## Padding sequences

In [11]:
sen_length = 13

In [12]:
embedded_docs = pad_sequences(one_hot_repre, padding = "pre", maxlen=sen_length)   # pre padding means padding at the beginning of the sequence
embedded_docs

array([[   0,    0,    0,    0, 3928, 1546, 7082, 6349, 6509, 3794, 3928,
        2840, 1358],
       [   0,    0,    0,    0,    0,    0,    0, 2962, 6788, 2916, 8324,
        3928, 4807],
       [   0,    0,    0,    0, 2706, 2916, 3380, 5101, 9351,  783, 4880,
        8271, 4697],
       [   0,    0,    0,    0,    0, 4414, 7999, 2931, 5017, 9644, 5583,
        7240, 5010],
       [3928, 1024,  143, 2916, 1840, 4823, 3380, 7537, 9084, 8109, 6438,
        3928,  434],
       [   0,    0,    0, 4358, 4723, 3380, 5305, 1927, 4954, 7451, 1716,
        7589, 7256],
       [   0,    0,    0,    0, 2172, 9351, 2165, 5950, 3003, 3969, 3153,
        5536, 9351],
       [   0,    0,    0,    0,    0, 3928, 7031, 2764,  587, 3380, 4178,
        1161, 8423],
       [   0,    0,    0, 6926, 2916, 3380, 9015, 9644, 9351, 4569, 6438,
        3664, 7585],
       [   0,    0,    0, 4756, 7611, 3969,  499, 3969, 8363, 3928,  579,
        9724, 1683]])

In [13]:
## model creation

In [14]:
## feature representation
dim = 14 # Dimension of the embedding vector

In [15]:
model = Sequential()
# not working until forcefully create the model OR train the model then the model.summary won't be a blank---> # model.add(Embedding(voc_size, dim, input_length=sen_length))  # voc_size is the size of the vocabulary, dim is the dimension of the embedding vector, and sen_length is the length of the input sequences
model.add(Embedding(input_dim=voc_size, output_dim=dim))
model.build(input_shape=(None, sen_length)) # Build the model with the specified input shape
model.compile("adam","mse")  # Compile the model with Adam optimizer and mean squared error loss

In [16]:
model.summary()

In [17]:
# Checking the vector representation 
model.predict(embedded_docs)  # Predicting the embedding for the first sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step


array([[[ 0.01582978,  0.01555424,  0.00917007, ...,  0.02813414,
         -0.01793425,  0.02658433],
        [ 0.01582978,  0.01555424,  0.00917007, ...,  0.02813414,
         -0.01793425,  0.02658433],
        [ 0.01582978,  0.01555424,  0.00917007, ...,  0.02813414,
         -0.01793425,  0.02658433],
        ...,
        [ 0.02223165,  0.0089724 ,  0.04373772, ...,  0.00629544,
         -0.0252335 , -0.01361487],
        [-0.04504142,  0.04022575, -0.03672761, ...,  0.02331519,
          0.00494073, -0.04606806],
        [ 0.03403722,  0.0167384 , -0.02829028, ..., -0.00149249,
         -0.02692946,  0.02934856]],

       [[ 0.01582978,  0.01555424,  0.00917007, ...,  0.02813414,
         -0.01793425,  0.02658433],
        [ 0.01582978,  0.01555424,  0.00917007, ...,  0.02813414,
         -0.01793425,  0.02658433],
        [ 0.01582978,  0.01555424,  0.00917007, ...,  0.02813414,
         -0.01793425,  0.02658433],
        ...,
        [ 0.04573316, -0.00276979, -0.02760543, ..., -

In [18]:
# vector representation of the first sentence
model.predict(embedded_docs[0].reshape(1, -1))[0][0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step


array([ 0.01582978,  0.01555424,  0.00917007,  0.03752598, -0.01943018,
        0.03193159,  0.01101247, -0.04830978, -0.04918664,  0.00065144,
       -0.02305499,  0.02813414, -0.01793425,  0.02658433], dtype=float32)

In [17]:
embedded_docs[0].reshape(1, -1)

array([[   0,    0,    0,    0, 7087, 3752, 7397, 4716, 4573, 3262, 7087,
        8214,  393]])