In [32]:
from tensorflow.keras.preprocessing.text import one_hot

In [33]:
# Toy corpus: ten short English sentences used to demonstrate word-index
# encoding, padding and embedding lookup in the cells that follow.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming the world.",
    "ChatGPT is a powerful language model developed by OpenAI.",
    "Data science involves statistics, programming, and domain expertise.",
    "The weather today is sunny with a chance of rain in the evening.",
    "He bought a new laptop for his machine learning project.",
    "Natural language processing enables machines to understand human language.",
    "The stock market showed a significant drop yesterday.",
    "Python is a versatile programming language used in many fields.",
    "She traveled to Japan to experience the cherry blossom season."
]


In [70]:
# Size of the vocabulary: passed to one_hot as the index range and to the
# Embedding layer as its number of rows (input_dim).
voc_size = 10_000

In [71]:
## One-Hot Representation

In [72]:
## One-hot representation
# Encode every sentence as a list of integer word indices (one index per word).
# Despite its name, one_hot does not produce one-hot vectors: the same word
# always gets the same index within a session (e.g. "the" in the output below).
# NOTE(review): one_hot appears to hash words with Python's built-in hash(),
# which is salted per process, so the indices are expected to change between
# kernel sessions and distinct words may collide -- confirm against the Keras docs.
one_hot_repre = [one_hot(sentence, voc_size) for sentence in sentences]

In [73]:
# Display the encoded corpus: one list of integer word indices per sentence.
one_hot_repre

[[211, 478, 953, 2729, 8382, 9369, 211, 6140, 7289],
 [3466, 6168, 5239, 6915, 211, 3016],
 [5589, 5239, 2300, 3288, 2945, 5792, 4006, 7036, 5637],
 [4034, 834, 2745, 2321, 8262, 9339, 2163, 7598],
 [211, 1202, 8400, 5239, 6369, 2488, 2300, 6522, 56, 5444, 51, 211, 3975],
 [5916, 5172, 2300, 53, 6895, 2386, 2087, 9539, 3537, 8921],
 [27, 2945, 9059, 4263, 592, 2705, 9349, 922, 2945],
 [211, 1084, 5453, 3748, 2300, 1596, 2392, 4804],
 [332, 5239, 2300, 7018, 8262, 2945, 9287, 51, 8571, 4964],
 [8961, 7143, 2705, 7670, 2705, 3223, 211, 4134, 5712, 3232]]

In [74]:
# Word embedding representation
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential


In [75]:
import numpy as np

In [76]:
## Padding sequences

In [77]:
# Pad every encoded sentence to a common length so they stack into one 2-D array.
# The length is derived from the data instead of being hard-coded: pad_sequences
# truncates sequences longer than `maxlen`, so a fixed value would silently drop
# words if a longer sentence were ever added to the corpus.
sen_length = max(len(encoded) for encoded in one_hot_repre)  # 13 for the current corpus
# "pre" padding inserts the zeros at the beginning of each sequence.
embedded_docs = pad_sequences(one_hot_repre, padding="pre", maxlen=sen_length)
embedded_docs

array([[   0,    0,    0,    0,  211,  478,  953, 2729, 8382, 9369,  211,
        6140, 7289],
       [   0,    0,    0,    0,    0,    0,    0, 3466, 6168, 5239, 6915,
         211, 3016],
       [   0,    0,    0,    0, 5589, 5239, 2300, 3288, 2945, 5792, 4006,
        7036, 5637],
       [   0,    0,    0,    0,    0, 4034,  834, 2745, 2321, 8262, 9339,
        2163, 7598],
       [ 211, 1202, 8400, 5239, 6369, 2488, 2300, 6522,   56, 5444,   51,
         211, 3975],
       [   0,    0,    0, 5916, 5172, 2300,   53, 6895, 2386, 2087, 9539,
        3537, 8921],
       [   0,    0,    0,    0,   27, 2945, 9059, 4263,  592, 2705, 9349,
         922, 2945],
       [   0,    0,    0,    0,    0,  211, 1084, 5453, 3748, 2300, 1596,
        2392, 4804],
       [   0,    0,    0,  332, 5239, 2300, 7018, 8262, 2945, 9287,   51,
        8571, 4964],
       [   0,    0,    0, 8961, 7143, 2705, 7670, 2705, 3223,  211, 4134,
        5712, 3232]])

In [78]:
## model creation

In [79]:
## Feature representation
# Number of components in the dense vector stored for each vocabulary index;
# used as output_dim of the Embedding layer.
dim = 14 # Dimension of the embedding vector

In [80]:
# Fix the Python, NumPy and backend random seeds so the randomly initialised
# embedding weights are the same on every run of this cell.
keras.utils.set_random_seed(42)

# A single Embedding layer: a lookup table with voc_size rows (one per word
# index) and dim columns (the embedding vector for that index).
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=dim))
# The layer is given no input shape, so the model is built explicitly here;
# otherwise model.summary() stays blank until the model is built or trained.
# (The earlier form Embedding(voc_size, dim, input_length=sen_length) did not
# avoid this.)
model.build(input_shape=(None, sen_length))  # batches of sequences of sen_length indices
# Compile with the Adam optimizer and mean squared error loss.
model.compile(optimizer="adam", loss="mse")

In [81]:
# Print the layer table of the built model (a single Embedding layer whose
# weight matrix has voc_size * dim entries).
model.summary()

In [82]:
# Checking the vector representation
# Embed all padded sentences in one batch: every word index is replaced by its
# dim-sized vector, giving one vector per token position per sentence.
model.predict(embedded_docs)  # embeddings for every sentence, not only the first

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step


array([[[ 0.01264298,  0.00163507,  0.02676156, ..., -0.00608696,
          0.04610664,  0.03211799],
        [ 0.01264298,  0.00163507,  0.02676156, ..., -0.00608696,
          0.04610664,  0.03211799],
        [ 0.01264298,  0.00163507,  0.02676156, ..., -0.00608696,
          0.04610664,  0.03211799],
        ...,
        [-0.00799193, -0.03857005, -0.04214312, ...,  0.01145232,
         -0.00798444,  0.02458726],
        [ 0.03384346,  0.04587296, -0.00220511, ..., -0.0243162 ,
         -0.02767168,  0.02989812],
        [-0.01571541, -0.0208672 ,  0.0035185 , ...,  0.03022161,
         -0.0012636 ,  0.04152019]],

       [[ 0.01264298,  0.00163507,  0.02676156, ..., -0.00608696,
          0.04610664,  0.03211799],
        [ 0.01264298,  0.00163507,  0.02676156, ..., -0.00608696,
          0.04610664,  0.03211799],
        [ 0.01264298,  0.00163507,  0.02676156, ..., -0.00608696,
          0.04610664,  0.03211799],
        ...,
        [ 0.00218096,  0.00834448, -0.00905873, ..., -

In [90]:
# Embedding vector of the FIRST TOKEN POSITION of the first sentence, not of the
# whole sentence: the first [0] selects the only sentence in the batch, the
# second [0] selects its first position. With "pre" padding that position holds
# index 0, so the vector shown is the one for the padding index.
# Drop the trailing [0] to get all sen_length vectors of the sentence.
model.predict(embedded_docs[0].reshape(1, -1))[0][0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


array([ 0.01264298,  0.00163507,  0.02676156, -0.03849747,  0.02981741,
       -0.04256977, -0.01930399, -0.00249118,  0.03019929,  0.03314808,
        0.03297972, -0.00608696,  0.04610664,  0.03211799], dtype=float32)

In [86]:
# First padded sentence reshaped from a 1-D row into a 2-D array with a single
# row, i.e. a batch containing one sequence of word indices.
embedded_docs[0].reshape(1, -1)

array([[   0,    0,    0,    0,  211,  478,  953, 2729, 8382, 9369,  211,
        6140, 7289]])