# How to represent words as vectors

Import the required libraries

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

Encode each word as an integer index

In [2]:
# Toy corpus of ten short restaurant reviews.
reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
]

# Number of index slots available to the encoder (also the embedding table size).
vocab_size = 20

# Turn every review into a list of integer word indices.
# NOTE(review): the indices are not guaranteed to be unique per word -- in the
# recorded output below, "food" and "good" both map to 5.
lc = [one_hot(review, vocab_size) for review in reviews]
print(lc)

[[15, 5], [3, 2], [2, 5], [11, 1, 9], [6, 1, 1], [19, 5], [9, 1, 10], [13, 2], [13, 6], [1, 11]]


In [3]:
# Pad every encoded review with trailing zeros so that all of them have length 4.
max_length = 4
padded_reviews = pad_sequences(sequences=lc, padding='post', maxlen=max_length)
print(padded_reviews)

[[15  5  0  0]
 [ 3  2  0  0]
 [ 2  5  0  0]
 [11  1  9  0]
 [ 6  1  1  0]
 [19  5  0  0]
 [ 9  1 10  0]
 [13  2  0  0]
 [13  6  0  0]
 [ 1 11  0  0]]


Supervised learning of word embeddings

In [4]:
# Build a small supervised model whose first layer learns the word embeddings.
model = Sequential([
    # One trainable 5-dimensional vector for each of the vocab_size indices.
    Embedding(vocab_size, 5, input_length=max_length, name="embedding"),
    # Join the max_length word vectors of a review into a single flat vector.
    Flatten(),
    # Single sigmoid unit producing the binary prediction.
    Dense(1, activation='sigmoid'),
])

In [5]:
# Labels in the same order as the reviews: 1 for the first five, 0 for the last five.
sentiment = np.array([1] * 5 + [0] * 5)

# Features are the padded index sequences; targets are the sentiment labels.
x, y = padded_reviews, sentiment

In [6]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 5)              100       
                                                                 
 flatten (Flatten)           (None, 20)                0         
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 121 (484.00 Byte)
Trainable params: 121 (484.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [7]:
# Fit the model on (x, y) for 50 epochs with progress logging switched off.
model.fit(x=x, y=y, epochs=50, verbose=0)

<keras.src.callbacks.History at 0x79de80301e40>

In [8]:
# Evaluate the model on the same data it was trained on.
# The displayed list is [loss, accuracy], matching the compile() settings.
model.evaluate(x=x, y=y)



[0.6409926414489746, 0.800000011920929]

In [9]:
# Pull the learned embedding matrix out of the layer named "embedding";
# row i is the vector learned for word index i.
weights = model.get_layer(name='embedding').get_weights()[0]
weights

array([[ 0.09080291,  0.04077743,  0.07048608, -0.09536643, -0.06410235],
       [ 0.04564615,  0.08861034,  0.09281723,  0.02030244,  0.0847896 ],
       [-0.03653952, -0.03867165, -0.06675962, -0.0283529 , -0.05561654],
       [-0.06883483, -0.0596122 , -0.02360052, -0.01153345, -0.06395859],
       [ 0.01962403, -0.03958964, -0.04344456,  0.04233046,  0.03015471],
       [ 0.00277658,  0.05412876,  0.03600292, -0.06823713,  0.0716436 ],
       [-0.03855876, -0.01912527, -0.00789569, -0.09639236, -0.03363668],
       [-0.02408277, -0.03584455, -0.01871102,  0.00058589,  0.03086111],
       [-0.04019791,  0.03031926, -0.02510473, -0.0073264 ,  0.04797361],
       [ 0.00056963, -0.008749  , -0.0964853 ,  0.05038533,  0.09352905],
       [ 0.01957663,  0.03023343,  0.048843  , -0.01209556, -0.01905228],
       [-0.07536631, -0.05394083, -0.02035267, -0.05196862, -0.08270742],
       [-0.00458559, -0.04629245, -0.0375168 , -0.00401822,  0.02586285],
       [ 0.02078405,  0.09084024,  0.0

In [12]:
# Compare the learned embeddings of "nice" and "amazing".
# The row indices are looked up with one_hot() rather than hard-coded (15 and 3
# in the recorded run): one_hot() relies on Python's built-in hash(), which is
# not stable across interpreter sessions, so fixed numbers can point at the
# wrong words after a kernel restart.
nice_index = one_hot('nice', vocab_size)[0]
amazing_index = one_hot('amazing', vocab_size)[0]
print(weights[nice_index])
print(weights[amazing_index])

[-0.04804495 -0.02424912 -0.07106739 -0.03502652 -0.0979435 ]
[-0.06883483 -0.0596122  -0.02360052 -0.01153345 -0.06395859]


- The two word vectors are very similar.

In [13]:
# Compare the learned embeddings of "poor" and "amazing".
# This cell was previously labelled "Good" vs "Amazing", but row 13 is the index
# of "poor" in the recorded encoding ('poor service' -> [13, 2],
# 'poor quality' -> [13, 6]); "good" was encoded as 5.
# The indices are looked up with one_hot() instead of being hard-coded because
# one_hot() relies on Python's built-in hash(), which is not stable across
# interpreter sessions.
poor_index = one_hot('poor', vocab_size)[0]
amazing_index = one_hot('amazing', vocab_size)[0]
print(weights[poor_index])
print(weights[amazing_index])

[0.02078405 0.09084024 0.04138005 0.02842173 0.04474073]
[-0.06883483 -0.0596122  -0.02360052 -0.01153345 -0.06395859]


- The two word vectors are not similar.