In [None]:
!pip install numpy
!pip install tensorflow
!pip install keras


In [None]:
import numpy as np
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed, to_categorical

# Toy corpus: two short sentences that share most of their words
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
]

# Fit the tokenizer so every distinct word gets an integer index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# word_index holds non-zero indices only; the extra slot accounts for the padding value 0
total_words = 1 + len(tokenizer.word_index)

# Expand every sentence into all of its prefixes of length >= 2.
# Each prefix later yields one training example: its last token is the label
# and everything before it is the predictor.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(corpus):
    for prefix_end in range(2, len(token_list) + 1):
        input_sequences.append(token_list[:prefix_end])

# Left-pad with zeros so that all prefixes share the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split each padded row: all but the last token form the predictor,
# the last token (one-hot encoded over the vocabulary) is the label
predictors = input_sequences[:, :-1]
label = to_categorical(input_sequences[:, -1], num_classes=total_words)

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling start from the same state on every
# Restart & Run All.
SEED = 42
set_random_seed(SEED)

# Build the neural network model
model = Sequential()
# Declare the input explicitly (fixed-length rows of word indices). This replaces
# the deprecated `input_length` argument of `Embedding` and builds the model up front.
model.add(Input(shape=(max_sequence_len - 1,), dtype="int32"))
# One trainable 10-dimensional vector per vocabulary index (row 0 belongs to padding)
model.add(Embedding(input_dim=total_words, output_dim=10))
# First LSTM returns the full sequence so the second LSTM can consume it
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
# Probability distribution over the vocabulary for the next word
model.add(Dense(total_words, activation='softmax'))

# Compile the model (labels are one-hot encoded, hence categorical cross-entropy)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(predictors, label, epochs=100, verbose=2)


In [None]:
# Inspect what the Embedding layer outputs for the training inputs.
# The layer is called directly on the padded index matrix instead of going through
# the legacy `keras.backend.function` helper, so this cell does not depend on a
# backend API that newer Keras releases no longer provide.
embedding_layer_output = np.asarray(model.layers[0](predictors))
print(embedding_layer_output[0],'\n', embedding_layer_output[1])

# ========================================================
# An Embedding layer is nothing but a trainable layer whose weights are one vector
# per vocabulary word.
# - When the input is a vector of word indices, each element is looked up in the
#   embedding matrix to fetch its weight vector, so the output has one more
#   dimension than the input.
# - When the input is a scalar word index, that index is looked up in the embedding
#   matrix and the output is a 1-dimensional vector, which is exactly the word
#   embedding.
# ========================================================

[[-0.11833682 -0.10135332  0.11872561 -0.09491817  0.12412032 -0.06265501
  -0.09566948  0.12450426  0.10115235 -0.07511616]
 [-0.11833682 -0.10135332  0.11872561 -0.09491817  0.12412032 -0.06265501
  -0.09566948  0.12450426  0.10115235 -0.07511616]
 [-0.11833682 -0.10135332  0.11872561 -0.09491817  0.12412032 -0.06265501
  -0.09566948  0.12450426  0.10115235 -0.07511616]
 [-0.11833682 -0.10135332  0.11872561 -0.09491817  0.12412032 -0.06265501
  -0.09566948  0.12450426  0.10115235 -0.07511616]
 [ 0.07174324  0.00684089 -0.0886955   0.04066803 -0.0558828   0.06841912
   0.11332682 -0.00641044 -0.12822239  0.04211319]] 
 [[-0.11833682 -0.10135332  0.11872561 -0.09491817  0.12412032 -0.06265501
  -0.09566948  0.12450426  0.10115235 -0.07511616]
 [-0.11833682 -0.10135332  0.11872561 -0.09491817  0.12412032 -0.06265501
  -0.09566948  0.12450426  0.10115235 -0.07511616]
 [-0.11833682 -0.10135332  0.11872561 -0.09491817  0.12412032 -0.06265501
  -0.09566948  0.12450426  0.10115235 -0.0751161

In [None]:
# Vocabulary mapping (word -> integer index) learned by the tokenizer
word_index = tokenizer.word_index

# Weight matrix of the first layer: row i is the embedding of word index i
embeddings = model.layers[0].get_weights()[0]

# Show every vocabulary word next to its embedding row
for word, i in word_index.items():
    print(f"Word: {word}, Embedding: {embeddings[i]}")

Word: the, Embedding: [ 0.07174324  0.00684089 -0.0886955   0.04066803 -0.0558828   0.06841912
  0.11332682 -0.00641044 -0.12822239  0.04211319]
Word: sat, Embedding: [ 0.09968039  0.03270097 -0.0893901   0.0577278  -0.11386181  0.11115944
  0.02218372 -0.10860738 -0.07029856  0.03653787]
Word: on, Embedding: [ 0.11307956  0.09688695 -0.03744842  0.06424715 -0.10792501  0.07347568
  0.11631583 -0.01768629 -0.10164903  0.10215276]
Word: cat, Embedding: [ 0.04514093  0.07841094 -0.07238209  0.01869517 -0.11699299  0.10270458
  0.1307155  -0.08503766 -0.03440688  0.0876802 ]
Word: mat, Embedding: [-0.01083907  0.03675335 -0.04226586 -0.03329039 -0.0020213   0.04262426
  0.0178417   0.02759006  0.00922717 -0.01770638]
Word: dog, Embedding: [ 0.11248654 -0.01926392 -0.08886297  0.01913746 -0.06291626  0.1295239
  0.11268586  0.03601092 -0.09354326  0.03238495]
Word: log, Embedding: [ 0.03709486  0.04440585 -0.02342044 -0.02243969 -0.02530271  0.04582239
 -0.04829375  0.04778634 -0.04259117 

In [None]:
def predict_next_word(model, tokenizer, text_sequence, max_len=None):
    """
    Predict the next word based on the input text sequence.

    Parameters:
    - model: The trained Keras model.
    - tokenizer: The tokenizer used to preprocess the text data.
    - text_sequence: A string containing the sequence of text.
    - max_len: Optional number of tokens the model expects per input row.
      When omitted it is read from ``model.input_shape[1]``, so the function
      no longer depends on a notebook-level global.

    Returns:
    - The predicted next word.

    Notes:
    - Class 0 of the softmax is the padding slot and has no entry in
      ``tokenizer.index_word``; it is excluded from the arg-max so the lookup
      cannot fail with a ``KeyError``.
    - NOTE(review): words unknown to the tokenizer are presumably dropped by
      ``texts_to_sequences`` (no ``oov_token`` is configured); a prompt made
      only of unknown words would then reach the model as pure padding.
    """
    if max_len is None:
        # Second axis of the model's input shape is the expected sequence length
        max_len = model.input_shape[1]

    # Convert the text sequence to a sequence of integers
    sequence = tokenizer.texts_to_sequences([text_sequence])[0]
    # Left-pad the sequence to match the input shape of the model
    padded_sequence = pad_sequences([sequence], maxlen=max_len, padding='pre')
    # Probability distribution over the vocabulary for the single input row
    predictions = model.predict(padded_sequence, verbose=0)
    # Skip index 0 (padding) and shift the arg-max back to a real word index
    word_probabilities = predictions[0][1:]
    predicted_index = int(np.argmax(word_probabilities)) + 1
    # Map the integer index to the corresponding word
    return tokenizer.index_word[predicted_index]

# Example usage
# The prompt has to be built from words of the training corpus. A prompt such as
# "cats and dogs are" shares no word with the corpus, so (with no `oov_token`
# configured) the tokenizer is expected to drop every word and the model would
# only ever see padding.
text_sequence = "the cat sat on"
predicted_word = predict_next_word(model, tokenizer, text_sequence)
print(f"Given the sequence '{text_sequence}', the predicted next word is '{predicted_word}'.")


Given the sequence 'cats and dogs are', the predicted next word is 'friends'.


**Summary**<br>
1. An Embedding layer is nothing but a lookup matrix with one row per vocabulary word (the row number is the word index); the values in that row form the word's embedding vector.<br>
To convert a word index to an embedding, the Embedding layer simply looks up and returns the row that corresponds to that word index.<br>
2. The Embedding layer is itself a trainable layer: its weights are organized as a matrix whose rows are the word vectors.<br>
3. If the Embedding layer is trained as part of a specific downstream task, the embeddings are customized to that task and will most likely carry little general semantic meaning.<br>
If the Embedding layer is trained on a context-prediction task, such as this example or Word2Vec, the resulting embeddings are enriched with semantic meaning.

