In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Sample corpus
corpus = [
    "I love deep learning and neural networks",
    "deep learning is amazing and powerful",
    "I love AI and its applications",
    "AI is the future of technology",
    "Natural language processing is a branch of AI"
]

#Tokenize words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
words = tokenizer.word_docs
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

#Convert each word in the sentence to its corresponding index
sequences = tokenizer.texts_to_sequences(corpus)

# Create CBOW context-target pairs
window_size = 2
context_target_pairs = []

for seq in sequences:
    for i in range(window_size, len(seq) - window_size):
        context = seq[i - window_size:i] + seq[i + 1:i + 1 + window_size]
        target = seq[i]
        context_target_pairs.append((context, target))


# Prepare inputs and outputs
X, y = zip(*context_target_pairs)
X = np.array(X)
y = np.array(y)
y = to_categorical(y, num_classes=vocab_size)  # One-hot encoding

In [6]:
# Define CBOW Model
embedding_dim = 50  # Size of word embeddings
input_layer = Input(shape=(window_size * 2,), name="input")
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=window_size * 2, name="embedding")(input_layer)
context_vector = Lambda(lambda x: tf.reduce_mean(x, axis=1), name="context_vector")(embedding_layer)
output_layer = Dense(vocab_size, activation='softmax', name="output")(context_vector)

cbow_model = Model(input_layer, output_layer)
cbow_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
cbow_model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 852ms/step - accuracy: 0.0000e+00 - loss: 3.1385
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.0769 - loss: 3.1327
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.0769 - loss: 3.1270
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.1538 - loss: 3.1212
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.1538 - loss: 3.1154
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2308 - loss: 3.1096
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.3077 - loss: 3.1039
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.3846 - loss: 3.0981
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x78bcb0c7b790>

In [7]:
# Save the embeddings
embeddings = cbow_model.get_layer("embedding").get_weights()[0]
word_embeddings = {word: embeddings[idx] for word, idx in word_index.items()}