In [None]:
Continuous Bag of Words (CBOW) Model Implementation

In [None]:
a. Data Preparation

In [1]:
import numpy as np
from collections import defaultdict
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Sample corpus
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "the dog barks at the fox",
    "the fox is quick and the dog is lazy",
    "dogs and foxes are different"
]

# Tokenize corpus to integer sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

sequences = tokenizer.texts_to_sequences(corpus)
vocab_size = len(word2idx) + 1  # +1 for padding (index 0)
print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 17


In [None]:
b. Generate Training Data for CBOW

In [2]:
window_size = 2
data = []

for sequence in sequences:
    for idx in range(window_size, len(sequence) - window_size):
        context = sequence[idx - window_size: idx] + sequence[idx + 1: idx + window_size + 1]
        target = sequence[idx]
        data.append((context, target))

print(f"Number of training samples: {len(data)}")

# Prepare inputs and outputs
X = []
y = []

for context, target in data:
    X.append(context)
    y.append(target)

X = np.array(X)
y = to_categorical(y, num_classes=vocab_size)
print(f"Sample context (word indices): {X[0]}, target (one-hot): {y[0]}")


Number of training samples: 13
Sample context (word indices): [1 4 2 9], target (one-hot): [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
c. Define and Train the CBOW Model

In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Lambda, Dense
import tensorflow.keras.backend as K

embedding_dim = 10

# Input: context words (window_size*2)
input_words = Input(shape=(window_size * 2,))

# Embedding layer
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=window_size * 2)(input_words)

# Average embeddings of context words
avg_embedding = Lambda(lambda x: K.mean(x, axis=1))(embedding)

# Output layer: softmax over vocab
output = Dense(vocab_size, activation='softmax')(avg_embedding)

# Build model
model = Model(inputs=input_words, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

# Train
model.fit(X, y, epochs=100, verbose=2)







Epoch 1/100
1/1 - 1s - 552ms/step - accuracy: 0.0769 - loss: 2.8340
Epoch 2/100
1/1 - 0s - 31ms/step - accuracy: 0.0769 - loss: 2.8323
Epoch 3/100
1/1 - 0s - 32ms/step - accuracy: 0.0769 - loss: 2.8306
Epoch 4/100
1/1 - 0s - 34ms/step - accuracy: 0.0769 - loss: 2.8289
Epoch 5/100
1/1 - 0s - 32ms/step - accuracy: 0.0769 - loss: 2.8272
Epoch 6/100
1/1 - 0s - 34ms/step - accuracy: 0.0769 - loss: 2.8255
Epoch 7/100
1/1 - 0s - 34ms/step - accuracy: 0.0769 - loss: 2.8238
Epoch 8/100
1/1 - 0s - 33ms/step - accuracy: 0.0769 - loss: 2.8221
Epoch 9/100
1/1 - 0s - 32ms/step - accuracy: 0.0769 - loss: 2.8204
Epoch 10/100
1/1 - 0s - 32ms/step - accuracy: 0.0769 - loss: 2.8187
Epoch 11/100
1/1 - 0s - 32ms/step - accuracy: 0.0769 - loss: 2.8171
Epoch 12/100
1/1 - 0s - 33ms/step - accuracy: 0.0769 - loss: 2.8154
Epoch 13/100
1/1 - 0s - 32ms/step - accuracy: 0.0769 - loss: 2.8137
Epoch 14/100
1/1 - 0s - 36ms/step - accuracy: 0.1538 - loss: 2.8120
Epoch 15/100
1/1 - 0s - 33ms/step - accuracy: 0.1538 - l

<keras.src.callbacks.history.History at 0x19aee2a46e0>

In [None]:
d. Output: Testing the Model's Prediction

In [4]:
def predict_word(context_words):
    context_seq = [word2idx.get(word, 0) for word in context_words]
    context_seq = np.array(context_seq).reshape(1, -1)
    pred = model.predict(context_seq)
    predicted_index = np.argmax(pred)
    return idx2word.get(predicted_index, "Unknown")

# Example test
test_context = ['the', 'quick', 'jumps', 'over']  # context words around a target
predicted_word = predict_word(test_context)
print(f"Given context words {test_context}, predicted target word is '{predicted_word}'")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Given context words ['the', 'quick', 'jumps', 'over'], predicted target word is 'and'
