In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

In [4]:
# Creating a small sample text corpus for training the CBOW model.
corpus = [
    "I like deep learning",
    "I like natural language processing",
    "I enjoy machine learning",
    "deep learning is fun",
    "natural language processing is challenging"
]

# Initializing the tokenizer to convert text to numerical tokens.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)  # Learn the vocabulary in the corpus
total_words = len(tokenizer.word_index) + 1  # Adding 1 because of zero-index padding

# Convert each sentence into a list of word indices based on the learned vocabulary.
sequences = tokenizer.texts_to_sequences(corpus)

In [5]:
# Define a function to create input-output pairs for CBOW model.
# In CBOW, the model predicts a word based on its context (surrounding words).
# For example, in the sentence "I like deep learning", the target could be "deep" and the context could be ["I", "like", "learning"].

def generate_cbow_data(sequences, window_size=2):
    contexts = []
    targets = []
    for sequence in sequences:
        for i in range(window_size, len(sequence) - window_size):
            context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
            target = sequence[i]
            contexts.append(context)
            targets.append(target)
    return np.array(contexts), np.array(targets)

# Generate the input and output pairs for training
window_size = 2  # Defining the context window size
X, y = generate_cbow_data(sequences, window_size)

In [6]:
# In CBOW, we use an embedding layer to learn word embeddings.
# The model takes context words as input and predicts the target word.

embedding_dim = 50  # Dimension of the word embeddings

model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim, input_length=window_size * 2))  # Embedding layer
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embedding_dim,)))  # Average embeddings of context words
model.add(Dense(total_words, activation='softmax'))  # Dense layer to output a probability for each word in vocabulary

In [7]:
# Compile the model with a suitable loss function and optimizer
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model architecture
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 50)             650       
                                                                 
 lambda (Lambda)             (None, 50)                0         
                                                                 
 dense (Dense)               (None, 13)                663       
                                                                 
Total params: 1,313
Trainable params: 1,313
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Fit the model to the training data (input context words and target words)

epochs = 100  # Number of training epochs

# Train the model on the input-output pairs generated
history = model.fit(X, y, epochs=epochs, verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [9]:
# Define a function to predict the target word given a context of words

def predict_target_word(context, tokenizer, model):
    context_seq = tokenizer.texts_to_sequences([context])[0]
    padded_context = pad_sequences([context_seq], maxlen=window_size*2, padding='post')
    prediction = model.predict(padded_context)
    predicted_word_index = np.argmax(prediction)
    predicted_word = tokenizer.index_word[predicted_word_index]
    return predicted_word

# Test with a sample context (change "like deep" to see different predictions)
sample_context = "like deep"
predicted_word = predict_target_word(sample_context, tokenizer, model)
print(f"Predicted word for context '{sample_context}': {predicted_word}")


Predicted word for context 'like deep': natural
