In [1]:
# a. Data Preparation

import numpy as np
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    Flatten,
    GlobalAveragePooling1D,
    Input,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Training corpus: swap in your own text/document here.
# Built from adjacent string literals; the value starts with a newline and
# every line ends with one.
text = (
    "\n"
    "natural language processing is a field of artificial intelligence\n"
    "that focuses on understanding and generating human language\n"
)

In [3]:
# Tokenization: fit the tokenizer on the single-document corpus.
corpus = [text]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Forward (word -> id) and reverse (id -> word) lookup tables.
word_index = tokenizer.word_index
index_word = dict(zip(word_index.values(), word_index.keys()))

# One slot more than the number of distinct words; the printed sequence shows
# word ids starting at 1, so index 0 is left unused.
vocab_size = 1 + len(word_index)

# Encode the document as a list of integer word ids.
encoded_texts = tokenizer.texts_to_sequences(corpus)
sequence = encoded_texts[0]

print("Vocabulary Size:", vocab_size)
print("Sequence:", sequence)


Vocabulary Size: 17
Sequence: [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1]


In [4]:
# b. Generate Training Data (CBOW)
#
# For every position that has a full window of neighbours on both sides, emit
# one sample: the surrounding word ids (context) -> the centre word id (target).

window_size = 2   # number of context words taken on EACH side of the target

# Context positions relative to the target, e.g. [-2, -1, 1, 2] for
# window_size = 2. Derived from window_size so that changing the window really
# changes the context (the offsets used to be hard-coded to +/-1 and +/-2).
context_offsets = [
    offset for offset in range(-window_size, window_size + 1) if offset != 0
]

X_train = []
y_train = []

# Skip the first/last `window_size` positions: they lack a complete context.
for i in range(window_size, len(sequence) - window_size):
    context = [sequence[i + offset] for offset in context_offsets]
    target = sequence[i]

    X_train.append(context)
    y_train.append(target)

# Pin dtype and the 2-D shape explicitly: when the text is too short to yield
# any sample, a bare np.array([]) would be a float array of shape (0,) instead
# of an integer array of shape (0, 2 * window_size).
X_train = np.array(X_train, dtype=int).reshape(len(y_train), len(context_offsets))
y_train = np.array(y_train, dtype=int)

print("\nContext samples:", X_train[:5])
print("Target samples:", y_train[:5])


Context samples: [[2 1 4 5]
 [1 3 5 6]
 [3 4 6 7]
 [4 5 7 8]
 [5 6 8 9]]
Target samples: [3 4 5 6 7]


In [5]:
# c. Train CBOW Model
#
# Architecture: context word ids -> shared Embedding -> average over the
# context positions -> softmax over the vocabulary (predicts the centre word).

embedding_dim = 8

# Take the context width from the training data rather than hard-coding 4, so
# the model stays in sync with the window used to build X_train.
context_size = X_train.shape[1]

input_layer = Input(shape=(context_size,))
embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)

# CBOW treats the context as an unordered bag of words: the context vectors are
# averaged into one embedding_dim-sized vector. (A per-position Dense followed
# by Flatten would instead concatenate them and make the model order-sensitive.)
hidden = GlobalAveragePooling1D()(embedding_layer)

output_layer = Dense(vocab_size, activation="softmax")(hidden)

cbow_model = Model(inputs=input_layer, outputs=output_layer)
# Targets are integer word ids (not one-hot), hence the sparse loss.
cbow_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)

print("\nCBOW Model Training Completed!")


CBOW Model Training Completed!


In [6]:
# d. Output – Word Embeddings

# layers[1] is the layer created right after the Input (the Embedding layer);
# the first entry of its weight list is the embedding matrix, indexed by word id.
layer_weights = cbow_model.layers[1].get_weights()
embedding_weights = layer_weights[0]

print("\nWord Embeddings:")
# Walk the vocabulary in word_index order and show each word's learned vector.
for word in word_index:
    idx = word_index[word]
    print(f"{word} → {embedding_weights[idx]}")


Word Embeddings:
language → [ 0.34864694 -0.28352225 -0.32954106 -0.330175   -0.32579336  0.02985558
  0.31340995  0.12374063]
natural → [-0.19336072 -0.1652574   0.18348812  0.23212071  0.08073027 -0.13541135
  0.25127044 -0.16602725]
processing → [ 0.19909224  0.25074974 -0.24702932 -0.14326274 -0.14292747 -0.02806257
 -0.26737836  0.23708367]
is → [ 0.37902024  0.03086817 -0.05386233 -0.3213639  -0.29289034  0.31723383
 -0.21881221  0.38232526]
a → [-0.24571685 -0.27222005  0.2659032   0.18513367  0.05762469 -0.03387557
 -0.23578599 -0.09254494]
field → [-0.39727536  0.21746983  0.38364807  0.400948    0.27869764  0.09220381
 -0.19249654 -0.33858314]
of → [ 0.3597302  -0.26245454 -0.07020467 -0.06801084  0.09115064  0.26831895
  0.27479678  0.30538726]
artificial → [ 0.11438134 -0.06007174 -0.2594983   0.05602751 -0.23948148 -0.37301543
  0.1384317   0.14984472]
intelligence → [-0.1330566  -0.32150668  0.27159727  0.08834978  0.10839998  0.10225257
  0.22744845 -0.12984863]
that → 