In [2]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# REPLACE THIS WITH YOUR TEXTUAL DOCUMENT 3
text = """
    Artificial intelligence and machine learning are transforming industries. 
    Companies use data to train models that can understand language, 
    make predictions, and automate complex tasks.
"""
# PUT YOUR DOCUMENT 3 ABOVE ↑↑↑

In [4]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
vocab_size = len(word_index) + 1

sequence = tokenizer.texts_to_sequences([text])[0]

print("Vocabulary Size:", vocab_size)
print("Sequence:", sequence)

Vocabulary Size: 24
Sequence: [2, 3, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 21, 22, 23]


In [5]:
# b. Generate Training Data (CBOW)

window_size = 2
X_train = []
y_train = []

for i in range(window_size, len(sequence) - window_size):
    context = [
        sequence[i - 2],
        sequence[i - 1],
        sequence[i + 1],
        sequence[i + 2]
    ]
    target = sequence[i]

    X_train.append(context)
    y_train.append(target)

X_train = np.array(X_train)
y_train = np.array(y_train)

print("\nContext samples:\n", X_train[:5])
print("\nTarget samples:\n", y_train[:5])


Context samples:
 [[2 3 4 5]
 [3 1 5 6]
 [1 4 6 7]
 [4 5 7 8]
 [5 6 8 9]]

Target samples:
 [1 4 5 6 7]


In [6]:
# c. Train Model (CBOW)

embedding_dim = 8

input_layer = Input(shape=(4,))
embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)

hidden = Dense(embedding_dim, activation="linear")(embedding_layer)
hidden = Flatten()(hidden)

output_layer = Dense(vocab_size, activation="softmax")(hidden)

cbow_model = Model(inputs=input_layer, outputs=output_layer)
cbow_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)

print("\nCBOW Model Training Complete!")


CBOW Model Training Complete!


In [7]:
# d. Output – Word Embeddings

from tensorflow.keras.layers import Embedding

embedding_layer_obj = [layer for layer in cbow_model.layers if isinstance(layer, Embedding)][0]
weights = embedding_layer_obj.get_weights()[0]

print("\nWord Embeddings:")
for word, idx in word_index.items():
    print(f"{word} → {weights[idx]}")


Word Embeddings:
and → [ 0.33388063  0.23897627 -0.00245166 -0.34911847  0.19323072 -0.1758215
 -0.30447924  0.3047515 ]
artificial → [ 0.10225158 -0.18491912 -0.22186331  0.09288968 -0.16419594 -0.22215068
  0.18461749 -0.26058343]
intelligence → [-0.3011102   0.27340502  0.29981354  0.20266679 -0.30358985  0.13303886
 -0.28277218  0.11462356]
machine → [-0.17299399  0.3010616   0.32244766  0.2808829  -0.25201517  0.2042182
 -0.04099383 -0.19004552]
learning → [ 0.32577705 -0.10726667 -0.16211052 -0.4324983   0.32738975  0.01339934
  0.26947537  0.22899538]
are → [ 0.40127534  0.35410333  0.33594838 -0.23192279  0.29714763  0.37725812
 -0.1955886   0.45401743]
transforming → [-0.26276848  0.42021564  0.3244319   0.3972211  -0.396914    0.10345405
  0.29359192 -0.3238096 ]
industries → [-0.29452285 -0.01761549 -0.05606993  0.10231233 -0.2870479  -0.18836977
  0.26843014 -0.27133512]
companies → [-0.37942615 -0.08160608 -0.12732147  0.11819732 -0.19914638 -0.25165638
  0.25201163 -0.34

In [8]:
# EXTRA SECTION: CHECK IF CBOW MODEL IS WORKING
# =====================================================

from numpy import dot
from numpy.linalg import norm

# Similarity function
def similarity(w1, w2):
    v1 = weights[word_index[w1]]
    v2 = weights[word_index[w2]]
    return dot(v1, v2) / (norm(v1) * norm(v2))

print("\nChecking Word Similarity:")
print("Similarity(learning, intelligence):", similarity("learning", "intelligence"))
print("Similarity(models, predictions):", similarity("models", "predictions"))
print("Similarity(data, companies):", similarity("data", "companies"))


# Predict the missing target word from context
context_words = ["machine", "learning", "are", "transforming"]

context_ids = np.array([[word_index[w] for w in context_words]])

pred = cbow_model.predict(context_ids)
predicted_word = index_word[np.argmax(pred)]

print("\nContext:", context_words)
print("Predicted Missing Word:", predicted_word)


Checking Word Similarity:
Similarity(learning, intelligence): -0.7753357
Similarity(models, predictions): -0.45635945
Similarity(data, companies): 0.85118765
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step

Context: ['machine', 'learning', 'are', 'transforming']
Predicted Missing Word: can
