Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 1) using the below steps: a. Data preparation b. Generate training data c. Train model d. Output

In [None]:
# ============================================
#   1) LOAD TEXT FROM DOCUMENT
# ============================================
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

# Fetch the NLTK data packages this script relies on (tokenizer data and
# the stop-word lists).
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)

# Read the whole document into memory as a single UTF-8 string.
with open("CBOW.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

print("Loaded text length:", len(text))

# ============================================
#   2) PREPROCESS TEXT
# ============================================
# Split the raw text into sentences; word-level cleanup happens in preprocess().
sentences = sent_tokenize(text)
# Kept as a set so the per-token membership test in preprocess() is cheap.
stop_words = set(stopwords.words("english"))

def preprocess(sentences):
    """Flatten sentences into one list of cleaned, lower-cased tokens.

    Each sentence is lower-cased and split with ``word_tokenize``. A token is
    kept only when it is purely alphanumeric and is not present in the
    module-level ``stop_words`` collection.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    def is_content_word(token):
        # Drops punctuation/mixed-symbol tokens and stop words.
        return token.isalnum() and token not in stop_words

    cleaned = []
    for sent in sentences:
        cleaned += filter(is_content_word, word_tokenize(sent.lower()))
    return cleaned

words = preprocess(sentences)
print("Total words:", len(words))

# create vocabulary
# Sort the unique words so that every run assigns the same index to the same
# word. Iterating a plain set of strings can yield a different order on each
# interpreter start (string hash randomization), which would silently change
# all indices, the printed example pair, and the meaning of any saved weights.
vocab = sorted(set(words))
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

# mapping words <-> indices
word_to_index = {w: i for i, w in enumerate(vocab)}
index_to_word = {i: w for w, i in word_to_index.items()}

# convert all words to indices (same order as `words`)
indexed_words = [word_to_index[w] for w in words]

# ============================================
#   3) GENERATE CONTEXT–TARGET PAIRS (CBOW)
# ============================================
def generate_cbow_pairs(words_idx, window=2):
    """Build (context, target) training pairs for CBOW.

    Every position that has ``window`` items available on both sides becomes
    a target. Its context is the ``window`` items before it followed by the
    ``window`` items after it. Positions closer than ``window`` to either end
    of the sequence produce no pair.

    Args:
        words_idx: list of word indices in corpus order.
        window: number of context items taken from each side of the target.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is a list
        and ``target`` is the item at the centre position.
    """
    stop = len(words_idx) - window
    return [
        (
            words_idx[center - window:center]
            + words_idx[center + 1:center + 1 + window],
            words_idx[center],
        )
        for center in range(window, stop)
    ]

pairs = generate_cbow_pairs(indexed_words, window=2)

# Each pair is (context_indices, target_index).
print("Example Pair:", pairs[0])

# Separate the pairs into model inputs (X) and labels (y).
# X: one row per pair, holding that pair's context word indices.
contexts = np.array([ctx for ctx, _ in pairs])
# y: one-hot rows of width vocab_size, one per target index.
targets = to_categorical([tgt for _, tgt in pairs], num_classes=vocab_size)

# ============================================
#   4) BUILD CBOW MODEL
# ============================================
embedding_dim = 20  # you can increase

# Number of context indices per training row. Read it from the data instead
# of hard-coding 4, so a different window size cannot desync the model input.
context_size = contexts.shape[1]

model = Sequential()
# Shapes through the network:
#   Input     (batch_size, context_size)                 context word indices,
#                                                        each in 0 .. vocab_size-1
#   Embedding (batch_size, context_size, embedding_dim)  one dense vector per index
#   Mean      (batch_size, embedding_dim)                averaged context vector
#   Dense     (batch_size, vocab_size)                   softmax score per word
#
# The explicit Input layer replaces Embedding's `input_length` argument, which
# is deprecated in Keras 3; declaring the input shape up front also lets
# model.summary() report concrete shapes before training.
model.add(tf.keras.Input(shape=(context_size,), dtype="int32"))
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))  # average embeddings
model.add(Dense(vocab_size, activation='softmax'))

# categorical_crossentropy matches the one-hot `targets`.
# Adam optimizer adapts the learning rate automatically.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# ============================================
#   5) TRAIN
# ============================================
model.fit(contexts, targets, epochs=30, batch_size=64)

# ============================================
#   6) TEST: PREDICT A MISSING WORD
# ============================================
def predict_missing_word(context_words):
    """Predict the centre word for a list of four context words.

    Args:
        context_words: the four context words (two left, two right of the
            missing word). Case is ignored.

    Returns:
        The vocabulary word that receives the highest score from ``model``.

    Raises:
        KeyError: if any context word is not in ``word_to_index``.
    """
    # The vocabulary was built from lower-cased tokens, so normalise the query
    # the same way; otherwise "Transmission" would be reported as unknown.
    lowered = [w.lower() for w in context_words]

    unknown = [w for w in lowered if w not in word_to_index]
    if unknown:
        # Same exception type as a plain dict miss, but names every bad word.
        raise KeyError(f"Context word(s) not in vocabulary: {unknown}")

    ctx_idx = [word_to_index[w] for w in lowered]
    # The model takes a batch; wrap the single query as a 1 x 4 array.
    ctx_idx = np.array(ctx_idx).reshape(1, 4)
    pred = model.predict(ctx_idx)[0]
    predicted_word = index_to_word[int(np.argmax(pred))]
    return predicted_word

print("\nPrediction Example:")
# Reference sentences for the two queries below:
#   The speed of transmission is an important point of difference between the two viruses
#   Influenza has a shorter median incubation period (the time from infection to appearance of symptoms)
example_contexts = (
    ["transmission", "important", "point", "difference"],
    ["incubation", "period", "infection", "symptoms"],
)
for query in example_contexts:
    print(predict_missing_word(query))


Loaded text length: 1193
Total words: 88
Vocabulary size: 60
Example Pair: ([32, 52, 19, 42], 43)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.0342 - loss: 4.0937
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0657 - loss: 4.0903
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.0657 - loss: 4.0879
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0868 - loss: 4.0856
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1262 - loss: 4.0831
Epoch 6/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1238 - loss: 4.0812
Epoch 7/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1238 - loss: 4.0787
Epoch 8/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.1947 - loss: 4.0761 
Epoch 9/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
serial
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
driver


Input contexts: (batch, 4) 
  ↓
Embedding → (batch, 4, 20)  [4 words, each 20-dim]
  ↓
Lambda/Mean → (batch, 20)   [averaged context vector]
  ↓
Dense Softmax → (batch, vocab_size)  [probability per word]
  ↓
argmax → predicted word index

The Continuous Bag of Words (CBOW) model is a neural embedding technique used to learn dense vector representations of words based on their surrounding context. The goal of CBOW is to predict a target word given its nearby context words. During training, the model takes multiple context words (within a fixed window size), converts them into embeddings, averages these embeddings to generate a context representation, and then feeds this into a softmax classifier to predict the center word. By minimizing categorical cross-entropy loss over large corpora, CBOW learns semantic relationships, where words appearing in similar contexts end up with similar embedding vectors. This enables the embeddings to capture syntactic and semantic regularities that support tasks such as analogy solving, similarity search, and clustering. CBOW is computationally efficient and is one of the two architectures of Word2Vec (the other being Skip-gram), which is used in many NLP tasks.

✅ 1. Core Theory: What is CBOW? (Exam-Perfect Explanation)

CBOW (Continuous Bag of Words) is a Word2Vec model that:

Takes context words around a missing word

And predicts the target word in the center

Learns dense vector embeddings by training on many context–target pairs

Uses an embedding layer to convert words to vectors

Uses averaging of context embeddings

Uses softmax to predict the missing word

Learns semantic meaning: words used in similar contexts get similar vectors

In simple terms:
Feed the context → model guesses the missing word.
The model becomes good at this, and that’s how it learns word embeddings.

✅ 2. Code Explanation (Very Detailed & Clean)

Let’s go section-by-section.

✅ 1) Loading Text
with open("CBOW.txt", "r", encoding="utf-8") as f:
    text = f.read()


Reads the textual document.

NLTK is used for tokenization and stopwords.

✅ 2) Preprocessing the Text
Steps done:

Sentence tokenization

Word tokenization

Lowercasing

Remove stopwords (the, is, are…)

Remove non-alphanumeric junk

Create a clean list of useful words

This builds your corpus.

Vocabulary Creation
vocab = list(set(words))


Makes a list of all unique words.

A word ↔ index mapping is then created in both directions.

This is essential for embedding lookup.

✅ 3) Generate CBOW Training Data

CBOW works like this:

context → target


If window = 2:

context: w1 w2   [target word]   w3 w4


The code:

for i in range(window, len(words_idx)-window):
    context = words_idx[i-window:i] + words_idx[i+1:i+1+window]
    target  = words_idx[i]


So for:

I love deep learning models


with window 2:

context: ["I", "love", "learning", "models"]
target:  "deep"


Then contexts → array of size (N,4)
Targets → one-hot vectors of vocabulary size.

This is the most important exam concept.

✅ 4) Building CBOW Model

Architecture:

✅ Embedding Layer
Embedding(vocab_size, 20)


Converts each word index → dense embedding vector (20-dim)

✅ Lambda (Mean) Layer
tf.reduce_mean(x, axis=1)


Takes 4 embeddings

Averages them

Produces Context Vector
This is the CBOW trick: "Bag of Words" → order doesn’t matter.

✅ Dense Softmax Layer
Dense(vocab_size, activation='softmax')


Predicts which word in the vocabulary is the center word.

✅ Loss + Optimizer

Uses categorical cross entropy because output = probability distribution.

Adam optimizer for stable training.

✅ 5) Training
model.fit(contexts, targets)


The model learns to guess the missing center word.

During training, the embeddings gradually become meaningful.

✅ 6) Predict Missing Word

You give 4 context words:

["transmission", "important", "point", "difference"]


Model outputs the most likely missing center word.

✅ 3. Most Important Terminologies (Exam Gold)
Term	Meaning
CBOW	Predicts center word using context words
Context Window	Number of words taken on each side of the target
Vocabulary	All unique words in corpus
Embedding layer	Converts words → dense vectors
One-hot encoding	Target word in vector form
Softmax	Predicts probability of each word
Context averaging	CBOW’s main mechanism
Training pair	(context, target)
Word2Vec	Embedding learning framework
Latent Embedding Space	Meaningful vector space of words