In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Example textual document: a short three-paragraph passage comparing influenza
# and COVID-19 transmission. It is the only text the tokenizer is fitted on, so
# the vocabulary and every training sample in this notebook come from it.
text = """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 
Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 
The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.  """

# a. Data Preparation
# Fit a Keras Tokenizer on the single document, then derive the lookup tables
# and the integer-encoded token sequence that the later cells consume.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# word -> id table built by the tokenizer, and its inverse (id -> word).
word_index = tokenizer.word_index
index_word = dict(zip(word_index.values(), word_index.keys()))

# Ids in word_index start at 1 (Keras keeps 0 as a reserved index), so the
# embedding/softmax tables need one slot more than the number of words.
vocab_size = 1 + len(word_index)

print("Vocabulary Size:", vocab_size)

# texts_to_sequences returns one id list per input text; only one text was given.
(sequence,) = tokenizer.texts_to_sequences([text])
print("Tokenized Sequence Sample:", sequence[:20])

Vocabulary Size: 103
Tokenized Sequence Sample: [1, 38, 2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 43, 23, 3, 44, 11, 24, 45, 46]


In [3]:
# b. Generate Training Data (CBOW)

window_size = 2  # number of context tokens taken on EACH side of the target


def build_cbow_pairs(token_ids, window):
    """Slide a symmetric window over ``token_ids`` and collect CBOW samples.

    For every position that has ``window`` tokens on both sides, the
    surrounding ``2 * window`` ids form the context and the id at the centre
    is the target to predict.

    Args:
        token_ids: Sequence of integer token ids (one document).
        window: Number of context tokens to take on each side (>= 1).

    Returns:
        Tuple ``(contexts, targets)`` of NumPy integer arrays with shapes
        ``(n, 2 * window)`` and ``(n,)``. ``n`` is 0 when the sequence is
        too short to fit a single full window.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    contexts = []
    targets = []
    for center in range(window, len(token_ids) - window):
        # Context is derived from `window` itself, so changing the window size
        # changes the context width instead of silently staying at +/-2.
        left = token_ids[center - window:center]
        right = token_ids[center + 1:center + window + 1]
        contexts.append(list(left) + list(right))
        targets.append(token_ids[center])

    # reshape keeps a 2-D (0, 2*window) array even when no sample was produced.
    return (
        np.array(contexts, dtype=int).reshape(-1, 2 * window),
        np.array(targets, dtype=int),
    )


X_train, y_train = build_cbow_pairs(sequence, window_size)

print("\nContexts:", X_train[:5])
print("Targets:", y_train[:5])


Contexts: [[ 1 38  8  9]
 [38  2  9 39]
 [ 2  8 39 40]
 [ 8  9 40 41]
 [ 9 39 41  2]]
Targets: [ 2  8  9 39 40]


In [4]:
# c. Train CBOW Model
#
# Architecture: context ids -> shared Embedding -> mean over the context
# positions -> softmax over the vocabulary (predicts the centre word).

embedding_dim = 8

# Width of one context row; taken from the data so the model always matches
# the training samples instead of assuming exactly 4 context tokens.
context_len = X_train.shape[1]

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling are repeatable across "Restart Kernel -> Run All".
tf.keras.utils.set_random_seed(42)

input_layer = Input(shape=(context_len,))
emb = Embedding(vocab_size, embedding_dim)(input_layer)     # (batch, context_len, embedding_dim)
ctx_vector = GlobalAveragePooling1D()(emb)                  # (batch, embedding_dim): averaged context
output_layer = Dense(vocab_size, activation='softmax')(ctx_vector)

cbow_model = Model(input_layer, output_layer)
# Targets are integer ids (not one-hot), hence the sparse cross-entropy loss.
cbow_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)
print("\nCBOW Training Complete!")


CBOW Training Complete!


In [6]:
# d. Output – Show Word Embeddings
# In this functional model layers[0] is the input layer, so layers[1] is the
# Embedding layer; the first entry of its weights is the embedding table,
# indexed by token id.
embedding_layer = cbow_model.layers[1]
embedding_matrix = embedding_layer.get_weights()[0]

print("\nWord Embeddings:")
# Only the first 15 vocabulary entries are shown to keep the output short.
first_words = list(word_index.items())[:15]
for word, idx in first_words:
    print(word, "→", embedding_matrix[idx])


Word Embeddings:
the → [ 0.9222583  -0.09754413 -0.8018958  -1.6998988  -0.89543945 -1.4418062
  0.75204635 -0.5592648 ]
of → [ 0.42387807 -1.1995287  -1.1545013  -0.6816685  -0.9287516   1.0326133
  0.37496206  0.3509185 ]
influenza → [ 0.02816984  0.40199527  0.3169415   0.35371774 -0.0247595   0.5721351
  0.85322315 -0.7113122 ]
covid → [-1.2936139   1.1535894  -0.19268778  0.533567    0.16678195 -0.86747366
  0.9883332  -1.3324162 ]
19 → [-1.3353405   0.9549423  -1.0862166   0.00888202 -0.9551337  -0.2555767
  0.33645305 -1.6062005 ]
virus → [-1.0328373   0.6703569  -1.2942431   0.33901945 -0.47979197 -0.8345591
  0.5890001  -1.2150675 ]
for → [-0.95983714  0.9375205  -0.30439636 -0.07241176 -0.28195348  0.09129784
  1.0152482  -0.93621194]
transmission → [ 0.54052144 -0.84713537 -0.5014603  -0.08679602 -0.4304345  -0.8420979
  0.8798068   0.00793319]
is → [-0.4622395  -0.24205776 -1.0121287  -1.1411372  -0.524314    0.24079128
 -0.4467708   0.5301123 ]
to → [-0.76458627 -0.255694

In [7]:
def predict_missing(w1, w2, w3, w4):
    """Predict the centre word from its four surrounding context words.

    The words are given in reading order: the two words before the gap
    (``w1``, ``w2``) followed by the two words after it (``w3``, ``w4``).

    Args:
        w1, w2, w3, w4: Context words. String inputs are lower-cased before
            lookup because the tokenizer stored the vocabulary in lower case.

    Returns:
        The most probable vocabulary word, or the message
        ``"Word not found in vocabulary: <word>"`` when a context word is
        unknown.
    """
    context_words = [w.lower() if isinstance(w, str) else w for w in (w1, w2, w3, w4)]
    try:
        seq = np.array([word_index[w] for w in context_words]).reshape(1, 4)
    except KeyError as e:
        return f"Word not found in vocabulary: {e}"

    pred = np.asarray(cbow_model.predict(seq, verbose=0))
    # Id 0 is a reserved slot with no entry in index_word; skip it so the
    # lookup below can never fail, then shift back to real ids.
    pred_id = int(np.argmax(pred[0, 1:])) + 1
    return index_word[pred_id]

In [8]:
# Demo: fill the gap for two contexts taken from the training text.
print("\nPredictions:")
for context_words in (
    ("the", "speed", "transmission", "is"),
    ("the", "serial", "for", "virus"),
):
    print(predict_missing(*context_words))


Predictions:
of
virus
