In [9]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing.sequence import make_sampling_table

In [21]:
# Example textual document
text = """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 
Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 
The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.  """

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}

vocab_size = len(word_index) + 1
print("\nVocabulary Size:", vocab_size)

# Convert text to sequence of indices
sequence = tokenizer.texts_to_sequences([text])[0]
print("\nWord Index Sequence:", sequence)


Vocabulary Size: 103

Word Index Sequence: [1, 38, 2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 43, 23, 3, 44, 11, 24, 45, 46, 47, 1, 14, 25, 48, 10, 26, 2, 27, 12, 11, 24, 15, 16, 1, 14, 13, 49, 50, 17, 4, 5, 6, 1, 15, 16, 7, 4, 5, 6, 9, 51, 10, 18, 19, 52, 20, 28, 7, 3, 6, 1, 15, 16, 9, 29, 20, 30, 53, 31, 3, 32, 54, 55, 17, 4, 5, 56, 8, 33, 1, 57, 29, 19, 20, 2, 58, 59, 60, 61, 62, 8, 63, 2, 1, 6, 64, 1, 26, 2, 27, 21, 9, 11, 34, 35, 2, 8, 7, 3, 33, 65, 28, 66, 22, 67, 31, 68, 22, 69, 70, 32, 71, 4, 5, 6, 72, 73, 74, 75, 10, 76, 77, 78, 79, 30, 80, 81, 82, 10, 18, 11, 34, 35, 2, 8, 1, 83, 36, 21, 1, 36, 2, 84, 85, 86, 25, 87, 88, 89, 21, 9, 90, 10, 18, 13, 37, 12, 37, 19, 7, 4, 5, 6, 91, 17, 7, 3, 92, 93, 7, 94, 4, 5, 12, 3, 23, 22, 95, 96, 12, 14, 97, 98, 99, 100, 101, 102]


In [22]:
# b. Generate Training Data (CBOW)
# --------------------------------------

window_size = 2
X_train = []
y_train = []

for i in range(window_size, len(sequence) - window_size):
    context = [
        sequence[i - 2], sequence[i - 1],
        sequence[i + 1], sequence[i + 2]
    ]
    target = sequence[i]

    X_train.append(context)
    y_train.append(target)

X_train = np.array(X_train)
y_train = np.array(y_train)

print("\nContext Samples:\n", X_train[:5])
print("\nTarget Samples:\n", y_train[:5])


Context Samples:
 [[ 1 38  8  9]
 [38  2  9 39]
 [ 2  8 39 40]
 [ 8  9 40 41]
 [ 9 39 41  2]]

Target Samples:
 [ 2  8  9 39 40]


In [23]:
# c. Train CBOW Model

embedding_dim = 8

input_layer = Input(shape=(4,))

# Embedding layer without deprecated argument
embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)

# Average / combine embeddings
aggregated = Dense(embedding_dim, activation='linear')(embedding_layer)
aggregated = Flatten()(aggregated)

output_layer = Dense(vocab_size, activation='softmax')(aggregated)

cbow_model = Model(inputs=input_layer, outputs=output_layer)
cbow_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)
print("\nCBOW Model Training Complete!")


CBOW Model Training Complete!


In [24]:
# d. Output – Show Word Embeddings

from tensorflow.keras.layers import Embedding

# Get embedding layer safely
embedding_layer = [layer for layer in cbow_model.layers if isinstance(layer, Embedding)][0]
weights = embedding_layer.get_weights()[0]

print("\nWord Embeddings:")
for word, idx in word_index.items():
    print(word, "→", weights[idx])


Word Embeddings:
the → [ 0.381921   -0.3827612   0.28232676 -0.32462665 -0.10854999 -0.27674514
  0.20162693  0.22187145]
of → [ 0.19765933 -0.07328451  0.20110995 -0.51838523 -0.6149985  -0.30306268
 -0.990077    0.18278612]
influenza → [-0.4614482   0.17929237 -0.47589657  0.01708895 -0.35131395 -0.25184318
 -0.6732528  -0.36920613]
covid → [-0.62282014 -0.6667728  -0.75405055  0.39089897 -0.427028   -0.5551343
  0.2281577  -0.74599177]
19 → [ 0.6063177  -0.3705299   0.6056767  -0.47086182 -0.6774352  -0.30083582
 -0.7296897   0.6954413 ]
virus → [ 0.42701447 -0.6432322   0.43086287 -0.4503687   0.00320087 -0.545937
  0.16214457  0.31596908]
for → [-0.38781494  0.15050043 -0.37849852  0.06228952 -0.32261467 -0.27666324
 -0.8479714   0.18981236]
transmission → [ 0.31639302 -0.38352185  0.3605921  -0.49475974 -0.41403297 -0.01651763
 -0.26824364  0.46866363]
is → [ 0.41584423 -0.01606878  0.7229365   0.15565501  0.579466    0.46817452
  0.46909198  0.17961913]
to → [ 0.12863766 -0.619

In [25]:
def predict_missing(w1, w2, w3, w4):
    context = [w1, w2, w3, w4]
    seq = [word_index[w] for w in context]
    seq = np.array(seq).reshape(1,4)
    
    pred = cbow_model.predict(seq)
    pred_id = np.argmax(pred)
    return index_word[pred_id]


In [27]:
print(predict_missing("the", "speed", "transmission", "is"))
print(predict_missing("the", "serial", "for", "virus"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
than
