In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
#a1. Direct Text Given then uncomment below line and Comment entire a2 till b (for PS:15)

# corpus = ["The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult."]

# a2. Read the corpus from a file and comment entire a1 till a2 (for PS:6 7 8)
corpus = []
with open("input.txt", "r") as f:
    for line in f:
        corpus.append(line.strip())


In [3]:
# b. Generate training data (CBOW representation)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(corpus)
X, y = [], []

for seq in sequences:
    for i, target_word in enumerate(seq):
        for j in range(max(0, i - 2), min(i + 3, len(seq))):
            if i != j:
                X.append([target_word, seq[j]])
                y.append(1)  # Positive context

X = np.array(X)
y = np.array(y)

In [4]:
# c. Train model
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=1, input_length=2),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=0)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 1)              103       
                                                                 
 global_average_pooling1d (G  (None, 1)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 2         
                                                                 
Total params: 105
Trainable params: 105
Non-trainable params: 0
_________________________________________________________________


In [5]:
# d. Output
word_embeddings = model.layers[0].get_weights()[0]
for word, index in word_index.items():
    print(f"{word}: {word_embeddings[index]}")

the: [1.4227768]
of: [1.4117383]
influenza: [1.331591]
covid: [1.3318784]
19: [1.3385216]
virus: [1.3108082]
for: [1.3532672]
transmission: [1.2916962]
is: [1.3509651]
to: [1.3126384]
a: [1.3087549]
and: [1.2581351]
between: [1.2195219]
time: [1.2202162]
serial: [1.1659225]
interval: [1.1579407]
than: [1.2086002]
be: [1.2028954]
5: [1.2263665]
days: [1.2158728]
â€“: [1.2230864]
are: [1.2367167]
viruses: [1.08687]
shorter: [1.1330214]
from: [1.1209661]
appearance: [1.0923854]
symptoms: [1.0669883]
while: [1.1353643]
3: [1.1309382]
this: [1.1542575]
that: [1.1199164]
can: [1.1731356]
in: [1.08782]
major: [1.0843703]
driver: [1.0649897]
number: [1.1037824]
2: [1.0498224]
speed: [0.79418427]
an: [0.9069256]
important: [0.9591868]
point: [0.91881233]
difference: [0.92780584]
two: [0.8843826]
has: [0.9125258]
median: [0.96971565]
incubation: [0.9783003]
period: [0.88046646]
infection: [0.95755357]
successive: [0.95078427]
cases: [0.89733]
estimated: [0.8717715]
6: [0.9283257]
means: [0.88846