In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
#a1. Direct Text Given then uncomment below line and Comment entire a2 till b (for PS:15)

# corpus = ["The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult."]

# a2. Read the corpus from a file and comment entire a1 till a2 (for PS:6 7 8)
corpus = []
with open("input.txt", "r") as f:
    for line in f:
        corpus.append(line.strip())


In [12]:
# b. Generate training data (CBOW representation)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(corpus)
X, y = [], []

for seq in sequences:
    for i, target_word in enumerate(seq):
        for j in range(max(0, i - 2), min(i + 3, len(seq))):
            if i != j:
                X.append([target_word, seq[j]])
                y.append(1)  # Positive context
                X.append([target_word, np.random.choice(list(word_index.values()))])
                y.append(0)

X = np.array(X)
y = np.array(y)

In [13]:
# c. Train model
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=1, input_length=2),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=0)
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 2, 1)              103       
                                                                 
 global_average_pooling1d_4   (None, 1)                0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 1)                 2         
                                                                 
Total params: 105
Trainable params: 105
Non-trainable params: 0
_________________________________________________________________


In [14]:
# d. Output
word_embeddings = model.layers[0].get_weights()[0]
for word, index in word_index.items():
    print(f"{word}: {word_embeddings[index]}")

the: [0.44851264]
of: [0.42343643]
influenza: [0.38433146]
covid: [0.28832778]
19: [0.23814265]
virus: [0.32547817]
for: [0.22984138]
transmission: [0.17172377]
is: [0.28095442]
to: [0.2365177]
a: [0.1466575]
and: [0.16988114]
between: [0.0302819]
time: [0.21945351]
serial: [-0.05483583]
interval: [0.13636404]
than: [0.16407128]
be: [0.18330681]
5: [0.15795103]
days: [0.2561155]
â€“: [0.2685065]
are: [-0.00483931]
viruses: [0.00557107]
shorter: [-0.03386703]
from: [0.27320492]
appearance: [0.1592428]
symptoms: [-0.09676122]
while: [0.04291558]
3: [0.04414548]
this: [-0.07349396]
that: [0.18115728]
can: [-0.16558623]
in: [-0.07215121]
major: [0.07390355]
driver: [0.22065672]
number: [-0.1431031]
2: [0.05324086]
speed: [-0.22872311]
an: [-0.18698213]
important: [-0.27105644]
point: [-0.11256487]
difference: [-0.44613817]
two: [-0.532095]
has: [0.17004985]
median: [0.0569774]
incubation: [-0.38547307]
period: [0.08605439]
infection: [-0.5099756]
successive: [-0.23716141]
cases: [-0.293739