In [2]:

#from keras.preprocessing import text
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd

In [3]:
data =  """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 
Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 
"""
dl_data = data.split()

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dl_data)

word2id = tokenizer.word_index #ishant : 1 "sharayu" :2
word2id['PAD'] = 0

id2words = {v:k for k,v in word2id.items()}

wids = [[word2id[w] for w in text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100
window_size =2

print("Vocabulary size: ", vocab_size)
print("Vocabulary sample: ", list(word2id.items())[:10])

Vocabulary size:  81
Vocabulary sample:  [('the', 1), ('of', 2), ('transmission', 3), ('virus', 4), ('is', 5), ('influenza', 6), ('a', 7), ('to', 8), ('covid', 9), ('19', 10)]


In [5]:
def generate_context_word_pair(corpus,window_size, vocab_size):
    context_length = window_size*2

    for words in corpus:
        sentence_length = len(words) 

        for index, word in enumerate(words): # [1 0 2 4 5]
            context_words = []
            label_word = []
            start = index-window_size
            end = index+window_size +1

            context_words.append([words[i]
                                  for i in range(start,end)
                                  if 0 <= i < sentence_length and i != index])
            label_word.append(word)
            
            x = pad_sequences(context_words, maxlen= context_length)
            y = to_categorical(label_word,vocab_size)
            yield (x,y)

In [7]:
from keras.layers import Input
cbow = Sequential()
cbow.add(Input(shape=(window_size * 2,)))
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size))
cbow.add(Lambda(lambda x: K.mean(x,axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation = "softmax"))
cbow.compile(loss = "categorical_crossentropy", optimizer = "rmsprop")
print(cbow.summary())

None


In [6]:
for epoch in range(1,6):
    loss = 0

    for x,y in generate_context_word_pair(corpus=wids,window_size=window_size,vocab_size=vocab_size):
        loss += cbow.train_on_batch(x,y)
    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 641.1716136932373

Epoch: 2 	Loss: 637.1033148765564

Epoch: 3 	Loss: 631.6801605224609

Epoch: 4 	Loss: 627.7058334350586

Epoch: 5 	Loss: 624.9515080451965



In [10]:
weights = cbow.get_weights()[0]
#weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights,index=list(id2words.values())).head()

(81, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.004815,0.006754,-0.086791,0.096936,0.040963,0.040461,0.064534,0.03499,-0.002118,0.043349,...,0.002195,0.029286,-0.013582,-0.113106,0.014076,-0.035032,-0.017602,0.089543,-0.031172,-0.070135
of,-0.029084,-0.009117,-0.032017,0.03566,-0.038507,-0.011048,0.028994,-0.039945,-0.019305,0.025232,...,0.022102,-0.042298,-0.031882,-0.037099,-0.048667,-0.03365,0.043979,-0.045087,-0.004227,-0.035739
transmission,-0.0409,-0.00534,-0.047037,-0.007137,0.001978,0.042742,-0.011289,0.012471,0.034263,0.010296,...,0.045291,-0.020437,0.005371,0.03328,-0.015768,-0.025515,-0.035409,0.006361,0.015024,-0.010385
virus,0.029145,-0.014753,-0.038871,0.02638,-0.001047,0.042854,-0.030719,0.042499,-0.010629,-0.016621,...,-0.004158,-0.03149,0.042512,0.015915,-0.020992,-0.031081,-0.014198,0.025636,0.018408,0.022105
is,-0.013106,0.014141,-0.021812,-0.046914,-0.032404,0.015309,0.001895,-0.002291,0.044375,-0.01521,...,0.046496,-0.045101,0.049782,0.00535,-0.006712,-0.037921,0.015315,-0.030194,0.037116,0.009324


In [11]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape) 
# 0 1 2 3 4

inwords = input()

similar_words = { inwords : [id2words[idx] for idx in distance_matrix[word2id[inwords]].argsort()[0:6]]}
similar_words

(81, 81)


 virus


{'virus': ['virus', 'present', 'from', 'at', 'illness', 'potentially']}