In [1]:

#from keras.preprocessing import text
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd

In [2]:
data =  """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 
Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 
"""
dl_data = data.split()

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dl_data)

word2id = tokenizer.word_index #ishant : 1 "sharayu" :2
word2id['PAD'] = 0

id2words = {v:k for k,v in word2id.items()}

wids = [[word2id[w] for w in text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100
window_size =2

print("Vocabulary size: ", vocab_size)
print("Vocabulary sample: ", list(word2id.items())[:10])

Vocabulary size:  81
Vocabulary sample:  [('the', 1), ('of', 2), ('transmission', 3), ('virus', 4), ('is', 5), ('influenza', 6), ('a', 7), ('to', 8), ('covid', 9), ('19', 10)]


In [4]:
def generate_context_word_pair(corpus,window_size, vocab_size):
    context_length = window_size*2

    for words in corpus:
        sentence_length = len(words) 

        for index, word in enumerate(words): # [1 0 2 4 5]
            context_words = []
            label_word = []
            start = index-window_size
            end = index+window_size +1

            context_words.append([words[i]
                                  for i in range(start,end)
                                  if 0 <= i < sentence_length and i != index])
            label_word.append(word)
            
            x = pad_sequences(context_words, maxlen= context_length)
            y = to_categorical(label_word,vocab_size)
            yield (x,y)

In [5]:
from keras.layers import Input
cbow = Sequential()
cbow.add(Input(shape=(window_size * 2,)))
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size))
cbow.add(Lambda(lambda x: K.mean(x,axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation = "softmax"))
cbow.compile(loss = "categorical_crossentropy", optimizer = "rmsprop")
print(cbow.summary())

None


In [6]:
for epoch in range(1,6):
    loss = 0

    for x,y in generate_context_word_pair(corpus=wids,window_size=window_size,vocab_size=vocab_size):
        loss += cbow.train_on_batch(x,y)
    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 640.6734366416931

Epoch: 2 	Loss: 636.4515295028687

Epoch: 3 	Loss: 631.1207275390625

Epoch: 4 	Loss: 627.2745604515076

Epoch: 5 	Loss: 624.5936894416809



In [7]:
weights = cbow.get_weights()[0]
#weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights,index=list(id2words.values())).head()

(81, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.05333,0.011227,-0.173072,-0.152492,0.062882,-0.010151,0.001718,0.076696,0.031424,-0.026211,...,-0.015818,0.00169,-0.018912,-0.013244,0.013348,-0.038829,-0.067512,-0.048054,-0.017963,-0.076511
of,-0.015303,-0.000612,0.029169,-0.034083,0.005656,0.017643,0.048259,-0.025672,0.017821,-0.009431,...,0.024185,-0.02716,0.043398,0.012799,0.004912,-0.010298,-0.006677,-0.044105,0.040657,0.044214
transmission,-0.011888,0.00899,0.037831,-0.04895,-0.017784,0.008072,0.034127,0.015351,-0.040646,0.032505,...,0.032781,0.022137,-0.009118,-0.031574,0.010815,-0.022854,0.014843,-0.041127,-0.018566,0.031268
virus,-0.024507,0.00894,0.026516,-0.048962,0.013671,-0.04534,-0.040857,-0.047489,-0.026844,0.029868,...,-0.029498,0.032729,0.038143,0.036142,-0.019876,0.011869,-0.008138,-0.043589,-0.022647,-0.006726
is,-0.005491,-0.010088,-0.041233,0.035999,0.036193,-0.024563,-0.017671,-0.014101,-0.038907,-0.005339,...,0.013404,0.031423,-0.02268,-0.006861,0.031686,0.013891,0.018757,0.036388,-0.023172,0.022938


In [11]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape) 
# 0 1 2 3 4

inwords = input()

similar_words = { inwords : [id2words[idx] for idx in distance_matrix[word2id[inwords]].argsort()[0:6]]}
similar_words

(81, 81)


 virus


{'virus': ['virus', 'present', 'from', 'at', 'illness', 'potentially']}