## Name: Vaibhav Bichave

## Implement the Continuous Bag of Words (CBOW) Model for the given (textual document) using the below steps:
    a. Data preparation
    b. Generate training data
    c. Train model
    d. Output

In [1]:
data ="""Deep learning (also known as deep structured learning) is part of a broader 
family of machine learning methods based on artificial neural networks with representation learning. 
Learning can be supervised, semi-supervised or unsupervised. Deep-learning architectures such as deep neural
networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural 
networks and Transformers have been applied to fields including computer vision, speech recognition, natural 
language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, 
material inspection and board game programs, where they have produced results comparable to and in some cases 
surpassing human expert performance."""

data = data.split()

In [2]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)

word2id = tokenizer.word_index
word2id['PAD'] = 0

id2word = {v:k for k,v in word2id.items()}
wids = tokenizer.texts_to_sequences(data)

emb_size = 100
window_size = 2
vocab_size = len(word2id)

In [3]:
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [4]:
def cbow_model(corpus,vocab_size, window_size):
    context_length = window_size*2
    for words in corpus:
        sequences_size = len(words)
        for index,word in enumerate(words):
            context_word = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1
            context_word.append([words[i]
                               for i in range(start,end)
                               if 0<=i <sequences_size
                               and i!=index])
            label_word.append(word)
            
            x = pad_sequences(context_word,context_length)
            y = to_categorical(label_word,vocab_size)
            yield(x,y)


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Lambda
import keras.backend as K

In [6]:
cbow = Sequential([
    Embedding(vocab_size,emb_size,input_length = window_size*2),
    Lambda(lambda x:K.mean(x,axis=1)),
    Dense(vocab_size,activation = 'softmax')
])

cbow.compile(loss='categorical_crossentropy', optimizer='adam')
cbow.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            7500      
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 75)                7575      
                                                                 
Total params: 15,075
Trainable params: 15,075
Non-trainable params: 0
_________________________________________________________________


In [7]:
for epochs in range(6):
    loss  = 0
    for x,y in cbow_model(corpus=wids,vocab_size = vocab_size,window_size=window_size):
        loss += cbow.train_on_batch(x,y)
    print("Epochs {} - Loss -> {}".format(epochs,loss))

Epochs 0 - Loss -> 435.33561515808105
Epochs 1 - Loss -> 429.302264213562
Epochs 2 - Loss -> 426.6885013580322
Epochs 3 - Loss -> 423.72633481025696
Epochs 4 - Loss -> 420.88743686676025
Epochs 5 - Loss -> 418.6238157749176


In [8]:
import pandas as pd
weights = cbow.get_weights()[0][:]
# pd.DataFrame(weights,index=word2id.keys())

In [9]:
### from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
data = pd.DataFrame(distance_matrix,index=word2id.keys())
data.columns = word2id.keys()

data

Unnamed: 0,learning,deep,networks,neural,and,as,of,machine,supervised,have,...,results,comparable,in,some,cases,surpassing,human,expert,performance,PAD
learning,0.000000,0.564340,0.520465,0.687309,0.702848,0.677681,0.736050,0.714989,0.718350,0.959826,...,0.707415,0.642867,0.712345,0.662297,0.745983,0.728790,0.722269,0.731950,0.675630,0.765723
deep,0.564340,0.000000,0.743636,0.701144,0.670640,0.683945,0.715158,0.725764,0.659227,0.914696,...,0.702825,0.670269,0.678129,0.623978,0.744906,0.704478,0.693541,0.752698,0.738534,0.722181
networks,0.520465,0.743636,0.000000,0.678898,0.697930,0.663260,0.728734,0.693935,0.675196,0.914873,...,0.707060,0.634636,0.642973,0.671950,0.676299,0.738150,0.694240,0.688614,0.644455,0.709829
neural,0.687309,0.701144,0.678898,0.000000,0.472190,0.356580,0.418329,0.407396,0.380304,0.650154,...,0.407154,0.363231,0.405016,0.365781,0.442707,0.425865,0.427338,0.395312,0.447651,0.457611
and,0.702848,0.670640,0.697930,0.472190,0.000000,0.430277,0.406545,0.419735,0.382753,0.669322,...,0.401106,0.399500,0.443710,0.380114,0.420614,0.431645,0.416170,0.421400,0.412978,0.376110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
surpassing,0.728790,0.704478,0.738150,0.425865,0.431645,0.435908,0.397487,0.413895,0.392436,0.689252,...,0.395883,0.441815,0.399707,0.380512,0.441793,0.000000,0.440634,0.398612,0.424760,0.437916
human,0.722269,0.693541,0.694240,0.427338,0.416170,0.400384,0.435623,0.393451,0.412255,0.715151,...,0.431959,0.377919,0.379137,0.363673,0.407000,0.440634,0.000000,0.422697,0.441135,0.401258
expert,0.731950,0.752698,0.688614,0.395312,0.421400,0.402563,0.424996,0.411999,0.408820,0.722799,...,0.387381,0.399963,0.357308,0.391456,0.418710,0.398612,0.422697,0.000000,0.413012,0.435040
performance,0.675630,0.738534,0.644455,0.447651,0.412978,0.394644,0.406125,0.395811,0.401579,0.666266,...,0.362889,0.418886,0.409257,0.425441,0.381308,0.424760,0.441135,0.413012,0.000000,0.401320


In [10]:
def SearchWord(WordList):
    similar_words ={}
    for search_term in WordList:
        if(search_term in word2id.keys()):
            similar_words[search_term]=[id2word[idx] for idx in 
                                        distance_matrix[word2id[search_term]-1].argsort()[0:5]+1] 
    return similar_words



In [11]:
SearchWord(['deep'])

{'deep': ['deep', 'learning', 'processing', 'some', 'supervised']}