In [22]:
import numpy as np
import re
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Lambda
import tensorflow as tf

# a. Data preparation

In [23]:
# Toy corpus: five short sentences, one per line. The stray double quotes and
# trailing commas are part of the string value itself (not Python syntax), and
# the text contains no periods.
data = "\n".join(
    [
        'I love to play Football",',
        '"Football is a great game",',
        '"The team played well",',
        '"Football brings people together",',
        '"I enjoy watching football matches',
    ]
)

In [24]:
# NOTE(review): `data` contains no '.' characters, so this split returns a
# single-element list holding the whole corpus; the newline-separated
# sentences are NOT separated here and are later treated as one long sequence.
sentences = data.split(sep='.')
sentences

['I love to play Football",\n"Football is a great game",\n"The team played well",\n"Football brings people together",\n"I enjoy watching football matches']

In [25]:
# Normalise every raw sentence: punctuation -> single spaces, trimmed,
# lower-cased. Sentences that end up empty are dropped.
clean_sent = []
for sentence in sentences:
    # Collapse each run of non-alphanumeric characters into one space.
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', sentence)
    # Removes a single-character token only when it is followed by a space and
    # then another space or the end of the string; after the collapse above
    # this rarely matches, so words such as "i" and "a" normally survive.
    cleaned = re.sub(r'(?:^| )\w (?:$| )', ' ', cleaned).strip()
    cleaned = cleaned.lower()
    # Check for emptiness AFTER cleaning so punctuation-only fragments are
    # skipped instead of being stored as "".
    if not cleaned:
        continue
    clean_sent.append(cleaned)

# Show the full cleaned list rather than only the last loop variable.
print(clean_sent)

i love to play football football is a great game the team played well football brings people together i enjoy watching football matches


In [26]:
# Learn the vocabulary from the cleaned sentences, then encode each sentence
# as a list of integer word ids (one list per entry of `clean_sent`).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sent)
sequences = tokenizer.texts_to_sequences(clean_sent)

In [27]:
# Word <-> id lookup tables, read directly from the fitted tokenizer instead
# of re-aligning `str.split()` tokens with the encoded sequences (that
# alignment breaks as soon as the tokenizer splits or filters text differently
# from a plain whitespace split).
word_to_index = dict(tokenizer.word_index)
index_to_word = {index: word for word, index in word_to_index.items()}

print(index_to_word, "\n")
print(word_to_index)
    

{2: 'i', 3: 'love', 4: 'to', 5: 'play', 1: 'football', 6: 'is', 7: 'a', 8: 'great', 9: 'game', 10: 'the', 11: 'team', 12: 'played', 13: 'well', 14: 'brings', 15: 'people', 16: 'together', 17: 'enjoy', 18: 'watching', 19: 'matches'} 

{'i': 2, 'love': 3, 'to': 4, 'play': 5, 'football': 1, 'is': 6, 'a': 7, 'great': 8, 'game': 9, 'the': 10, 'team': 11, 'played': 12, 'well': 13, 'brings': 14, 'people': 15, 'together': 16, 'enjoy': 17, 'watching': 18, 'matches': 19}


In [28]:
# +1 because the tokenizer's word ids start at 1, so index 0 must exist in
# the embedding / output layer even though no word maps to it.
vocab_size = len(tokenizer.word_index) + 1
emb_size = 50
context_size = 2  # number of words taken on EACH side of the target

contexts = []
targets = []

# Slide a window over every sequence: the centre word is the target and the
# `context_size` words on either side form the context. Positions too close
# to either end (no full window) are skipped.
for sequence in sequences:
    for i in range(context_size, len(sequence) - context_size):
        # Derive the window from `context_size` instead of hard-coding +/-2,
        # so changing the setting above really changes the window.
        left = list(sequence[i - context_size:i])
        right = list(sequence[i + 1:i + context_size + 1])
        contexts.append(left + right)
        targets.append(sequence[i])
print(contexts, "\n")
print(targets)

[[2, 3, 5, 1], [3, 4, 1, 1], [4, 5, 1, 6], [5, 1, 6, 7], [1, 1, 7, 8], [1, 6, 8, 9], [6, 7, 9, 10], [7, 8, 10, 11], [8, 9, 11, 12], [9, 10, 12, 13], [10, 11, 13, 1], [11, 12, 1, 14], [12, 13, 14, 15], [13, 1, 15, 16], [1, 14, 16, 2], [14, 15, 2, 17], [15, 16, 17, 18], [16, 2, 18, 1], [2, 17, 1, 19]] 

[4, 5, 1, 1, 6, 7, 8, 9, 10, 11, 12, 13, 1, 14, 15, 16, 2, 17, 18]


In [29]:
# Preview up to the first five (context -> target) pairs as words.
# Slicing keeps this safe when fewer than five pairs were generated.
for context_ids, target_id in zip(contexts[:5], targets[:5]):
    words = [index_to_word.get(word_id) for word_id in context_ids]
    print(words, "->", index_to_word.get(target_id))
        
        

['i', 'love', 'play', 'football'] -> to
['love', 'to', 'football', 'football'] -> play
['to', 'play', 'football', 'is'] -> football
['play', 'football', 'is', 'a'] -> football
['football', 'football', 'a', 'great'] -> is


In [30]:
# Convert the Python lists into arrays for Keras:
# X holds one row of context word ids per sample, Y the matching target id.
X, Y = np.array(contexts), np.array(targets)

# Train the model

In [31]:
# Build the CBOW model
# Seed NumPy and TensorFlow so weight initialisation is repeatable between
# runs (hardware-level nondeterminism may still cause small differences).
np.random.seed(42)
tf.random.set_seed(42)

model = Sequential([
    # Explicit input spec replaces Embedding's `input_length` argument, which
    # current Keras releases flag as deprecated.
    tf.keras.Input(shape=(2 * context_size,), dtype="int32"),
    # One trainable `emb_size`-dimensional vector per vocabulary index.
    Embedding(input_dim=vocab_size, output_dim=emb_size),
    # Average the context word vectors (mean over the sequence axis); the
    # built-in layer does the same as the former Lambda(reduce_mean, axis=1)
    # without embedding a Python lambda in the model.
    tf.keras.layers.GlobalAveragePooling1D(),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    # One probability per vocabulary index (index 0 included).
    Dense(vocab_size, activation='softmax'),
])

In [32]:
# Targets are integer word ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(X, Y, epochs=36)

Epoch 1/36
Epoch 2/36
Epoch 3/36
Epoch 4/36
Epoch 5/36
Epoch 6/36
Epoch 7/36
Epoch 8/36
Epoch 9/36
Epoch 10/36
Epoch 11/36
Epoch 12/36
Epoch 13/36
Epoch 14/36
Epoch 15/36
Epoch 16/36
Epoch 17/36
Epoch 18/36
Epoch 19/36
Epoch 20/36
Epoch 21/36
Epoch 22/36
Epoch 23/36
Epoch 24/36
Epoch 25/36
Epoch 26/36
Epoch 27/36
Epoch 28/36
Epoch 29/36
Epoch 30/36
Epoch 31/36
Epoch 32/36
Epoch 33/36
Epoch 34/36
Epoch 35/36
Epoch 36/36


# Output (Model Prediction)

In [33]:
def predict_word(model, context, index_lookup=None):
    """Return the word the model rates most likely for one context window.

    Args:
        model: object with a ``predict`` method that takes a ``(1, n)`` array
            of word ids and returns one row of scores, one score per
            vocabulary index.
        context: sequence of word ids surrounding the missing word.
        index_lookup: optional ``{id: word}`` mapping. Defaults to the
            notebook-level ``index_to_word``.

    Returns:
        The word of the highest-scoring index that has an entry in the lookup.

    Raises:
        KeyError: if none of the scored indices is present in the lookup.
    """
    if index_lookup is None:
        index_lookup = index_to_word
    batch = np.array(context).reshape(1, -1)
    scores = np.asarray(model.predict(batch)).reshape(-1)
    # The output layer also scores index 0, which has no word attached; a
    # plain argmax followed by a dict lookup would raise KeyError whenever
    # that index wins. Walk the indices from best to worst instead (stable
    # sort keeps argmax's first-wins tie-breaking) and return the first one
    # that maps to a word.
    for idx in np.argsort(-scores, kind="stable"):
        word = index_lookup.get(int(idx))
        if word is not None:
            return word
    raise KeyError("no predicted index maps to a known word")
# Try the model on one hand-written context of four word ids (ids 1-4 are
# 'football', 'i', 'love', 'to' in the index table printed earlier).
example_context = [1, 2, 3, 4]
predictions = predict_word(model, context=example_context)
print(f'Predicted word:{predictions}')

Predicted word:play
