In [1]:
# importing libraries
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd

In [2]:
#taking random sentences as data
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. 
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

In [3]:
#tokenization
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 75
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [4]:
#generating (context word, target/label word) pairs
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x, y)
            
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        # print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

In [5]:
#model building
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print(cbow.summary())

# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot

# SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, rankdir='TB').create(prog='dot', format='svg'))



None


In [6]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 429.0836968421936

Epoch: 2 	Loss: 429.647958278656

Epoch: 3 	Loss: 427.86786222457886

Epoch: 4 	Loss: 426.17456912994385

Epoch: 5 	Loss: 424.7746205329895



In [7]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,0.009806,-0.016295,-0.011585,0.007788,0.045331,-0.027111,0.012627,0.033691,-0.029176,0.000855,...,0.027748,0.020728,0.058328,-0.013742,-0.042865,-0.022896,-0.050835,0.037517,0.027861,0.01232
networks,0.019918,-0.039554,0.056956,0.051058,-0.032229,0.009455,-0.045634,-0.025223,-0.017124,0.036002,...,0.033404,0.060911,0.015303,0.025187,-0.008355,-0.010786,-0.048719,0.065349,-0.014286,-0.025176
neural,-0.004682,0.029678,-0.040814,0.005468,-0.028351,0.018611,0.001819,0.024518,-0.015008,-0.0276,...,-0.013518,0.019277,0.03379,-0.046902,-0.007777,-0.045835,-0.014123,-0.013749,-0.038048,0.036759
and,0.016735,-0.00909,0.032796,0.03705,-0.025276,0.031447,-0.044795,0.009937,-0.046154,-0.02488,...,-0.048237,0.033169,0.040698,0.043496,-0.007086,-0.007433,-0.010193,-0.007328,0.01188,-0.039523
as,0.010303,0.026749,0.036683,-0.040839,0.004997,-0.01937,-0.016945,-0.031613,-0.035602,0.031528,...,0.000763,0.032113,-0.044521,0.034603,-0.005708,-0.031745,-0.049437,-0.047617,0.040496,0.008396


In [27]:
# Import necessary libraries
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

# Sample text data
text = "Machine learning is fascinating and continuously evolving with applications in various fields."

# Stage 1: Data Preparation
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2index = tokenizer.word_index
index2word = {v: k for k, v in word2index.items()}
vocab_size = len(word2index) + 1  # +1 for padding token

# Convert text to sequences of word indices
sequences = tokenizer.texts_to_sequences([text])[0]

# Define context window size
window_size = 2  # Number of context words on each side of the target word

# Stage 2: Generate Training Data
# Create pairs of context and target words
def generate_context_target_pairs(sequences, window_size):
    context_target_pairs = []
    for i, target in enumerate(sequences):
        start = max(0, i - window_size)
        end = min(len(sequences), i + window_size + 1)
        context = [sequences[j] for j in range(start, end) if j != i]
        for ctx_word in context:
            context_target_pairs.append((ctx_word, target))
    return context_target_pairs

context_target_pairs = generate_context_target_pairs(sequences, window_size)

# Convert pairs to input-output format for model training
X_train, y_train = zip(*context_target_pairs)
X_train = np.array(X_train)
y_train = to_categorical(y_train, vocab_size)

# Stage 3: Define and Train the CBOW Model
embedding_dim = 100  # Dimension of the word embeddings

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1),
    Dense(vocab_size, activation='softmax')
])

# Compile and train the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=256)

# Stage 4: Output: Check Word Embeddings
# Get the embeddings
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]

# Display similar words based on cosine similarity
def get_similar_words(word, embeddings, top_n=5):
    index = word2index[word]
    word_embedding = embeddings[index]
    similarities = [(other_word, np.dot(word_embedding, embeddings[other_index]))
                    for other_word, other_index in word2index.items() if other_word != word]
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

# Test the model by finding similar words
test_word = "learning"
similar_words = get_similar_words(test_word, embeddings)
print(f"Words similar to '{test_word}':", [word for word, _ in similar_words])


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.1190 - loss: 2.5607
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.1667 - loss: 2.5545
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.1667 - loss: 2.5483
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.1667 - loss: 2.5422
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.1905 - loss: 2.5360
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.2143 - loss: 2.5299
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.2381 - loss: 2.5238
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.2381 - loss: 2.5176
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m