In [1]:
# 2. NN Based Embedding Models
# a) Word2Vec
# Implementation of Word2Vec without using any external libraries.

import numpy as np


In [2]:
# block 2 Input Sentence
sentences = [
     "i love machine learning",
    "machine learning is fun"
]

In [3]:
# bock 3 Tokenization + Vocabulary Creation
words = []
for sentence in sentences:
    words.extend(sentence.split())

vocab = list(set(words))  # Unique words  in the corpus (set removes duplicates)       
word_to_index = {word: i for i, word in enumerate(vocab)}   # Word to Index mapping 
index_to_word = {i: word for word, i in word_to_index.items()}  # Index to Word mapping

vocab_size = len(vocab)
print("Vocabulary:", vocab)
print("Word to Index:", word_to_index)
print("Index to Word:", index_to_word)

Vocabulary: ['is', 'fun', 'machine', 'love', 'i', 'learning']
Word to Index: {'is': 0, 'fun': 1, 'machine': 2, 'love': 3, 'i': 4, 'learning': 5}
Index to Word: {0: 'is', 1: 'fun', 2: 'machine', 3: 'love', 4: 'i', 5: 'learning'}


In [4]:
# block 4 Create CBOW Training pairs
window_size = 1
X = []
Y = []

for sentence in sentences:
    tokens = sentence.split()
    for i in range(window_size, len(tokens) - window_size):
        context = [
            tokens[i - 1],      # used to store previous word
            tokens[i + 1]       # used to store next word
        ]
        target = tokens[i]  # used to store current word

        X.append([word_to_index[w] for w in context])   # context words as indices
        Y.append(word_to_index[target]) # target word as index
        
print("Context-Target Pairs:")
for i in range(len(X)):
    context_words = [index_to_word[idx] for idx in X[i]]
    target_word = index_to_word[Y[i]]
    print(f"Context: {context_words} -> Target: {target_word}")


Context-Target Pairs:
Context: ['i', 'machine'] -> Target: love
Context: ['love', 'learning'] -> Target: machine
Context: ['machine', 'is'] -> Target: learning
Context: ['learning', 'fun'] -> Target: is


In [5]:
#block 5 One-Hot Encoding
def one_hot(index, size):
    vec = np.zeros(size)
    vec[index] = 1
    return vec


In [6]:
# block 6 Initialize Weights
embedding_dim = 5

W1 = np.random.randn(vocab_size, embedding_dim)
W2 = np.random.randn(embedding_dim, vocab_size)


In [7]:
# block 7 Softmax Function
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


In [8]:
# block 8 Forward Pass
learning_rate = 0.01

for epoch in range(500):
    loss = 0

    for context_indices, target_index in zip(X, Y):

        # Context embeddings
        h = np.mean([W1[idx] for idx in context_indices], axis=0)

        # Prediction
        u = np.dot(h, W2)
        y_pred = softmax(u)

        # Loss
        loss -= np.log(y_pred[target_index])

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")


Epoch 0, Loss: 11.260526887891208
Epoch 100, Loss: 11.260526887891208
Epoch 200, Loss: 11.260526887891208
Epoch 300, Loss: 11.260526887891208
Epoch 400, Loss: 11.260526887891208


In [9]:
y_pred = W1[word_to_index["machine"]]


In [10]:
predicted_index = np.argmax(y_pred)  # Get index of the predicted word (1)
predicted_word = index_to_word[predicted_index] # Convert index back to word
print(predicted_word)


machine


In [11]:
# Implementing Using Gensim
from gensim.models import Word2Vec

sentences = [
    ["i", "love", "machine", "learning"],
    ["machine", "learning", "is", "fun"],
    ["deep", "learning", "uses", "neural", "networks"]
]

In [12]:
# Train Cbow model
model = Word2Vec(
    sentences,
    vector_size=10,      # it means each word is represented by a 5-dimensional vector
    window=2,       #2 words before and after the target word
    min_count=1,        #Include all words, even if they appear once
    sg=0  # sg=0 --> CBOW, sg=1 --> Skip Gram
)

In [None]:
words = ["deep", "learning"]

vectors = [model.wv[word] for word in words]
combined_vector = np.mean(vectors, axis=0)  # Combine vectors by averaging

print(combined_vector.shape)


(10,)


In [17]:
model.wv.similar_by_vector(combined_vector, topn=5)  # Find top 5 similar words based on the combined vector

# cbow find semantic simalrity after the combined_vector by wv.similarity, uses cosine similarity
# but during training it uses dot product similarity predicts target words
# same is followed by the skip gram 

[('learning', 0.6315835118293762),
 ('deep', 0.5737736225128174),
 ('love', 0.544927179813385),
 ('neural', 0.16401201486587524),
 ('machine', 0.15487389266490936)]