# Implementation of word2vec (skip-gram with a full softmax output layer) using NumPy.

In [15]:
import numpy as np
from collections import defaultdict

class Word2Vec:
    """Minimal skip-gram word2vec trained with a full softmax and plain SGD.

    Each token of the corpus is paired with every neighbour that lies within
    ``window_size`` positions of it; the model learns to predict the
    neighbour (context word) from the centre (target) word.

    Attributes:
        word_index: Mapping from word to its integer vocabulary index.
        index_word: Inverse mapping, from vocabulary index to word.
        vocab_size: Number of distinct words in the corpus.
        data: List of ``(target_index, context_index)`` training pairs.
        W1: Input embedding matrix of shape ``(vocab_size, embedding_dim)``.
        W2: Output weight matrix of shape ``(embedding_dim, vocab_size)``.
    """

    def __init__(self, corpus, embedding_dim, window_size=2, learning_rate=0.01,
                 seed=None):
        """Build the vocabulary and training pairs, then initialise the weights.

        Args:
            corpus: Sequence of hashable tokens (it is indexed and its length
                is taken, so a one-shot iterator will not work).
            embedding_dim: Length of each word vector.
            window_size: Number of positions on each side of the target that
                count as context.
            learning_rate: Step size used for the SGD updates.
            seed: Optional seed for weight initialisation. ``None`` (default)
                draws from NumPy's global random state, exactly as before.
        """
        self.corpus = corpus
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.seed = seed
        self.word_index = {}
        self.index_word = {}
        self.vocab_size = 0
        self.data = []
        self.build_vocab()
        self.init_weights()

    def build_vocab(self):
        """Create the word/index mappings and the (target, context) pairs.

        Indices are assigned in order of first occurrence in the corpus so the
        mapping is identical on every run; iterating a ``set`` of strings is
        not stable across interpreter runs because of hash randomisation.
        """
        # dict.fromkeys removes duplicates while keeping first-seen order.
        unique_words = list(dict.fromkeys(self.corpus))
        self.vocab_size = len(unique_words)

        # Rebuild from scratch so that calling this method again does not
        # leave stale entries or duplicate the training pairs.
        self.word_index = {word: i for i, word in enumerate(unique_words)}
        self.index_word = dict(enumerate(unique_words))
        self.data = [
            (self.word_index[target_word], self.word_index[context_word])
            for position, target_word in enumerate(self.corpus)
            for context_word in self.get_context_words(position)
        ]

    def init_weights(self):
        """Initialise ``W1`` and ``W2`` with uniform random values in [0, 1)."""
        w1_shape = (self.vocab_size, self.embedding_dim)
        w2_shape = (self.embedding_dim, self.vocab_size)
        if self.seed is None:
            # Legacy path: honours any np.random.seed() set by the caller.
            self.W1 = np.random.rand(*w1_shape)
            self.W2 = np.random.rand(*w2_shape)
        else:
            # A private generator keeps the global random state untouched.
            rng = np.random.default_rng(self.seed)
            self.W1 = rng.random(w1_shape)
            self.W2 = rng.random(w2_shape)

    def get_context_words(self, target_index):
        """Return the corpus tokens within ``window_size`` of a position.

        Args:
            target_index: Position of the target token in the corpus (a
                corpus position, not a vocabulary index).

        Returns:
            List of neighbouring tokens in corpus order, excluding the token
            at ``target_index`` itself. The window is clipped at both ends of
            the corpus.
        """
        start = max(0, target_index - self.window_size)
        end = min(len(self.corpus), target_index + self.window_size + 1)
        return [self.corpus[i] for i in range(start, end) if i != target_index]

    def softmax(self, x):
        """Return the softmax of ``x`` computed along axis 0."""
        # Subtracting the maximum avoids overflow in exp without changing
        # the result.
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum(axis=0, keepdims=True)

    def forward_pass(self, target_index):
        """Compute the predicted context distribution for one target word.

        Args:
            target_index: Vocabulary index of the target word.

        Returns:
            Tuple ``(y_pred, target_vector)``: the softmax output over the
            vocabulary and the target's row of ``W1``.
        """
        target_vector = self.W1[target_index]
        u = np.dot(self.W2.T, target_vector)
        y_pred = self.softmax(u)
        return y_pred, target_vector

    def backward_pass(self, target_index, context_index, y_pred, target_vector):
        """Apply one SGD step for a single (target, context) pair.

        Args:
            target_index: Vocabulary index of the target word.
            context_index: Vocabulary index of the true context word.
            y_pred: Softmax output returned by ``forward_pass``.
            target_vector: Target embedding returned by ``forward_pass``.
        """
        y_true = np.zeros(self.vocab_size)
        y_true[context_index] = 1

        # Prediction error: softmax output minus the one-hot target.
        e = y_pred - y_true
        # Both gradients are computed before either matrix is modified;
        # target_vector is a view into W1, so the order matters.
        dW2 = np.outer(target_vector, e)
        dW1 = np.dot(self.W2, e)

        self.W1[target_index] -= self.learning_rate * dW1
        self.W2 -= self.learning_rate * dW2

    def train(self, epochs):
        """Run SGD over all training pairs and print the loss of each epoch.

        Args:
            epochs: Number of full passes over ``self.data``.
        """
        for epoch in range(epochs):
            loss = 0
            for target_index, context_index in self.data:
                y_pred, target_vector = self.forward_pass(target_index)
                # Negative log-likelihood of the true context word.
                loss += -np.log(y_pred[context_index])
                self.backward_pass(target_index, context_index, y_pred, target_vector)
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}")

    def get_word_vector(self, word):
        """Return the embedding of ``word``, or ``None`` if it is unknown.

        The returned array is the word's row of ``W1`` (a view, not a copy).
        """
        index = self.word_index.get(word)
        if index is None:
            return None
        return self.W1[index]

# Demo: fit the model on a tiny seven-token corpus.
corpus = [
    "this",
    "is",
    "a",
    "sample",
    "corpus",
    "for",
    "word2vec",
]
embedding_dim = 10

# Default window size and learning rate; 100 passes over the training pairs.
word2vec_model = Word2Vec(corpus=corpus, embedding_dim=embedding_dim)
word2vec_model.train(100)

# Look up the learned embedding of one vocabulary word and display it.
word_vector = word2vec_model.get_word_vector(word="sample")
print(f"\nVector for 'sample': {word_vector}")


Epoch 1/100, Loss: 45.21946973215675
Epoch 2/100, Loss: 44.71142553882151
Epoch 3/100, Loss: 44.257110838619106
Epoch 4/100, Loss: 43.84823830693104
Epoch 5/100, Loss: 43.47806044861758
Epoch 6/100, Loss: 43.14101130997983
Epoch 7/100, Loss: 42.83244986013917
Epoch 8/100, Loss: 42.54847204231621
Epoch 9/100, Loss: 42.28577017477153
Epoch 10/100, Loss: 42.041525657244144
Epoch 11/100, Loss: 41.8133255605026
Epoch 12/100, Loss: 41.599096666817594
Epoch 13/100, Loss: 41.39705249729026
Epoch 14/100, Loss: 41.205650178182196
Epoch 15/100, Loss: 41.02355489168032
Epoch 16/100, Loss: 40.849610271099685
Epoch 17/100, Loss: 40.68281352877264
Epoch 18/100, Loss: 40.52229440694553
Epoch 19/100, Loss: 40.3672972576538
Epoch 20/100, Loss: 40.21716571339691
Epoch 21/100, Loss: 40.07132952452147
Epoch 22/100, Loss: 39.929293223893595
Epoch 23/100, Loss: 39.79062634321742
Epoch 24/100, Loss: 39.65495495415733
Epoch 25/100, Loss: 39.52195434538573
Epoch 26/100, Loss: 39.39134267670928
Epoch 27/100, Los