<a href="https://colab.research.google.com/github/KayalvizhiT513/CBOW-Algorithm-/blob/main/CBOW_1_context.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Corrected CBOW implementation (single context word per training sentence)
import numpy as np
import pandas as pd
from collections import defaultdict

class CBOW:
    """Minimal continuous bag-of-words (CBOW) model trained with per-sample SGD.

    The network has a linear hidden layer and a softmax output layer:

        h = W1 @ context          # W1 has shape (hidden_dim, vocab_size)
        p = softmax(W2 @ h)       # W2 has shape (vocab_size, hidden_dim)

    ``word_index`` (word -> row index) and ``index_word`` (row index -> word)
    start out empty; callers assign them on the instance before calling
    ``train`` or ``predict``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, learning_rate=0.001):
        """Create the model and its starting weights.

        Args:
            vocab_size: Number of distinct words (length of the one-hot vectors).
            embedding_dim: Stored on the instance but not used by any
                computation in this class.
            hidden_dim: Width of the hidden layer.
            learning_rate: Step size applied in ``update_weights``.
        """
        # Seeds NumPy's global RNG so the random fallback below is reproducible.
        np.random.seed(42)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate
        # Vocabulary lookups; empty until the caller assigns real mappings.
        self.word_index = {}
        self.index_word = {}

        # Fixed starting weights for the 5-word / 3-hidden-unit configuration.
        W1 = np.array([[-0.71984421, -0.46063877,  1.05712223,  0.34361829, -1.76304016],
                       [ 0.32408397, -0.38508228, -0.676922  ,  0.61167629,  1.03099952],
                       [ 0.93128012, -0.83921752, -0.30921238,  0.33126343,  0.97554513]])
        W2 = np.array([[-0.47917424, -0.18565898, -1.10633497],
                       [-1.19620662,  0.81252582,  1.35624003],
                       [-0.07201012,  1.0035329 ,  0.36163603],
                       [-0.64511975,  0.36139561,  1.53803657],
                       [-0.03582604,  1.56464366, -2.6197451 ]])
        if (hidden_dim, vocab_size) != W1.shape:
            # The fixed matrices only fit a 3x5 model; for any other size fall
            # back to seeded random weights so the matrix products line up.
            W1 = np.random.randn(hidden_dim, vocab_size)
            W2 = np.random.randn(vocab_size, hidden_dim)
        self.W1 = W1
        self.W2 = W2

    def generate_one_hot(self, word_idx):
        """Return a length-``vocab_size`` vector with a 1 at ``word_idx``."""
        one_hot = np.zeros(self.vocab_size)
        one_hot[word_idx] = 1
        return one_hot

    def linear_activation(self, x):
        """Identity activation used by the hidden layer."""
        return x

    def softmax(self, x):
        """Return the softmax of ``x`` along axis 0.

        The maximum is subtracted before exponentiating. This leaves the
        result mathematically unchanged but keeps ``np.exp`` from overflowing
        to ``inf`` (and the ratio from becoming NaN) for large logits.
        """
        x = np.asarray(x, dtype=float)
        exp_x = np.exp(x - np.max(x, axis=0))
        return exp_x / exp_x.sum(axis=0)

    def forward_pass(self, context):
        """Run the network on one context vector.

        Args:
            context: Vector of length ``vocab_size`` (a one-hot vector or the
                mean of several one-hot vectors).

        Returns:
            ``(hidden_layer_activation, output_probs)``.
        """
        hidden_layer_input = np.dot(self.W1, context)
        hidden_layer_activation = self.linear_activation(hidden_layer_input)
        output_layer = np.dot(self.W2, hidden_layer_activation)
        output_probs = self.softmax(output_layer)
        return hidden_layer_activation, output_probs

    def backward_pass(self, context, target, hidden_layer_activation, output_probs):
        """Compute the weight gradients for one training sample.

        Returns:
            ``(dW1, dW2)`` with the same shapes as ``W1`` and ``W2``.
        """
        # Softmax + cross-entropy: gradient w.r.t. the logits is (p - target).
        error_output = output_probs - target
        dW2 = np.outer(error_output, hidden_layer_activation)
        # Propagate the error through W2 (the hidden activation is linear).
        error_hidden = np.dot(self.W2.T, error_output)
        dW1 = np.outer(error_hidden, context)
        return dW1, dW2

    def update_weights(self, dW1, dW2):
        """Apply one in-place gradient-descent step to both weight matrices."""
        self.W1 -= self.learning_rate * dW1
        self.W2 -= self.learning_rate * dW2

    def train(self, corpus, vocabulary = None, epochs=1000):
        """Train on ``corpus`` for ``epochs`` passes.

        In each whitespace-separated sentence the last token is the target
        and all preceding tokens form the (averaged) context. Sentences with
        fewer than two tokens have no context and are skipped.

        Args:
            corpus: Iterable of sentence strings.
            vocabulary: Words to pre-create history entries for. Defaults to
                the keys of ``self.word_index``.
            epochs: Number of passes over the corpus.

        Returns:
            Dict mapping each target word to ``{epoch: output_probs}``, i.e.
            the predicted distribution recorded before that step's update.
            Despite the name, the stored values are probabilities, not losses.

        Raises:
            KeyError: If a corpus word is missing from ``self.word_index``.
        """
        if vocabulary is None:
            vocabulary = self.word_index
        epoch_vs_loss = {term: {} for term in vocabulary}

        # Tokenise and encode each sentence once instead of on every epoch.
        samples = []
        for sentence in corpus:
            tokens = sentence.split()
            if len(tokens) < 2:
                # An empty context would average to NaN and poison the weights.
                continue
            *context_words, target_word = tokens
            context_vectors = [self.generate_one_hot(self.word_index[word]) for word in context_words]
            context = np.mean(context_vectors, axis=0)
            target = self.generate_one_hot(self.word_index[target_word])
            samples.append((target_word, context, target))

        for epoch in range(epochs):
            for target_word, context, target in samples:
                hidden_layer_activation, output_probs = self.forward_pass(context)
                # setdefault tolerates targets absent from ``vocabulary``.
                epoch_vs_loss.setdefault(target_word, {})[epoch] = output_probs
                dW1, dW2 = self.backward_pass(context, target, hidden_layer_activation, output_probs)
                self.update_weights(dW1, dW2)
        return epoch_vs_loss

    def predict(self, input_word):
        """Predict the most likely target word for a single context word.

        Returns:
            ``(predicted_word, output_probs)``, or
            ``("Word not in vocabulary", None)`` when ``input_word`` is not a
            key of ``self.word_index``.
        """
        if input_word not in self.word_index:
            return "Word not in vocabulary", None
        input_vector = self.generate_one_hot(self.word_index[input_word])
        _, output_probs = self.forward_pass(input_vector)
        predicted_word_index = np.argmax(output_probs)
        return self.index_word[predicted_word_index], output_probs

In [None]:
# Example usage: fit the model on a tiny four-sentence corpus and show the
# learned weight matrices.
corpus = ["eat apple", "eat banana", "eat sapota", "eat carrot"]

# Tally how often each token occurs across the corpus.
word_count = defaultdict(int)
for sentence in corpus:
    for word in sentence.split():
        word_count[word] = word_count[word] + 1

# Most frequent words get the lowest indices; the sort is stable, so words
# with equal counts keep their first-seen order.
sorted_words = sorted(word_count, key=lambda w: word_count[w], reverse=True)
word_index = dict(zip(sorted_words, range(len(sorted_words))))
index_word = dict(enumerate(sorted_words))
vocab_size = len(sorted_words)
hidden_dim = 3  # Desired number of hidden dimensions
embedding_dim = vocab_size  # Kept equal to the vocabulary size

# Build the model, attach the vocabulary lookups it reads, then train.
cbow = CBOW(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)
cbow.word_index = word_index
cbow.index_word = index_word
epoch_vs_loss = cbow.train(corpus, sorted_words, epochs=10000)

# Wrap the weight matrices in labelled DataFrames for readable printing.
W1_df = pd.DataFrame(
    cbow.W1,
    index=[f'Hidden_dim_{i}' for i in range(cbow.W1.shape[0])],
    columns=sorted_words,
)
W2_df = pd.DataFrame(
    cbow.W2,
    index=sorted_words,
    columns=[f'Hidden_dim_{i}' for i in range(cbow.W2.shape[1])],
)

print("The corpus: \n",corpus)
print("Final Input weight matrix of the hidden layer (W^T . x):")
print(W1_df)
print("\nFinal Output weight matrix of the hidden layer (W^T . h):")
print(W2_df)

The corpus: 
 ['eat apple', 'eat banana', 'eat sapota', 'eat carrot']
Final Input weight matrix of the hidden layer (W^T . x):
                   eat     apple    banana    sapota    carrot
Hidden_dim_0 -0.448621 -0.460639  1.057122  0.343618 -1.763040
Hidden_dim_1  1.657340 -0.385082 -0.676922  0.611676  1.031000
Hidden_dim_2  0.515000 -0.839218 -0.309212  0.331263  0.975545

Final Output weight matrix of the hidden layer (W^T . h):
        Hidden_dim_0  Hidden_dim_1  Hidden_dim_2
eat        -0.188656     -0.895901     -1.324995
apple      -1.034224      0.669129      1.224793
banana     -0.175211      1.146773      0.440723
sapota     -0.738841      0.637400      1.588048
carrot     -0.291405      1.999038     -2.398737


In [None]:
term = "eat"

# Query the trained model with a single context word.
predicted_word, output_probs = cbow.predict(term)

headline = "Predicted word after '{}': {}".format(term, predicted_word)
print(headline)
print("Softmax function output values (probabilities):")
# One line per vocabulary word, paired with its predicted probability.
for word, prob in zip(sorted_words, output_probs):
    line = f"{word}: {prob:.6f}"
    print(line)

Predicted word after 'eat': carrot
Softmax function output values (probabilities):
eat: 0.003419
apple: 0.248574
banana: 0.249184
sapota: 0.249065
carrot: 0.249758
