# **CBOW Word Embeddings Using TensorFlow**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Toy corpus the CBOW model is trained on
corpus = [
    "I love deep learning and neural networks",
    "deep learning is amazing and powerful",
    "I love AI and its applications",
    "AI is the future of technology",
    "Natural language processing is a branch of AI"
]

# Fit a Keras tokenizer on the corpus to obtain the word -> integer-id mapping
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
words = tokenizer.word_docs  # not used again in the visible cells
word_index = tokenizer.word_index
# One id more than the tokenizer's vocabulary
# (presumably because Keras reserves id 0 for padding -- confirm)
vocab_size = 1 + len(word_index)

# Replace every word of every sentence by its integer id
sequences = tokenizer.texts_to_sequences(corpus)

# Build CBOW (context, target) pairs.
# Only positions with a full window on both sides are used, so the first and
# last `window_size` tokens of a sentence never become targets.
window_size = 2
context_target_pairs = [
    (ids[pos - window_size:pos] + ids[pos + 1:pos + 1 + window_size], ids[pos])
    for ids in sequences
    for pos in range(window_size, len(ids) - window_size)
]


# Split the pairs into model inputs (context ids) and outputs (target ids)
contexts, targets = zip(*context_target_pairs)
X = np.array(contexts)
# One-hot encode the targets to match the categorical cross-entropy loss
y = to_categorical(np.array(targets), num_classes=vocab_size)

In [None]:
# Define CBOW Model
# Architecture: context word ids -> embeddings -> mean over the context
# window -> softmax over the vocabulary (prediction of the centre word).
embedding_dim = 50  # Size of word embeddings
input_layer = Input(shape=(window_size * 2,), name="input")
# `input_length` is deprecated in Keras 3; the sequence length is already
# fixed by the Input layer above, so the argument is simply dropped.
embedding_layer = Embedding(vocab_size, embedding_dim, name="embedding")(input_layer)
# Average the context embeddings over the sequence axis. A built-in pooling
# layer computes the same mean as the former Lambda(tf.reduce_mean, axis=1)
# but keeps the model serializable without arbitrary Python code.
context_vector = tf.keras.layers.GlobalAveragePooling1D(name="context_vector")(embedding_layer)
output_layer = Dense(vocab_size, activation='softmax', name="output")(context_vector)

cbow_model = Model(input_layer, output_layer)
# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy
cbow_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
cbow_model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 852ms/step - accuracy: 0.0000e+00 - loss: 3.1385
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.0769 - loss: 3.1327
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.0769 - loss: 3.1270
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.1538 - loss: 3.1212
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.1538 - loss: 3.1154
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2308 - loss: 3.1096
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.3077 - loss: 3.1039
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.3846 - loss: 3.0981
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x78bcb0c7b790>

In [None]:
# Save the embeddings
# The first (and only requested) weight array of the "embedding" layer is the
# embedding matrix; each row is looked up by a word's integer id.
embeddings = cbow_model.get_layer("embedding").get_weights()[0]
word_embeddings = {}
for token, token_id in word_index.items():
    word_embeddings[token] = embeddings[token_id]

# **CBOW Word Embeddings Using PyTorch**

In [20]:
from collections import Counter
import torch

# Toy corpus the CBOW model is trained on
corpus = [
    "I love deep learning and neural networks",
    "deep learning is amazing and powerful",
    "I love AI and its applications",
    "AI is the future of technology",
    "Natural language processing is a branch of AI"
]

# Lower-case every sentence and split it on whitespace
tokenized_corpus = [sentence.lower().split() for sentence in corpus]
word_counts = Counter(word for sentence in tokenized_corpus for word in sentence)
# Ids follow first-appearance order of the words and start at 0
word_index = {word: idx for idx, word in enumerate(word_counts)}
idx_to_word = {idx: word for word, idx in word_index.items()}
vocab_size = len(word_index)

# Encode each tokenized sentence as a list of word ids
sequences = [[word_index[word] for word in sentence] for sentence in tokenized_corpus]

# Build CBOW (context, target) pairs.
# Only positions with a full window on both sides are used, so the first and
# last `window_size` tokens of a sentence never become targets.
window_size = 2
context_target_pairs = [
    (ids[pos - window_size:pos] + ids[pos + 1:pos + 1 + window_size], ids[pos])
    for ids in sequences
    for pos in range(window_size, len(ids) - window_size)
]

# Convert to PyTorch tensors: X holds the context ids, y the target ids
X = torch.tensor([pair[0] for pair in context_target_pairs], dtype=torch.long)
y = torch.tensor([pair[1] for pair in context_target_pairs], dtype=torch.long)

print(X)
print(y)

tensor([[ 0,  1,  3,  4],
        [ 1,  2,  4,  5],
        [ 2,  3,  5,  6],
        [ 2,  3,  8,  4],
        [ 3,  7,  4,  9],
        [ 0,  1,  4, 11],
        [ 1, 10, 11, 12],
        [10,  7, 14, 15],
        [ 7, 13, 15, 16],
        [17, 18,  7, 20],
        [18, 19, 20, 21],
        [19,  7, 21, 15],
        [ 7, 20, 15, 10]])
torch.Size([13])


In [40]:
from torch import nn
from torch.optim import SGD
class CBOW(nn.Module):
  """Continuous bag-of-words model.

  Embeds the context word ids, averages the embeddings over the context
  dimension and projects the result to one score per vocabulary entry.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create the embedding table and the output projection.

    Args:
      vocab_size: number of rows in the embedding table and number of
        output scores.
      embedding_dim: width of each embedding vector.
    """
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.linear = nn.Linear(embedding_dim, vocab_size)

  def forward(self, context):
    """Return unnormalised scores over the vocabulary.

    Args:
      context: integer tensor of word ids; dimension 1 is averaged away.

    Returns:
      The output of the linear layer applied to the mean context embedding
      (no softmax is applied here).
    """
    averaged = self.embedding(context).mean(dim=1)
    return self.linear(averaged)


# Model Parameters
# Seed PyTorch's global RNG so the randomly initialised weights -- and
# therefore the training curve -- are the same on every Restart & Run All.
torch.manual_seed(42)

embedding_dim = 50  # width of each word vector
model = CBOW(vocab_size, embedding_dim)

# CrossEntropyLoss takes raw scores and integer class targets, which matches
# the un-normalised output of CBOW.forward and the id tensor `y`.
criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.01)

In [43]:
# Training: full-batch gradient descent, one optimizer step per epoch.
# Re-running this cell continues from the current weights of `model`
# (the model is created in an earlier cell, not here).

epochs = 500
for epoch in range(epochs):
  # Clear the gradients left over from the previous step
  optimizer.zero_grad()

  # Forward pass on every (context, target) pair at once
  output = model(X)
  loss = criterion(output, y)

  # Backward pass and parameter update
  loss.backward()
  optimizer.step()

  # Report the loss on every tenth epoch
  if not epoch % 10:
    print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")

# Save embeddings
# Tensor.numpy() returns an array that shares memory with the parameter, so
# without a copy these "saved" vectors would silently change whenever the
# training cell is run again. `.cpu()` is a no-op on CPU and keeps the line
# working if the model is ever moved to another device.
embeddings = model.embedding.weight.detach().cpu().numpy().copy()
# Map each word to the row of the embedding matrix selected by its id
word_embeddings = {word: embeddings[idx] for word, idx in word_index.items()}

Epoch [0/500], Loss: 0.5397
Epoch [10/500], Loss: 0.5280
Epoch [20/500], Loss: 0.5168
Epoch [30/500], Loss: 0.5060
Epoch [40/500], Loss: 0.4955
Epoch [50/500], Loss: 0.4854
Epoch [60/500], Loss: 0.4756
Epoch [70/500], Loss: 0.4661
Epoch [80/500], Loss: 0.4570
Epoch [90/500], Loss: 0.4481
Epoch [100/500], Loss: 0.4395
Epoch [110/500], Loss: 0.4312
Epoch [120/500], Loss: 0.4232
Epoch [130/500], Loss: 0.4154
Epoch [140/500], Loss: 0.4078
Epoch [150/500], Loss: 0.4005
Epoch [160/500], Loss: 0.3934
Epoch [170/500], Loss: 0.3865
Epoch [180/500], Loss: 0.3798
Epoch [190/500], Loss: 0.3733
Epoch [200/500], Loss: 0.3670
Epoch [210/500], Loss: 0.3608
Epoch [220/500], Loss: 0.3549
Epoch [230/500], Loss: 0.3491
Epoch [240/500], Loss: 0.3435
Epoch [250/500], Loss: 0.3380
Epoch [260/500], Loss: 0.3327
Epoch [270/500], Loss: 0.3275
Epoch [280/500], Loss: 0.3225
Epoch [290/500], Loss: 0.3176
Epoch [300/500], Loss: 0.3128
Epoch [310/500], Loss: 0.3082
Epoch [320/500], Loss: 0.3037
Epoch [330/500], Loss