# **ONE-HOT ENCODING**

In [None]:
from keras.utils import to_categorical

# Integer class ids to be expanded into one-hot rows.
class_labels = [0, 1, 2, 1, 0]

# to_categorical returns one row per label; the captured output of this cell
# shows a 1.0 in the column matching the label and 0.0 elsewhere.
one_hot_encoded = to_categorical(class_labels)

# Each print emits a heading followed by the value on the next line.
print("Original Labels:", class_labels, sep="\n")
print("\nOne-Hot Encoded Matrix:", one_hot_encoded, sep="\n")

Original Labels:
[0, 1, 2, 1, 0]

One-Hot Encoded Matrix:
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


# **SIMPLE WORD2VEC**

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Toy corpus: three short sentences used to build the vocabulary and the
# skip-gram training pairs.
corpus = [
    "I like natural language processing",
    "Word embeddings are interesting",
    "Skip-gram is a Word2Vec model",
]

In [None]:
# Tokenizing the corpus: split every sentence on whitespace (no lower-casing
# or punctuation stripping is applied).
tokens = [sentence.split() for sentence in corpus]
flatten_tokens = [word for sentence in tokens for word in sentence]
# Sort the unique words so the vocabulary order -- and therefore every word
# index and embedding row -- is identical on every kernel restart. Iterating
# a bare set of strings depends on per-process hash randomization.
vocab = sorted(set(flatten_tokens))
vocab_size = len(vocab)

In [None]:
# Lookup tables between vocabulary words and their integer positions in
# `vocab`: build index -> word first, then invert it for word -> index.
index_to_word = dict(enumerate(vocab))
word_to_index = {word: index for index, word in index_to_word.items()}

In [None]:
# GENERATING TRAINING DATA
# Number of neighbouring tokens taken on each side of a target word when
# building (target, context) pairs.
window_size = 1

def generate_training_data(tokens, word_to_index, window_size):
  """Build skip-gram (target, context) index pairs from tokenized sentences.

  For every word in every sentence, each word at most `window_size` positions
  to its left or right (within the same sentence) is paired with it.

  Args:
    tokens: iterable of sentences, each a list of word strings.
    word_to_index: mapping from word to integer index; must contain every
      word that appears in `tokens` (a missing word raises KeyError).
    window_size: number of neighbours considered on each side of the target.

  Returns:
    Integer NumPy array of shape (num_pairs, 2); column 0 holds target
    indices and column 1 holds context indices. When no pairs exist (empty
    corpus or only one-word sentences) the shape is (0, 2).
  """
  training_data = []
  for sentence in tokens:
    for i, target_word in enumerate(sentence):
      # max(0, ...) stops a negative start index from wrapping around to the
      # end of the sentence.
      left_context = sentence[max(0, i - window_size):i]
      right_context = sentence[i + 1:i + window_size + 1]
      for context_word in left_context + right_context:
        training_data.append((word_to_index[target_word], word_to_index[context_word]))
  # Force an integer (n, 2) layout so column slicing such as result[:, 0]
  # also works when the pair list is empty (a bare np.array([]) is 1-D float).
  return np.array(training_data, dtype=int).reshape(-1, 2)

# Build the array of (target index, context index) pairs for the whole corpus.
training_data = generate_training_data(tokens, word_to_index, window_size)

In [None]:
# Define the Word2Vec skip-gram style model: a word index is looked up in an
# embedding table and a softmax layer scores every vocabulary word as a
# candidate context word.
embedding_dim = 10
num_negative_samples = 5  # defined here but not referenced by the model below

model = tf.keras.Sequential()
# Trainable lookup table with one `embedding_dim`-sized vector per word index.
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=1,
        name='embedding_layer',
    )
)
# Reshape each sample's embedding output to a flat (embedding_dim,) vector.
model.add(tf.keras.layers.Reshape((embedding_dim,)))
# One softmax output unit per vocabulary word.
model.add(tf.keras.layers.Dense(units=vocab_size, activation='softmax'))

# Sparse loss: the targets are integer word indices rather than one-hot rows.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
# Train the model: column 0 of each training pair is the input (target word
# index) and column 1 is the label (context word index).
target_words, context_words = training_data[:, 0], training_data[:, 1]

model.fit(x=target_words, y=context_words, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7da69063a3e0>

In [None]:
# Access the learned word embeddings: the first weight array of the layer
# named 'embedding_layer' (indexed by word index in the display cell).
word_embeddings = model.get_layer('embedding_layer').get_weights()[0]

In [None]:
# Display word embeddings: one "<word>: <vector>" line per vocabulary entry.
# word_to_index was built from enumerate(vocab), so it yields the same
# (word, index) pairs in the same order.
for word, i in word_to_index.items():
  print(f"{word}: {word_embeddings[i]}")

like: [-0.01584898  0.02639333  0.01947837 -0.02004241  0.014462    0.02284228
 -0.01243934 -0.04550199  0.03021689 -0.0228904 ]
is: [ 0.01767459 -0.02180789  0.0550945   0.0459966   0.02170557  0.03016113
 -0.01729493  0.00394569  0.00717042  0.03056948]
processing: [-0.06548396 -0.03083901  0.04779163  0.00963059 -0.05895626 -0.00925287
 -0.02581323  0.04552416 -0.01265028  0.00944011]
Word2Vec: [ 0.02607585 -0.04561068  0.00207685 -0.06296484  0.04004294  0.02914419
 -0.0326288  -0.01819486 -0.01445714 -0.03039594]
embeddings: [ 0.00535995 -0.013449   -0.02419338 -0.02730744 -0.01332071  0.00286436
  0.05213508 -0.00260303  0.03849486 -0.06083161]
Skip-gram: [ 0.02280479  0.00982097 -0.06151413 -0.03708856 -0.00076144  0.06512374
  0.01678607  0.03089136  0.02266162  0.0188115 ]
Word: [ 0.0282562   0.06555833 -0.05051512  0.02214844  0.04730291 -0.05085679
  0.00359262 -0.00315082  0.00096763 -0.01506761]
a: [-0.0493577  -0.03445909  0.05503394  0.02310545 -0.00438069 -0.00372183
  