<a href="https://colab.research.google.com/github/Mohamedh0/Amit/blob/main/WordEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Word Embeddings

### Building a Word2Vec Model using CBOW

In [1]:
# import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten

# Sample corpus
corpus = [
    "the dog barks",
    "the dog chases the cat",
    "the cat sleeps",
    "the dog runs fast"
]

# 1. Data Preparation: Create a context-target pair (CBOW)
window_size = 2  # Context size is 2, meaning 2 words before and 2 words after the target

# Tokenizer to convert text to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1  # Add 1 for padding (if needed)

# Convert words to integer tokens
sequences = tokenizer.texts_to_sequences(corpus)

# Generate context-target pairs (X is context, y is target)
x = []
y = []

for seq in sequences:
    for i in range(window_size, len(seq) - window_size):
        context = seq[i - window_size:i] + seq[i + 1:i + window_size + 1]
        target = seq[i]
        x.append(context)
        y.append(target)

# Convert x and y to numpy arrays
x = np.array(x)
y = np.array(y)

# 2. CBOW Model Creation
embedding_dim = 50  # size of the embeddings vector
model = Sequential()
model.add(Embedding(total_words, embedding_dim, input_length=window_size * 2))  # Embedding layer
model.add(Flatten())  # Flatten the output of the embedding layer
model.add(Dense(total_words, activation='softmax'))  # Dense layer for prediction

# 3. Model Compilation
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 4. Train the model
model.fit(x, y, epochs=100, verbose=1)

Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 1.0000 - loss: 2.1292
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 1.0000 - loss: 2.1057
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 2.0822
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 1.0000 - loss: 2.0587
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 1.0000 - loss: 2.0352
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 1.0000 - loss: 2.0115
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 1.0000 - loss: 1.9879
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 1.0000 - loss: 1.9641
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

<keras.src.callbacks.history.History at 0x78e1e390f710>

## Skip-Gram

In [2]:
# Skip-Gram Model Implementation

# Context size (Window Size)
window_size = 2

# Generate context-target pairs for Skip-Gram
x_skipgram = []
y_skipgram = []

for seq in sequences:
    for i in range(window_size, len(seq) - window_size):
        target = seq[i]
        context = seq[i - window_size:i] + seq[i + 1:i + window_size + 1]
        for word in context:
            x_skipgram.append([target])
            y_skipgram.append(word)

# Convert x_skipgram and y_skipgram to numpy arrays
x_skipgram = np.array(x_skipgram)
y_skipgram = np.array(y_skipgram)

# 2. Skip-Gram Model Creation
embedding_dim = 50  # size of the embeddings vector
model_skipgram = Sequential()
model_skipgram.add(Embedding(total_words, embedding_dim, input_length=1))  # Embedding layer
model_skipgram.add(Flatten())  # Flatten the output of the embedding layer
model_skipgram.add(Dense(total_words, activation='softmax'))  # Dense layer for prediction

# 3. Model Compilation
model_skipgram.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 4. Train the model
model_skipgram.fit(x, y, epochs=100, verbose=1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 2.1833
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.0000e+00 - loss: 2.1569
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0000e+00 - loss: 2.1305
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 1.0000 - loss: 2.1042
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 2.0779
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 1.0000 - loss: 2.0516
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 1.0000 - loss: 2.0253
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 1.0000 - loss: 1.9991
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x78e1e388bbd0>

## N-Skip-Gram Model




In [3]:
# N-Skip-Gram Implementation
N=3 # We can vary N to predict more context words

x_nskipgram = []
y_nskipgram = []

for seq in sequences:
    for i in range(window_size, len(seq) - window_size):
        target = seq[i]
        context = seq[i - N:i] + seq[i + 1:i + N + 1]
        for word in context:
            x_nskipgram.append([target])
            y_nskipgram.append(word)

x_nskipgram = np.array(x_nskipgram)
y_nskipgram = np.array(y_nskipgram)

model_nskipgram = Sequential()
model_nskipgram.add(Embedding(total_words, embedding_dim, input_length=1))
model_nskipgram.add(Flatten())
model_nskipgram.add(Dense(total_words, activation='softmax'))

model_nskipgram.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_nskipgram.fit(x, y, epochs=100, verbose=1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 931ms/step - accuracy: 0.0000e+00 - loss: 2.1978
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.0000e+00 - loss: 2.1746
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0000e+00 - loss: 2.1515
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 2.1284
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 1.0000 - loss: 2.1055
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 1.0000 - loss: 2.0826
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 1.0000 - loss: 2.0598
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 1.0000 - loss: 2.0370
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x78e1e318efd0>

# What Are Pre-Trained Word Embeddings?

Pre-trained word embeddings are vector representations of words learned from large text corpora. These embeddings capture the semantic relationships between words and can be directly used in downstream tasks like text classification, sentiment analysis, and more.
## Benefits of Pre-Trained Embeddings

- **Rich Semantic Relationships**: They encapsulate meaningful word relationships, such as analogies (e.g., "king" - "man" + "woman" ≈ "queen").
- **Improved Performance**: Using pre-trained embeddings often yields better results compared to training embeddings from scratch, especially for small datasets.
- **Reduced Training Time**: Leveraging pre-trained embeddings reduces the computational effort and time needed to train models.


# Word2Vec: Skip-Gram Model

The **Skip-Gram Model** predicts the context words surrounding a target word within a specified window size.

## Objective Function

The Skip-Gram Model maximizes the conditional probability of the context words \( w_{t+j} \) given the target word \( w_t \). The objective function is:

$$
J(\theta) = -\sum_{t=1}^{T} \sum_{\substack{-C \leq j \leq C \\ j \neq 0}} \log p(w_{t+j} \mid w_t)
$$

Where:
- \( T \): Total number of words in the corpus.
- \( w_t \): Target word at position \( t \).
- \( w_{t+j} \): Context words within the window size \( C \), excluding \( w_t \) itself (\( j \neq 0 \)).
- \( p(w_{t+j} \mid w_t \): Probability of observing \( w_{t+j} \) given \( w_t \).

## Conditional Probability

The probability \( p(w_{c} \mid w_t) \) is defined as:

$$
p(w_{c} \mid w_t) = \frac{\exp \left( \mathbf{v}_{w_c}^\top \mathbf{v}_{w_t} \right)}{\sum_{w \in V} \exp \left( \mathbf{v}_w^\top \mathbf{v}_{w_t} \right)}
$$

Where:
- \( \mathbf{v}_{w_t} \): Embedding vector for the target word \( w_t \).
- \( \mathbf{v}_{w_c} \): Embedding vector for the context word \( w_c \).
- \( V \): Vocabulary size.


# Word2Vec: CBOW Model

The **CBOW Model** predicts the target word based on its surrounding context words.

## Objective Function

The CBOW Model maximizes the probability of predicting the target word \( w_t \) given its context words:

$$
J(\theta) = -\sum_{t=1}^{T} \log p(w_t \mid \{w_{t-C}, ..., w_{t-1}, w_{t+1}, ..., w_{t+C}\})
$$

Where:
- \( \{w_{t-C}, ..., w_{t-1}, w_{t+1}, ..., w_{t+C}\} \): Context words around \( w_t \).
- \( p(w_t \mid \text{context}) \): Probability of observing the target word given the context.


# Comparison: Skip-Gram vs. CBOW

| Feature              | Skip-Gram                         | CBOW                              |
|----------------------|-----------------------------------|-----------------------------------|
| **Objective**        | Predict context from target word | Predict target word from context |
| **Strengths**        | Works well for rare words         | Faster and computationally less expensive |
| **Use Case**         | Sparse datasets                  | Large corpora with common words  |


# GloVe: Global Vectors for Word Representation

## Objective Function

GloVe learns word embeddings by factorizing a word co-occurrence matrix \( X \). The objective function is:

$$
J = \sum_{i,j} f(X_{ij}) \left( w_i^T w_j + b_i + b_j - \log X_{ij} \right)^2
$$

Where:
- \( f(X_{ij}) \): Weighting function for co-occurrence.
- \( w_i, w_j \): Word embeddings for words \( i \) and \( j \).
- \( b_i, b_j \): Bias terms for words \( i \) and \( j \).

## Weighting Function

The weighting function \( f(X_{ij}) \) is:

$$
f(X_{ij}) =
\begin{cases}
\left( \frac{X_{ij}}{X_{\text{max}}} \right)^\alpha & \text{if } X_{ij} < X_{\text{max}} \\
1 & \text{otherwise}
\end{cases}
$$


# FastText: Subword Information

FastText incorporates subword information by representing each word as a bag of character n-grams.

## Embedding Generation

The embedding for a word is the sum of the embeddings of its subword n-grams:

$$
\text{Emb}(w) = \sum_{\text{ngram} \in \text{ngrams}(w)} \text{Emb}(\text{ngram})
$$

## Example

For the word **"apple"**, with an n-gram size of 2:
- Character n-grams: `ap`, `pp`, `pl`, `le`.
- The embedding for **"apple"** is:

$$
\text{Emb}(\text{"apple"}) = \text{Emb}(\text{"ap"}) + \text{Emb}(\text{"pp"}) + \text{Emb}(\text{"pl"}) + \text{Emb}(\text{"le"})
$$
