In [None]:
import re
import numpy as np


import tensorflow as tf
from tensorflow  import keras
from keras.layers import Dense,Softmax,Flatten,Embedding

SEED = 12
# Seed Python's `random`, NumPy and the TensorFlow/Keras backend in one call.
# Seeding NumPy alone does not cover weight initialisation or batch shuffling,
# so the trained embeddings would differ from run to run.
keras.utils.set_random_seed(SEED)

In [1]:
# Sample corpus for the word-embedding demo. The pieces are joined by implicit
# string-literal concatenation, so `text` is one line with no embedded newlines.
text = (
    "Machine learning is the study of computer algorithms that "
    "improve automatically through experience. It is seen as a "
    "subset of artificial intelligence. Machine learning algorithms "
    "build a mathematical model based on sample data, known as "
    "training data, in order to make predictions or decisions without "
    "being explicitly programmed to do so. Machine learning algorithms "
    "are used in a wide variety of applications, such as email filtering "
    "and computer vision, where it is difficult or infeasible to develop "
    "conventional algorithms to perform the needed tasks."
)

In [2]:
def text_cleaning(text):
    """Lower-case ``text`` and split it into word tokens, dropping punctuation.

    A token is a maximal run of word characters (letters, digits, underscore)
    and apostrophes that contains at least one ASCII letter. Contractions such
    as "don't" therefore stay whole, while purely numeric runs are discarded.

    Args:
        text: The raw input string.

    Returns:
        list[str]: The lower-cased tokens in order of appearance.
    """
    # A `^` that is not the first character of a character class is a literal
    # caret, so it is deliberately left out of the class: carets are
    # punctuation and must not end up inside tokens. Only lower-case letters
    # need matching because the text is lower-cased before the search.
    pattern = re.compile(r"[\w']*[a-z][\w']*")

    return pattern.findall(text.lower())

In [3]:
# Tokenize the corpus and report how many tokens it produced.
clean_text = text_cleaning(text)
print(len(clean_text))

NameError: name 're' is not defined

In [None]:
# Show the full token list returned by text_cleaning.
print(clean_text)

In [None]:
def lookup_table(tokens):
    """Build the two vocabulary mappings for a token sequence.

    The distinct tokens are sorted and numbered from 0 in that order.

    Args:
        tokens: Iterable of hashable, mutually comparable tokens.

    Returns:
        tuple[dict, dict]: ``(word_to_id, id_to_word)``, the token -> index
        mapping and its inverse.
    """
    vocabulary = sorted(set(tokens))
    word_to_id = {word: index for index, word in enumerate(vocabulary)}
    id_to_word = dict(enumerate(vocabulary))
    return word_to_id, id_to_word

In [None]:
# Build both vocabulary mappings from the cleaned tokens.
word_id, id_word = lookup_table(clean_text)
print("Vocab size:", len(word_id))

In [None]:
# Replace every token of the corpus with its vocabulary index.
encoded_text = [
    word_id[token]
    for token in clean_text
]
print(encoded_text)

In [None]:
# Debugging aid (disabled): print each token next to its vocabulary index.
# for i in clean_text:
#     print(i, word_id[i])

In [None]:
# Show the word -> index mapping.
print(word_id)

In [None]:
# Show the inverse index -> word mapping.
print(id_word)

In [None]:
def generate_training_data(tokens, word_to_id, window):
    """Build skip-gram (center, context) index pairs from a token sequence.

    For every position ``i``, each token at most ``window`` positions to the
    left or right of it (clipped to the sequence bounds) yields one pair.
    Left-hand contexts come before right-hand ones.

    Args:
        tokens: Sequence of tokens.
        word_to_id: Mapping from token to integer index.
        window: Maximum distance between a center word and a context word.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(X, y)`` 1-D arrays of equal length.
        ``X[k]`` is the center-word index and ``y[k]`` the context-word index
        of the k-th pair. When no pair exists, both arrays are empty with an
        integer dtype.

    Raises:
        KeyError: If a token that takes part in a pair is missing from
            ``word_to_id``.
    """
    centers = []
    contexts = []
    n_tokens = len(tokens)

    for i in range(n_tokens):
        first = max(0, i - window)
        last = min(n_tokens, i + window + 1)
        for j in range(first, last):
            if j == i:
                continue  # a word is not its own context
            centers.append(word_to_id[tokens[i]])   # center word index
            contexts.append(word_to_id[tokens[j]])  # context word index

    if not centers:
        # np.array([]) defaults to float64, which cannot be used as indices;
        # return integer-typed empty arrays instead.
        return np.array([], dtype=int), np.array([], dtype=int)

    return np.array(centers), np.array(contexts)


In [None]:
# Skip-gram training pairs: X holds center-word indices, y context-word indices.
X, y = generate_training_data(clean_text, word_id, window=2)

embedding_dim = 10
vocab_size = len(word_id)  # number of distinct tokens in the cleaned text

# The softmax output is trained against one-hot context targets.
y = keras.utils.to_categorical(y, num_classes=vocab_size)

# Embedding lookup -> flatten -> softmax over the vocabulary.
model = keras.Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1, name="embedding"))
model.add(Flatten())
model.add(Dense(vocab_size, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy")
model.fit(X, y, epochs=10, batch_size=32)


In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

**Get embedding weights**

In [None]:
# Get the weights from the embedding layer
embedding_weights = model.get_layer("embedding").get_weights()[0]
print(embedding_weights.shape)

embedding_weights[0]

**Map words to embeddings**

In [None]:
# Example: get embedding for the word "machine"
# Look up the learned vector for one example word.
word = "machine"
word_idx = word_id[word]
word_vector = embedding_weights[word_idx]

print("word_idx:", word_idx)
print(f"Embedding for '{word}':\n", word_vector)

In [None]:
# Number of rows in the embedding matrix.
len(embedding_weights)

In [None]:
# Fetch the embedding row for `word_idx` again and display it.
word_vector = embedding_weights[word_idx]
word_vector

In [None]:
# Map every vocabulary word to its row of the embedding matrix.
word_embeddings = {
    token: embedding_weights[index]
    for token, index in word_id.items()
}

In [None]:
# Define N here so this cell runs on a fresh kernel; N is otherwise only
# assigned in a later cell, which makes this line raise NameError.
N = 30
words = list(word_embeddings.keys())[:N]


**Reduce dimensions (PCA or t-SNE)**

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Visualise the first N vocabulary words (in dictionary order).
N = 30
words = list(word_embeddings)[:N]
X = np.array([word_embeddings[token] for token in words])

# 2-D projection with PCA.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Separate 2-D projection with t-SNE, fitted on the embeddings themselves
# (not on the PCA output).
tsne = TSNE(n_components=2, random_state=42, perplexity=5, max_iter=2000)
X_tsne = tsne.fit_transform(X)



In [None]:
# First PCA coordinate of every plotted word.
X_pca[:,0]

In [None]:
def plot_embeddings(points, labels, title):
    """Draw a labelled 2-D scatter plot of word vectors.

    Args:
        points: Array indexed as ``points[i, 0]`` / ``points[i, 1]`` for the
            x / y coordinate of the i-th word.
        labels: Words to annotate, in the same order as the rows of ``points``.
        title: Title shown above the plot.
    """
    _, ax = plt.subplots(figsize=(26, 6))
    ax.scatter(points[:, 0], points[:, 1], c='skyblue')
    # Place each label slightly up and to the right of its marker.
    for row, label in enumerate(labels):
        label_position = (points[row, 0] + 0.01, points[row, 1] + 0.01)
        ax.annotate(label, label_position)
    ax.set_title(title)
    plt.show()

# Render both 2-D projections of the same words.
plot_embeddings(X_pca, words, "Word Embeddings (PCA)")
plot_embeddings(X_tsne, words, "Word Embeddings (t-SNE)")


**Compare similarity between words:**


In [None]:
# Build the word -> vector lookup. A comprehension keeps its loop variables
# local, so the notebook-level `word` set in an earlier cell is not overwritten
# and no stray `idx` is left behind.
word_to_vec = {token: embedding_weights[index] for index, token in id_word.items()}

# Example: get embedding for the word "machine"
print("machine vector:", word_to_vec["machine"])


In [None]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First 1-D vector (array-like of numbers).
        vec2: Second 1-D vector with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when
        either vector has zero norm, for which the cosine is undefined.
    """
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        # Avoid 0/0 -> nan (plus a RuntimeWarning) for all-zero vectors.
        return 0.0
    return dot(vec1, vec2) / denominator

# Compare the learned vectors of two words from the corpus.
print("Similarity(machine, learning):", cosine_similarity(word_to_vec["machine"], word_to_vec["learning"]))


**Parameter Calculation**

1. Embedding Layer

Formula:

$$
\mathrm{params} = \mathrm{vocab\_size} \times \mathrm{embedding\_dim}
$$

From the table: 600 params

You set embedding_dim = 10, so:

$$
600 = \mathrm{vocab\_size} \times 10 \implies \mathrm{vocab\_size} = 60
$$

✅ So your vocabulary size is 60 words.

2. Flatten Layer

Flatten has no trainable parameters.

It just reshapes (None, 10) → (None, 10).

So params = 0.

3. Dense Layer

Formula:

$$
\mathrm{params} = (\mathrm{input\_dim} \times \mathrm{output\_dim}) + \mathrm{bias\_terms}
$$

Input dim = 10 (from embedding).

Output dim = vocab_size = 60.

Bias terms = 60.

So:

$$
(10 \times 60) + 60 = 600 + 60 = 660
$$

**Why is input_length = 1?**

In Skip-gram training, each training sample consists of one center word.

That means your model sees input shaped like [center_word_index] (just a single integer).

Example:

If the sentence is ["machine", "learning", "is", "fun"]

A sample could be (center="learning", context="machine")

Input to the model: [word_id["learning"]] → a single index → input length = 1.

So the input sequence length is 1, because you're not feeding multiple words at once — only one word per training example.




If you change input_length to 2, you are no longer really doing Skip-gram. Here's why:


**Skip-gram**

Input = 1 center word

Output = 1 context word

So input_length = 1

(center = "learning") → predict ("machine")
(center = "learning") → predict ("is")



**CBOW (Continuous Bag of Words)**

Input = multiple context words (window around the center)

Output = 1 center word

So input_length > 1

Example (window = 2):


(context = ["machine", "is"]) → predict ("learning")

**Summary**

Simply setting input_length=2 in your current Skip-gram code means the model is no longer Skip-gram.

If you keep your training data as (center → context) pairs, then input length must stay 1.

If you change it to (context → center) training pairs, then input length can be 2, 3, … depending on window size → that becomes CBOW.

**Example: "I love natural language processing"**

Window size = 2

Center = "natural"

Contexts = ["I", "love", "language", "processing"]

Training pairs generated:

(natural → I)
(natural → love)
(natural → language)
(natural → processing)


Here:

Input length = 1 (just "natural")

Output is one of its context words (but since training loops over them, the center word learns to predict all of them).


**Important detail**

Skip-gram doesn't predict "all at once." Instead:

For "natural" as input, the training dataset contains multiple samples, one per context.

So the model sees "natural" → "I", then "natural" → "love", etc.

Over training, the embedding learns to place "natural" close in vector space to its contexts.


You pass input of length 1 (the center word).

The model learns to predict all possible surrounding words (but through multiple samples, not a single multi-output).