Train custom Word2Vec word embeddings using TensorFlow and a skip-gram model on a small text corpus.

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import itertools
from collections import defaultdict

In [3]:
# Toy training corpus: five short, lowercase, punctuation-free sentences.
# Each string is one training sentence; context windows never cross sentences.
sentences = [
    "machine learning is fun",
    "deep learning is part of machine learning",
    "natural language processing is a field of ai",
    "word embeddings are learned representations",
    "tensorflow makes it easy to build models",
]

In [4]:
# Tokenize the corpus and build the word <-> integer-ID lookup tables.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)

word2idx = tokenizer.word_index
idx2word = {index: word for word, index in word2idx.items()}

# Word IDs start at 1 (see the printed mapping), so reserve one extra slot
# for the unused index 0 when sizing the embedding and output layers.
vocab_size = len(word2idx) + 1

print(word2idx)
print(idx2word)

{'learning': 1, 'is': 2, 'machine': 3, 'of': 4, 'fun': 5, 'deep': 6, 'part': 7, 'natural': 8, 'language': 9, 'processing': 10, 'a': 11, 'field': 12, 'ai': 13, 'word': 14, 'embeddings': 15, 'are': 16, 'learned': 17, 'representations': 18, 'tensorflow': 19, 'makes': 20, 'it': 21, 'easy': 22, 'to': 23, 'build': 24, 'models': 25}
{1: 'learning', 2: 'is', 3: 'machine', 4: 'of', 5: 'fun', 6: 'deep', 7: 'part', 8: 'natural', 9: 'language', 10: 'processing', 11: 'a', 12: 'field', 13: 'ai', 14: 'word', 15: 'embeddings', 16: 'are', 17: 'learned', 18: 'representations', 19: 'tensorflow', 20: 'makes', 21: 'it', 22: 'easy', 23: 'to', 24: 'build', 25: 'models'}


In [6]:
# Generate skip-gram (target ID, context ID) pairs.
# For every position in every sentence, pair the word at that position with
# each word at most `window_size` positions to its left or right.
window_size = 2
sequences = tokenizer.texts_to_sequences(sentences)

pairs = []
for token_ids in sequences:
    for pos, center_id in enumerate(token_ids):
        # max(..., 0) keeps the left slice from wrapping around to the end
        # of the list when the window would start before position 0.
        left = token_ids[max(pos - window_size, 0):pos]
        right = token_ids[pos + 1:pos + 1 + window_size]
        pairs.extend((center_id, neighbor_id) for neighbor_id in left + right)

# Purpose: create (center word, context word) pairs within a sliding window (skip-gram).
# Example: for "machine learning is fun" with window_size=2, "learning" → "machine", "is", "fun".

In [8]:
# Split the (target, context) pairs into two parallel NumPy arrays:
# targets[k] and contexts[k] together form the k-th training pair.
targets, contexts = (np.array(column) for column in zip(*pairs))

In [9]:
# One-hot encode the context word IDs; these are the labels the model is
# trained to predict from each target word.
context_labels = tf.keras.utils.to_categorical(
    contexts,
    num_classes=vocab_size,
)

In [10]:
# Define the skip-gram model: a single target word ID goes in, and a softmax
# distribution over the vocabulary (the predicted context word) comes out.
embedding_dim = 64
input_word = tf.keras.Input(shape=(1,))

# Name the layer explicitly. Keras auto-generated names receive a numeric
# suffix ("embedding_1", "embedding_2", ...) whenever this cell is re-run in
# the same session, which would break the later
# model.get_layer('embedding') lookup.
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    name='embedding',
)(input_word)

# The input carries one word ID per sample, so flatten the per-sample
# embedding output down to a single vector of length embedding_dim.
x = tf.keras.layers.Reshape((embedding_dim,))(embedding)
output = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)
model = tf.keras.Model(inputs=input_word, outputs=output)

In [11]:
# Labels are one-hot vectors, so use (dense) categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [12]:
# Train for 100 epochs on the (target -> one-hot context) pairs;
# verbose=0 suppresses the per-epoch progress output.
model.fit(
    x=targets,
    y=context_labels,
    epochs=100,
    verbose=0,
)

<keras.src.callbacks.history.History at 0x2055dbcf2b0>

In [13]:
# Extract and view the learned embeddings.
# Locate the Embedding layer by type instead of by the auto-generated name
# 'embedding': Keras appends a numeric suffix ("embedding_1", ...) when the
# model cell is re-run in the same session, so a name lookup can fail.
embedding_layer = next(
    layer for layer in model.layers
    if isinstance(layer, tf.keras.layers.Embedding)
)
# The layer's first weight array holds one embedding row per word ID.
embedding_weights = embedding_layer.get_weights()[0]

for word, idx in word2idx.items():
    vec = embedding_weights[idx][:5]  # First 5 dims for display
    print(f"{word}: {vec.round(3)}")

learning: [ 0.224 -0.207  0.314  0.077 -0.114]
is: [ 0.224 -0.036  0.246  0.339 -0.104]
machine: [ 0.098 -0.362  0.208  0.175  0.148]
of: [ 0.127 -0.103  0.202  0.032 -0.121]
fun: [ 0.201 -0.212  0.198  0.032  0.235]
deep: [ 0.2   -0.201  0.22   0.061  0.225]
part: [ 0.333 -0.084  0.119  0.07   0.007]
natural: [ 0.216  0.235  0.005  0.291 -0.132]
language: [ 0.24   0.198  0.002 -0.034 -0.076]
processing: [ 0.279  0.075  0.137 -0.193 -0.035]
a: [ 0.254 -0.011  0.019 -0.034 -0.148]
field: [ 0.203 -0.244 -0.121 -0.238 -0.079]
ai: [ 0.142 -0.133 -0.115 -0.039 -0.226]
word: [-0.212 -0.22   0.038 -0.076 -0.116]
embeddings: [-0.217 -0.295 -0.102 -0.113 -0.281]
are: [-0.324 -0.181 -0.316 -0.117  0.061]
learned: [-0.188 -0.168 -0.119 -0.187  0.034]
representations: [-0.219 -0.217 -0.088 -0.055 -0.186]
tensorflow: [-0.178 -0.017  0.191  0.093  0.255]
makes: [-0.304 -0.076  0.204 -0.037  0.315]
it: [-0.184  0.169 -0.016 -0.321  0.288]
easy: [-0.069  0.059  0.033 -0.152  0.29 ]
to: [-0.031 -0.057 