### Embeddings
BoW and Tf-Idf are useful for topic detection, spam detection,... But when it comes to predict the next word they fail to capture the sentiment

1. Assign each number a unique number to represent the word

This will cause a fail prediction, we can have (happy = 1, bad = 30, good = 31), so good is closer to bad than happy, thus good = bad, which is not what we wanted

2. One hot, represent each word as vector of 0, and mark 1 to the corresponding position. So it don't care the problem of assigning numbers

This is eat up a lot of memory unecessary, because this vector could be very large (consider the BoW), and still fail to capture the context

3. Word2Vec, suppose we have a vector of feature for certain word, then we can measure all other word in that vector

E.g. bird = [animal,can_fly][1,1] fox = [animal,cannot_fly][1,0].

So in the example we put the words into a 2D dimension, but this could easily take more than 2 dimensions.

This representation will capture their meaning, how?

E.g. Beijing - China + Japan = Tokyo, why? Think Beijing as capital of China, then we toggle China and add Japan, becoming capital of Japan which is Tokyo

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import random, math

corpus = [
    "the quick brown fox jumps over the lazy dog",
    "i love dogs and i love cats",
    "the dog chased the cat",
    "the fox is quick and the fox is clever",
    "cats and dogs are animals",
    "the bird can fly",
    "a fox is a wild animal",
    "birds can fly in the sky",
    "dogs bark and cats meow",
    "the quick bird flew over the lazy cat"
]

In [18]:
def tokenize(text) -> list:
    text = text.lower() # lowercase
    tokens = text.split(" ")
    return tokens

out = []
for phrase in corpus:
    out+=tokenize(phrase)

counter = Counter(out)
print(counter)

# Create dicts
vocab = counter.keys()
sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)
w2i = {token: idx for idx, (token, _) in enumerate(sorted_tokens)}
i2w = {idx: token for idx, (token, _) in enumerate(sorted_tokens)}

# Skip-gram
window_size = 2
pairs = []
for phrase in corpus:
    tokens = tokenize(phrase)
    idxs = [w2i[w] for w in tokens]

    for i, center in enumerate(idxs):
        for w in range(-window_size, window_size+1):
            j = i + w
            if w == 0 or j < 0 or j >= len(idxs):
                continue
            pairs.append((center, idxs[j]))

print("\nExample pairs (center -> context):")
for i in range(5):
    print(i2w[pairs[i][0]], "->", i2w[pairs[i][1]])

n = len(sorted_tokens)
# Sample the negative (those words that never appeared with center)
# Instead of softmax on millions
freq = np.array([counter[i2w[i]] for i in range(n)], dtype=np.float64)
unigram = freq ** 0.75 # Avoid common word dominance, not sampling uniform
unigram /= unigram.sum() # normalized

embedding_dim = 25
# Embedding when its center word (hidden layer)
W_in = (np.random.rand(n,embedding_dim) - 0.5) / embedding_dim
# Embedding when its context word (output layer)
W_out = (np.random.rand(n,embedding_dim) - 0.5) / embedding_dim

def sigmoid(x):
    if x >= 0:
        z = np.exp(-x)
        return 1 / (1 + z)
    else:
        z = np.exp(x)
        return z / (1 + z)

epochs = 1000
alpha = 0.01
neg_sample = 5

for epoch in range(epochs):
    loss_sum = 0
    random.shuffle(pairs) # Don't let it memorize pairs

    for center,context in pairs:
        center_vec = W_in[center]
        output_vec = W_out[context]

        # Positive update
        score = sigmoid(np.dot(center_vec,output_vec))
        loss = -np.log(score + 1e-10)
        grad = alpha * (1-score)

        W_in[center] += grad * output_vec
        W_out[context] += grad * center_vec

        loss_sum += loss

        # Negative update
        negatives = np.random.choice(n,size=neg_sample,p=unigram)
        for neg in negatives:
            if neg == context:
                continue
            output_k = W_out[neg]
            score_neg = sigmoid(np.dot(center_vec,output_k))
            loss = -np.log(1 - score_neg + 1e-10)
            grad = alpha * (0 - score_neg)

            W_in[center] += grad * output_k
            W_out[neg] += grad * center_vec

            loss_sum += loss
        
    if (epoch + 1) % 50 == 0:
        # Every 50 epochs print loss
        print(f"Epoch {epoch+1}, Loss={loss_sum:.4f}")

embeddings = W_in

def cosine_similarity(a,b):
    return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)

def nearest(word,topn = 5):
    if word not in w2i:
        return []
    idx = w2i[word]
    vec = embeddings[idx]
    sims = [cosine_similarity(vec,embeddings[i]) for i in range(n)]
    best = np.argsort(sims)[::-1][1:topn+1]
    return [(i2w[i], sims[i]) for i in best]

print("\nNearest neighbors for 'dog':", nearest("dog"))
print("Nearest neighbors for 'fox':", nearest("fox"))

Counter({'the': 10, 'fox': 4, 'and': 4, 'quick': 3, 'dogs': 3, 'cats': 3, 'is': 3, 'over': 2, 'lazy': 2, 'dog': 2, 'i': 2, 'love': 2, 'cat': 2, 'bird': 2, 'can': 2, 'fly': 2, 'a': 2, 'brown': 1, 'jumps': 1, 'chased': 1, 'clever': 1, 'are': 1, 'animals': 1, 'wild': 1, 'animal': 1, 'birds': 1, 'in': 1, 'sky': 1, 'bark': 1, 'meow': 1, 'flew': 1})

Example pairs (center -> context):
the -> quick
the -> brown
quick -> the
quick -> brown
quick -> fox
Epoch 50, Loss=524.3849
Epoch 100, Loss=437.3974
Epoch 150, Loss=374.7613
Epoch 200, Loss=330.7061
Epoch 250, Loss=313.7642
Epoch 300, Loss=310.8752
Epoch 350, Loss=295.2842
Epoch 400, Loss=292.6350
Epoch 450, Loss=298.0102
Epoch 500, Loss=295.5132
Epoch 550, Loss=275.0126
Epoch 600, Loss=294.2135
Epoch 650, Loss=282.3212
Epoch 700, Loss=280.0014
Epoch 750, Loss=279.9481
Epoch 800, Loss=280.4740
Epoch 850, Loss=287.4539
Epoch 900, Loss=301.5977
Epoch 950, Loss=278.2768
Epoch 1000, Loss=293.7246

Nearest neighbors for 'dog': [('cat', np.float64(0

In [None]:
with open("hamlet.txt", "r", encoding="utf-8") as f:
    corpus = f.read()


In [31]:
import re

def tokenize(text):
    text = text.lower()
    # Only keep alphabetic words, drop punctuation/numbers
    tokens = re.findall(r"[a-z]+", text)
    return tokens

sample = tokenize("The king's crown, shining bright in love and death.")
print(sample)

out = []
for phrase in corpus:
    out+=tokenize(phrase)

counter = Counter(out)
print(counter)

# Create dicts
vocab = counter.keys()
sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)
w2i = {token: idx for idx, (token, _) in enumerate(sorted_tokens)}
i2w = {idx: token for idx, (token, _) in enumerate(sorted_tokens)}

# Downsample, since its too large
total_count = sum(counter.values())
threshold = 1e-5

freq = np.array([counter[i2w[i]] for i in range(len(w2i))], dtype=np.float64)
prob = freq / total_count

# P(discard word) = 1 - sqrt(t / f)
discard_prob = 1 - np.sqrt(threshold / prob)
discard_prob = np.clip(discard_prob, 0, 1)  # ensure [0,1]

filtered_tokens = []
for w in out:
    if w not in w2i:   # safeguard
        continue
    if np.random.rand() > discard_prob[w2i[w]]:
        filtered_tokens.append(w)


print("Original tokens:", len(out))
print("Filtered tokens:", len(filtered_tokens))

tokens = filtered_tokens  
# Skip-gram
window_size = 2
pairs = []

idxs = [w2i[w] for w in filtered_tokens]  # use filtered_tokens instead of corpus
for i, center in enumerate(idxs):
    for w in range(-window_size, window_size + 1):
        j = i + w
        if w == 0 or j < 0 or j >= len(idxs):
            continue
        pairs.append((center, idxs[j]))

print("\nExample pairs (center -> context):")
for i in range(min(5, len(pairs))):
    print(i2w[pairs[i][0]], "->", i2w[pairs[i][1]])

n = len(sorted_tokens)
# Sample the negative (those words that never appeared with center)
# Instead of softmax on millions
freq = np.array([counter[i2w[i]] for i in range(n)], dtype=np.float64)
unigram = freq ** 0.75 # Avoid common word dominance, not sampling uniform
unigram /= unigram.sum() # normalized

embedding_dim = 50 # Increased
# Embedding when its center word (hidden layer)
W_in = (np.random.rand(n,embedding_dim) - 0.5) / embedding_dim
# Embedding when its context word (output layer)
W_out = (np.random.rand(n,embedding_dim) - 0.5) / embedding_dim

def sigmoid(x):
    if x >= 0:
        z = np.exp(-x)
        return 1 / (1 + z)
    else:
        z = np.exp(x)
        return z / (1 + z)

epochs = 200
alpha = 0.01
neg_sample = 5

for epoch in range(epochs):
    loss_sum = 0
    random.shuffle(pairs) # Don't let it memorize pairs

    for center,context in pairs:
        center_vec = W_in[center]
        output_vec = W_out[context]

        # Positive update
        score = sigmoid(np.dot(center_vec,output_vec))
        loss = -np.log(score + 1e-10)
        grad = alpha * (1-score)

        W_in[center] += grad * output_vec
        W_out[context] += grad * center_vec

        loss_sum += loss

        # Negative update
        negatives = np.random.choice(n,size=neg_sample,p=unigram)
        for neg in negatives:
            if neg == context:
                continue
            output_k = W_out[neg]
            score_neg = sigmoid(np.dot(center_vec,output_k))
            loss = -np.log(1 - score_neg + 1e-10)
            grad = alpha * (0 - score_neg)

            W_in[center] += grad * output_k
            W_out[neg] += grad * center_vec

            loss_sum += loss
        
    if (epoch + 1) % 50 == 0:
        # Every 50 epochs print loss
        print(f"Epoch {epoch+1}, Loss={loss_sum:.4f}")

embeddings = W_in

def cosine_similarity(a,b):
    return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)

def nearest(word,topn = 5):
    if word not in w2i:
        return []
    idx = w2i[word]
    vec = embeddings[idx]
    sims = [cosine_similarity(vec,embeddings[i]) for i in range(n)]
    best = np.argsort(sims)[::-1][1:topn+1]
    return [(i2w[i], sims[i]) for i in best]


['the', 'king', 's', 'crown', 'shining', 'bright', 'in', 'love', 'and', 'death']
Counter({'e': 14960, 't': 11863, 'o': 11218, 'a': 9950, 'h': 8731, 'i': 8511, 's': 8379, 'n': 8297, 'r': 7777, 'l': 5847, 'd': 5025, 'u': 4343, 'm': 4253, 'y': 3204, 'w': 3132, 'f': 2698, 'c': 2606, 'g': 2420, 'p': 2016, 'b': 1830, 'k': 1272, 'v': 1222, 'q': 220, 'x': 179, 'j': 110, 'z': 72})
Original tokens: 130135
Filtered tokens: 1920

Example pairs (center -> context):
a -> h
a -> h
h -> a
h -> h
h -> e
Epoch 50, Loss=22680.7540
Epoch 100, Loss=24457.0781
Epoch 150, Loss=57229.7477
Epoch 200, Loss=119028.9095


In [32]:
print(nearest("king"))
print(nearest("love"))
print(nearest("death"))

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

words = ["king", "queen", "man", "woman", "love", "death", "life", "hamlet"]
vecs = np.array([embeddings[w2i[w]] for w in words if w in w2i])

pca = PCA(n_components=2)
reduced = pca.fit_transform(vecs)

plt.figure(figsize=(8,6))
for i, w in enumerate(words):
    if w in w2i:
        x, y = reduced[i]
        plt.scatter(x, y)
        plt.text(x+0.01, y+0.01, w, fontsize=12)
plt.title("Word2Vec Embeddings (PCA Projection)")
plt.show()


[]
[]
[]


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.