In [12]:
import json
from typing import cast
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from transformers import AutoTokenizer
from pprint import pprint

In [13]:
PAGES = json.load(open("../10k-vital-articles/data/pages.json"))
TOKENIZER = AutoTokenizer.from_pretrained("gpt2")

In [14]:
sequences: list[list[int]] = []
id_to_token: dict[int, str] = {}


for content in [PAGES[i]["wiki_intro"] for i in range(10)]:
# for content in ["Hey hey hey! How's it going?"]:
    batch_encoding = TOKENIZER(content, return_tensors="pt")
    tokens = TOKENIZER.tokenize(content)

    input_ids: list[int] = cast(torch.Tensor, batch_encoding["input_ids"]).flatten().tolist()

    for input_id, token in zip(input_ids, tokens):
        id_to_token[int(input_id)] = token

    sequences.append(input_ids)

print("After converting our words in the corpus into vector of integers:")
print(sequences)

Token indices sequence length is longer than the specified maximum sequence length for this model (1374 > 1024). Running this sequence through the model will result in indexing errors


After converting our words in the corpus into vector of integers:
[[27007, 16438, 6764, 91, 35443, 8876, 290, 3356, 11709, 198, 27007, 6395, 3544, 91, 2025, 9282, 91, 2025, 998, 1042, 357, 6381, 4131, 328, 2288, 14726, 2025, 998, 396, 357, 6381, 4131, 328, 2288, 8, 11709, 198, 27007, 10248, 2708, 11709, 198, 27007, 47, 79, 12, 325, 11632, 12, 521, 891, 11709, 198, 27007, 11041, 3517, 3594, 91, 4475, 28, 17908, 33448, 11709, 198, 27007, 11041, 288, 1820, 9667, 91, 4475, 28, 18517, 48609, 11709, 198, 27007, 11041, 34464, 2366, 17815, 91, 4475, 28, 6747, 1160, 1954, 11709, 198, 27007, 2025, 998, 1042, 40217, 11709, 198, 198, 7061, 6, 2025, 998, 1042, 7061, 6, 318, 257, 16410, 23149, 8876, 11907, 290, 16410, 35443, 3356, 91, 21084, 434, 11907, 326, 12932, 284, 35531, 477, 6712, 326, 43874, 16410, 9800, 414, 60, 4357, 32000, 11, 393, 16410, 20636, 18911, 91, 71, 959, 9282, 60, 4357, 7525, 10822, 262, 16410, 5219, 357, 16104, 414, 14726, 5219, 11907, 290, 16410, 27544, 1042, 60, 4083, 27, 54

In [15]:
# Define parameters
vocab_size = len(TOKENIZER)
embedding_size = 10
window_size = 2

# Generate context-target pairs
contexts = []
targets = []
for seq in sequences:
    # Skip sentences not long enough for the window
    if len(seq) < 2 * window_size + 1:
        continue
    for i in range(window_size, len(seq) - window_size):
        context = seq[i - window_size:i] + seq[i + 1:i + window_size + 1]
        target = seq[i]
        contexts.append(context)
        targets.append(target)

# Convert context and target lists to tensors
X = torch.tensor(contexts, dtype=torch.long)
y = torch.tensor(targets, dtype=torch.long)

# Define the CBOW model in PyTorch
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, x):
        # x has shape (batch_size, context_size)
        embeds = self.embeddings(x)   # (batch, context_size, embedding_size)
        # Average the embeddings from context words
        avg_embeds = torch.mean(embeds, dim=1)
        logits = self.linear(avg_embeds)
        return logits

context_size = 2 * window_size
model = CBOW(vocab_size, embedding_size, context_size)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X)  # (batch_size, vocab_size)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
    # Optionally print loss occasionally
    # if (epoch + 1) % 20 == 0:
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Extract the embeddings
embeddings = model.embeddings.weight.detach().numpy()

Epoch 1/100, Loss: 10.9313
Epoch 2/100, Loss: 10.8764
Epoch 3/100, Loss: 10.8221
Epoch 4/100, Loss: 10.7681
Epoch 5/100, Loss: 10.7141
Epoch 6/100, Loss: 10.6597
Epoch 7/100, Loss: 10.6048
Epoch 8/100, Loss: 10.5492
Epoch 9/100, Loss: 10.4926
Epoch 10/100, Loss: 10.4347
Epoch 11/100, Loss: 10.3755


KeyboardInterrupt: 

In [None]:
%matplotlib qt

In [11]:
# Perform PCA to reduce the dimensionality of the embeddings
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Visualize the embeddings (ignore index 0 as it is not assigned to a word)
plt.figure(figsize=(5, 5))

# Compute min and max values for normalization
x_coords = reduced_embeddings[:, 0]
y_coords = reduced_embeddings[:, 1]
x_min, x_max = np.min(x_coords), np.max(x_coords)
y_min, y_max = np.min(y_coords), np.max(y_coords)

for input_id, token in id_to_token.items():
    x_coord, y_coord = reduced_embeddings[input_id]
    # Normalize x and y coordinates to the range [0, 1]
    norm_x = (x_coord - x_min) / (x_max - x_min) if (x_max - x_min) != 0 else 0.5
    norm_y = (y_coord - y_min) / (y_max - y_min) if (y_max - y_min) != 0 else 0.5
    # Use normalized x and y to determine the color (red and green components)
    color = (norm_x, norm_y, 0.5)
    plt.scatter(x_coord, y_coord, c=[color])
    plt.annotate(token, xy=(x_coord, y_coord), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')

plt.title("Word Embeddings Visualized")
plt.xlabel("PCA Dimension 1")
plt.ylabel("PCA Dimension 2")
plt.show()

In [6]:
import plotly.express as px
import pandas as pd

data = {
    'x': [],
    'y': [],
    'token': [],
}

for input_id, token in id_to_token.items():
    x_coord, y_coord = reduced_embeddings[input_id]
    data['x'].append(x_coord)
    data['y'].append(y_coord)
    data['token'].append(token)

df = pd.DataFrame(data)

fig = px.scatter(df, x='x', y='y', text='token', title="Word Embeddings Visualized")
fig.update_traces(textposition='top center')
fig.show()