In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter


# Toy vocabulary: every word of the demo sentence gets an arbitrary integer id.
vocab = {
    "Hello": 72,
    "my": 44,
    "name": 21,
    "is": 93,
    "Jacob": 11,
}

sentence = ["Hello", "my", "name", "is", "Jacob"]

# Inverse lookup (id -> word), used below to print the pairs readably.
idx_to_word = {index: token for token, index in vocab.items()}

# Encode the sentence as a list of ids.
sentence_idx = [vocab[token] for token in sentence]

# Build (context -> target) pairs. Only positions that have a full window of
# neighbours on both sides are used as targets.
half_win_size = 1
data = []
for i in range(half_win_size, len(sentence_idx) - half_win_size):
    left = sentence_idx[i - half_win_size : i]
    right = sentence_idx[i + 1 : i + 1 + half_win_size]
    data.append((left + right, sentence_idx[i]))

print("Training pairs (context -> target):")
for context, target in data:
    print(f"{[idx_to_word[c] for c in context]} -> {idx_to_word[target]}")

Training pairs (context -> target):
['Hello', 'name'] -> my
['my', 'is'] -> name
['name', 'Jacob'] -> is


In [2]:
def generate_cbow_data(tokens, window_size):
    """
    Build CBOW training examples from a token sequence.

    Every position that has `window_size` neighbours on both sides becomes a
    target. Its context is those neighbours (left side first, then right side)
    with the target itself left out. Positions closer than `window_size` to
    either end are never used as targets.

    Args:
        tokens: List of token indices.
        window_size: Number of context words on each side.
    Returns:
        data: List of (context, target) pairs, ordered by target position.
    """
    stop = len(tokens) - window_size
    return [
        (
            tokens[center - window_size : center]
            + tokens[center + 1 : center + 1 + window_size],
            tokens[center],
        )
        for center in range(window_size, stop)
    ]

# Example: pair up the encoded toy sentence using one neighbour on each side.
generate_cbow_data(sentence_idx, window_size=1)

[([72, 21], 44), ([44, 93], 21), ([21, 11], 93)]

In [3]:
# Load the whole text8 file into memory as a single string.
with open("text8", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().strip()

# text8 is already space-separated, so whitespace splitting is the tokenizer.
words = text.split()

# Word frequencies (optional; handy when deciding how large the vocab should be).
word_counts = Counter(words)

In [4]:
def build_vocab_and_tokens(corpus, vocab_size=10000):
    """
    Build vocabulary and convert corpus to tokens.

    The `vocab_size - 1` most frequent words receive indices 1, 2, ... in
    descending frequency order. Index 0 is reserved for "<UNK>", which also
    stands in for every word that did not make it into the vocabulary.

    Args:
        corpus: Iterable of words (strings). A one-shot iterable such as a
            generator is accepted; it is materialized into a list first.
        vocab_size: Maximum vocabulary size (including <UNK>). Must be >= 1.
    Returns:
        tokens: List of token indices, one per word of the corpus.
        vocab: Dict mapping word -> index.
        idx_to_word: Dict mapping index -> word.
    Raises:
        ValueError: If vocab_size is smaller than 1.
    """
    if vocab_size < 1:
        raise ValueError(
            f"vocab_size must be at least 1 to hold <UNK>, got {vocab_size}"
        )

    # The corpus is traversed twice (counting, then encoding). A one-shot
    # iterable would be exhausted by the first pass and silently yield an
    # empty token list, so materialize anything that is not a list or tuple.
    if not isinstance(corpus, (list, tuple)):
        corpus = list(corpus)

    word_counts = Counter(corpus)
    most_common = word_counts.most_common(vocab_size - 1)  # -1 for UNK
    vocab = {word: idx for idx, (word, _) in enumerate(most_common, start=1)}
    vocab["<UNK>"] = 0
    idx_to_word = {idx: word for word, idx in vocab.items()}
    tokens = [vocab.get(word, 0) for word in corpus]
    return tokens, vocab, idx_to_word

# Example: encode the full corpus with a 10,000-entry vocabulary and peek at
# the first ten tokens, both as indices and mapped back to words.
tokens, vocab, idx_to_word = build_vocab_and_tokens(words, vocab_size=10000)
preview = tokens[:10]
print("Sample tokens:", preview)
print("Sample words:", [idx_to_word[token_id] for token_id in preview])

Sample tokens: [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]
Sample words: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [5]:
class CBOW(torch.nn.Module):
    """
    Continuous bag-of-words model.

    Context word embeddings are averaged and passed through a single linear
    layer that scores every word in the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim):
        """
        Args:
            vocab_size: Number of distinct token indices.
            embedding_dim: Size of each word embedding vector.
        """
        super().__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """
        Args:
            inputs: Long tensor of context word indices. Embeddings are
                averaged over dim 1, so a (batch, context_len) layout is
                expected.
        Returns:
            Log-probabilities over the vocabulary, one row per batch element.
        """
        embs = self.embeddings(inputs)
        embs = torch.mean(embs, dim=1)  # collapse the context dimension
        out = self.linear(embs)
        # log_softmax (not softmax): the result is meant for F.nll_loss.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

    def training_step(self, context, target):
        """
        Compute the loss for a single (context, target) example.

        Args:
            context: List of word indices (e.g., [72, 44, 93, 11])
            target: Target word index (e.g., 21)
        Returns:
            loss: Scalar tensor with gradients attached
        """
        # Build the inputs on the same device as the parameters; otherwise the
        # call fails with a device mismatch once the model has been moved.
        device = self.embeddings.weight.device
        context = torch.tensor([context], dtype=torch.long, device=device)
        target = torch.tensor([target], dtype=torch.long, device=device).view(-1)
        output = self(context)
        return F.nll_loss(output, target)

In [6]:
# Example usage: a quick demo on the raw words of the toy sentence so the pairs
# are easy to read. Real training should pass token indices instead of words.
window_size = 1  # context of 1 word on each side
demo_words = generate_cbow_data(sentence, window_size)
# The label says "words" because `sentence` holds words, not token indices.
print("Example CBOW pairs (as words):", demo_words[:5])

Example CBOW pairs (as token indices): [(['Hello', 'name'], 'my'), (['my', 'is'], 'name'), (['name', 'Jacob'], 'is')]


In [7]:
class CBOWDataset(Dataset):
    """Dataset wrapper around precomputed CBOW (context, target) pairs."""

    def __init__(self, cbow_data):
        # Keep a reference to the pairs; tensors are created per item on access.
        self.data = cbow_data

    def __len__(self):
        """Number of (context, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the idx-th pair as two int64 tensors: (context, target)."""
        ctx, tgt = self.data[idx]
        context_tensor = torch.tensor(ctx, dtype=torch.long)
        target_tensor = torch.tensor(tgt, dtype=torch.long)
        return context_tensor, target_tensor

def train_cbow(
    model,
    cbow_data,
    idx_to_word,
    batch_size=128,
    num_epochs=5,
    lr=0.01,
    print_every=1000,  # print every n steps
    device="cpu"
):
    """
    Train a CBOW model with plain SGD and negative log-likelihood loss.

    Args:
        model: Module mapping a batch of context indices to log-probabilities
            over the vocabulary (its output is fed to nll_loss).
        cbow_data: Sequence of (context, target) pairs of token indices.
        idx_to_word: Dict mapping index -> word, used only for the progress
            printout. Unknown indices are printed as their number.
        batch_size: Number of pairs per optimisation step.
        num_epochs: Number of passes over the data.
        lr: SGD learning rate.
        print_every: Print a sample prediction every n steps. 0 or None
            disables the per-step printout.
        device: Device the model and batches are moved to.
    Returns:
        List with the average batch loss of each epoch, in epoch order.
    Raises:
        ValueError: If cbow_data contains no pairs.
    """
    dataset = CBOWDataset(cbow_data)
    if len(dataset) == 0:
        # Fail early with a clear message instead of a later division by zero.
        raise ValueError("cbow_data is empty; nothing to train on")

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    model.to(device)

    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for step, (contexts, targets) in enumerate(dataloader):
            contexts, targets = contexts.to(device), targets.to(device)
            optimizer.zero_grad()
            output = model(contexts)
            loss = torch.nn.functional.nll_loss(output, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # A falsy print_every switches the sample printout off and avoids
            # a modulo-by-zero.
            if print_every and (step + 1) % print_every == 0:
                # Show the first example of the batch as a readable sample.
                with torch.no_grad():
                    pred_idx = output.argmax(dim=1)[0].item()
                    pred_word = idx_to_word.get(pred_idx, str(pred_idx))
                    context_words = [idx_to_word.get(c.item(), str(c.item())) for c in contexts[0]]
                    target_word = idx_to_word.get(targets[0].item(), str(targets[0].item()))
                    print(f"Epoch {epoch+1} Step {step+1}: Context: {context_words} -> Target: {target_word} | "
                          f"Predicted: {pred_word} | Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(dataloader)
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1} complete. Avg loss: {avg_loss:.4f}")

    return epoch_losses

In [8]:
# Hyperparameters for the text8 training run.
vocab_size = 5000
num_epochs = 10
embedding_dim = 10
window_size = 4  # context of 4 words on each side

tokens, vocab, idx_to_word = build_vocab_and_tokens(words, vocab_size=vocab_size)
cbow_data = generate_cbow_data(tokens, window_size)

# Training
model = CBOW(vocab_size, embedding_dim)
# Pass the configured epoch count rather than a separate literal, so the
# setting above is the single place that controls how long training runs.
train_cbow(model, cbow_data, idx_to_word, batch_size=128, num_epochs=num_epochs, lr=0.01)

Epoch 1 Step 1000: Context: ['after', 'the', 'removal', 'of', 'mother', 'by', 'peter', 'the'] -> Target: his | Predicted: <UNK> | Loss: 8.1623
Epoch 1 Step 2000: Context: ['be', 'accomplished', 'when', 'a', 's', 'license', 'is', '<UNK>'] -> Target: driver | Predicted: <UNK> | Loss: 7.6547
Epoch 1 Step 3000: Context: ['do', 'not', 'call', 'themselves', 'but', 'are', 'related', 'to'] -> Target: <UNK> | Predicted: <UNK> | Loss: 7.5675
Epoch 1 Step 4000: Context: ['a', 'single', '<UNK>', '<UNK>', 'only', 'be', 'parallel', '<UNK>'] -> Target: can | Predicted: <UNK> | Loss: 7.3775
Epoch 1 Step 5000: Context: ['influence', 'on', '<UNK>', 'and', 'philosophy', 'of', 'religion', 'on'] -> Target: contemporary | Predicted: <UNK> | Loss: 6.9612
Epoch 1 Step 6000: Context: ['of', 'apollo', 'and', 'to', 'apollo', 'the', 'god', 'of'] -> Target: challenge | Predicted: <UNK> | Loss: 7.0594
Epoch 1 Step 7000: Context: ['nine', 'six', 'the', 'first', 'of', 'la', '<UNK>', '<UNK>'] -> Target: publication | 

KeyboardInterrupt: 