In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter

In [11]:
# 1. Data preparation
# Toy corpus: three short lowercase sentences whose words are separated by
# single spaces, so they can be tokenized with a plain str.split().
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat and the dog are friends",
]

In [22]:
# 2. Build the vocabulary
# Whitespace-tokenize every sentence of the corpus.
tokenized_corpus = list(map(str.split, corpus))

# Tally how many times each word occurs across all sentences.
word_counts = Counter()
for tokens in tokenized_corpus:
    word_counts.update(tokens)

# Word -> integer index, numbered in order of first appearance in the corpus
# (Counter preserves insertion order).
vocab = dict(zip(word_counts, range(len(word_counts))))

# Number of distinct words.
vocab_size = len(vocab)

In [None]:
# 3. Generate (center word, context word) training pairs
window_size = 2  # number of neighbours on each side that count as context
training_data = []

for sentence in tokenized_corpus:
    # Translate the sentence to vocabulary indices once, then work on indices.
    sentence_ids = [vocab[token] for token in sentence]
    last = len(sentence_ids)
    for i, center_idx in enumerate(sentence_ids):
        # Clamp the window to the sentence boundaries instead of testing each offset.
        window_start = max(0, i - window_size)
        window_stop = min(last, i + window_size + 1)
        for pos in range(window_start, window_stop):
            if pos == i:
                continue  # the center word is not its own context
            # (center word index, context word index)
            training_data.append((center_idx, sentence_ids[pos]))

training_data

[(0, 1),
 (0, 2),
 (1, 0),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 0),
 (3, 1),
 (3, 2),
 (3, 0),
 (3, 4),
 (0, 2),
 (0, 3),
 (0, 4),
 (4, 3),
 (4, 0),
 (0, 5),
 (0, 2),
 (5, 0),
 (5, 2),
 (5, 3),
 (2, 0),
 (2, 5),
 (2, 3),
 (2, 0),
 (3, 5),
 (3, 2),
 (3, 0),
 (3, 6),
 (0, 2),
 (0, 3),
 (0, 6),
 (6, 3),
 (6, 0),
 (0, 1),
 (0, 7),
 (1, 0),
 (1, 7),
 (1, 0),
 (7, 0),
 (7, 1),
 (7, 0),
 (7, 5),
 (0, 1),
 (0, 7),
 (0, 5),
 (0, 8),
 (5, 7),
 (5, 0),
 (5, 8),
 (5, 9),
 (8, 0),
 (8, 5),
 (8, 9),
 (9, 5),
 (9, 8)]

In [24]:
# 4. Skip-gram model definition
class SkipGram(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection.

    The embedding table holds one `embedding_dim`-sized vector per vocabulary
    entry; the linear layer maps such a vector to one score per vocabulary
    entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of rows in the embedding table and number of
                output scores.
            embedding_dim: length of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.output_layer = nn.Linear(
            in_features=embedding_dim, out_features=vocab_size
        )

    def forward(self, center_word):
        """Return unnormalized scores over the vocabulary.

        Args:
            center_word: integer tensor of word indices to look up.

        Returns:
            Tensor whose last dimension has size `vocab_size`, holding the
            output of the linear layer applied to the looked-up embeddings.
        """
        return self.output_layer(self.embeddings(center_word))

In [25]:
# 5. Train the model
# Seed PyTorch's RNG before the model is constructed: the embedding and linear
# weights are randomly initialised, so without a fixed seed every
# "Restart & Run All" yields different losses and word vectors.
torch.manual_seed(42)

embedding_dim = 10  # length of each learned word vector
model = SkipGram(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)
# CrossEntropyLoss takes the raw scores from the model together with the
# target class index (here: the context word index).
criterion = nn.CrossEntropyLoss()

In [26]:
num_epochs = 100

# Build the index tensors once, up front. The (center, context) pairs never
# change between epochs, so re-creating two tensors per pair on every epoch
# was pure loop-invariant overhead.
training_tensors = [
    (
        torch.tensor([center], dtype=torch.long),
        torch.tensor([context], dtype=torch.long),
    )
    for center, context in training_data
]

for epoch in range(num_epochs):
    # Sum (not mean) of the per-pair losses seen during this epoch.
    total_loss = 0
    # One optimizer step per training pair, visited in the stored order.
    for center_tensor, context_tensor in training_tensors:
        optimizer.zero_grad()
        output = model(center_tensor)
        loss = criterion(output, context_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report progress every 20 epochs.
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

# 6. Print the learned word vectors
# Use detach() rather than the legacy `.data` attribute to get the weights
# without autograd tracking, and move them to the CPU before converting so
# the conversion also works if the model lives on another device.
word_vectors = model.embeddings.weight.detach().cpu().numpy()
for word, idx in vocab.items():
    print(f"Word: {word}, Vector: {word_vectors[idx]}")

Epoch 20/100, Loss: 92.1898
Epoch 40/100, Loss: 91.0246
Epoch 60/100, Loss: 90.6198
Epoch 80/100, Loss: 90.3839
Epoch 100/100, Loss: 90.2198
Word: the, Vector: [-0.7219838  -0.06475896 -0.7327847  -0.08595231 -1.6121358  -0.08662665
 -0.23153545  0.04500464  0.4962679  -1.0625919 ]
Word: cat, Vector: [ 1.213518    2.2492442   1.3218019   0.62236327 -0.0303225   0.94328797
  0.08740252 -0.7029813   0.8023205  -0.09477841]
Word: sat, Vector: [ 2.9422662  -0.27853212 -0.46484956  0.41288215 -0.67734915 -0.04913354
  1.574669   -0.49794376  0.3392137   0.6131823 ]
Word: on, Vector: [ 0.92557365 -0.8099594  -0.17121041 -0.52117527  2.0382535   0.34503347
 -1.3556825  -0.7853026   0.36183363 -0.10263178]
Word: mat, Vector: [ 0.87801206 -0.47459403  1.5125083   3.232189    1.2623043   0.11753491
  2.4696615   1.2696218   0.35343483  0.92886215]
Word: dog, Vector: [-0.39919415  1.8964784   0.9231417   0.9305452   0.45523068 -0.6378382
 -0.15911524  0.02091936 -0.26834136  1.16201   ]
Word: log