In [1]:
# Skip-gram算法实现

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# 超参数
embedding_dim = 10  # 词向量维度
learning_rate = 0.01
num_epochs = 100

In [3]:
# 构造简单的词汇表和数据
corpus = ["we like deep learning", "we love NLP", "word embedding is fun"]

In [4]:
# 构造词汇表
word_list = set(" ".join(corpus).split())  # 去重后的单词集合
word2idx = {word: idx for idx, word in enumerate(word_list)} # 单词到索引的映射
idx2word = {idx: word for word, idx in word2idx.items()} # 索引到单词的映射
vocab_size = len(word2idx) # 词汇表大小

In [5]:
# 生成 Skip-gram 训练数据
window_size = 2
skip_gram_data = []

for sentence in corpus:
    words = sentence.split()
    for idx, center_word in enumerate(words):
        for offset in range(-window_size, window_size + 1):
            if offset == 0 or idx + offset < 0 or idx + offset >= len(words):
                continue
            context_word = words[idx + offset]
            skip_gram_data.append((word2idx[center_word], word2idx[context_word])) # 格式为(中心词索引, 上下文词索引)

# 转换为 Tensor
skip_gram_data = torch.tensor(skip_gram_data)
skip_gram_data

tensor([[0, 4],
        [0, 6],
        [4, 0],
        [4, 6],
        [4, 8],
        [6, 0],
        [6, 4],
        [6, 8],
        [8, 4],
        [8, 6],
        [0, 9],
        [0, 1],
        [9, 0],
        [9, 1],
        [1, 0],
        [1, 9],
        [5, 2],
        [5, 7],
        [2, 5],
        [2, 7],
        [2, 3],
        [7, 5],
        [7, 2],
        [7, 3],
        [3, 2],
        [3, 7]])

In [6]:
# 定义 Skip-gram 模型
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.WI = nn.Embedding(vocab_size, embedding_dim)  # 中心词向量空间 (V x N)
        self.WO = nn.Embedding(vocab_size, embedding_dim)  # 上下文词向量空间 (V x N)

    def forward(self, center_idx):
        v_c = self.WI(center_idx)  # (batch_size, embedding_dim)
        scores = torch.matmul(v_c, self.WO.weight.T)  # (batch_size, vocab_size)
        return scores
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SkipGram(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [7]:
# 开始训练
for epoch in range(num_epochs):
    total_loss = 0
    for center, context in skip_gram_data:
        center, context = center.to(device), context.to(device)

        optimizer.zero_grad()
        scores = model(center.unsqueeze(0))  # 增加 batch 维度
        loss = criterion(scores, context.unsqueeze(0))  # 计算损失
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# 打印训练后的词向量
print("\nLearned word vectors:")
for word, idx in word2idx.items():
    print(f"{word}: {model.WI.weight[idx].detach().numpy()}")

Epoch 0, Loss: 164.4450
Epoch 10, Loss: 37.0246
Epoch 20, Loss: 29.0041
Epoch 30, Loss: 27.7177
Epoch 40, Loss: 27.2980
Epoch 50, Loss: 27.1040
Epoch 60, Loss: 26.9948
Epoch 70, Loss: 26.9253
Epoch 80, Loss: 26.8769
Epoch 90, Loss: 26.8411

Learned word vectors:
we: [-0.39432     0.8618684   0.9984683   1.9259332   0.47502545 -0.37072268
 -1.118236    0.03735328 -0.27072257 -2.4090092 ]
NLP: [-1.4238187  -1.2575887   0.71602035 -0.04907556  2.4831998  -0.80715996
 -0.54783607 -0.2558084  -1.4790802   0.27846402]
embedding: [-0.07708196  0.52656186  0.3533564  -1.1550801  -2.9387343   0.25311458
  0.2802841  -0.01401285 -0.71019435  0.07720388]
fun: [ 0.24812387  0.19299272  0.41594127 -2.0790749   0.13417944 -0.10976417
 -2.0473742  -1.6911982  -0.5540867   0.73592556]
like: [ 0.09149608 -1.3952626   0.36846802  2.6559212   0.3489073   0.9896139
  0.7166462  -1.6571385  -1.2075752   0.23042937]
word: [ 0.5310363  -0.38897008  0.49036792 -0.65369684 -1.6652849  -0.24581239
 -1.5820827  

In [8]:
# 获取中心词的索引
center_word = 'we'
center_idx = word2idx[center_word]
center_tensor = torch.tensor([center_idx]).to(device)

# 使用模型进行预测
with torch.no_grad():
    scores = model(center_tensor)
    predicted_context_idx = torch.argmax(scores, dim=1).item()
    predicted_context_word = idx2word[predicted_context_idx]

print(f'Given center word: {center_word}')
print(f'Predicted context word: {predicted_context_word}')

Given center word: we
Predicted context word: NLP


In [9]:
# CBOW算法实现

In [10]:
# 语料库
corpus = ["The cat sits on the mat"]
words = corpus[0].split()  # 分词
word2idx = {word: i for i, word in enumerate(set(words))}  # 词汇表
idx2word = {i: word for word, i in word2idx.items()}  # 词汇表

In [11]:
# 生成 CBOW 训练数据
window_size = 2
cbow_data = []

for sentence in corpus:
    words = sentence.split()
    for idx, center_word in enumerate(words):
        context_words = []
        for offset in range(-window_size, window_size + 1):
            if offset == 0 or idx + offset < 0 or idx + offset >= len(words):
                continue
            context_words.append(word2idx[words[idx + offset]])  # 记录上下文词索引
        if context_words:  # 避免空列表
            cbow_data.append((context_words, word2idx[center_word]))  # (上下文词索引列表, 目标词索引)

# 转换为 Tensor
cbow_data = [(torch.tensor(context), torch.tensor(target)) for context, target in cbow_data]
cbow_data

[(tensor([2, 4]), tensor(0)),
 (tensor([0, 4, 1]), tensor(2)),
 (tensor([0, 2, 1, 5]), tensor(4)),
 (tensor([2, 4, 5, 3]), tensor(1)),
 (tensor([4, 1, 3]), tensor(5)),
 (tensor([1, 5]), tensor(3))]

In [12]:
# 定义 CBOW 模型
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.WI = nn.Embedding(vocab_size, embedding_dim)  # 上下文词向量空间 (V x N)
        self.WO = nn.Embedding(vocab_size, embedding_dim)  # 中心词向量空间 (V x N)

    def forward(self, context_idxs):
        context_embeddings = self.WI(context_idxs)
        average_emb = torch.mean(context_embeddings, dim=0, keepdim=True)
        scores = torch.matmul(average_emb, self.WO.weight.T)
        return scores
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CBOW(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [13]:
# 开始训练
for epoch in range(num_epochs):
    total_loss = 0
    for context, center in cbow_data:
        context, center = context.to(device), center.to(device)

        optimizer.zero_grad()
        scores = model(context)  # 增加 batch 维度
        loss = criterion(scores, center.unsqueeze(0))  # 计算损失
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# 打印训练后的词向量
print("\nLearned word vectors:")
for word, idx in word2idx.items():
    print(f"{word}: {model.WI.weight[idx].detach().numpy()}")

Epoch 0, Loss: 13.0367
Epoch 10, Loss: 5.9836
Epoch 20, Loss: 3.2476
Epoch 30, Loss: 1.6222
Epoch 40, Loss: 0.8533
Epoch 50, Loss: 0.5131
Epoch 60, Loss: 0.3419
Epoch 70, Loss: 0.2445
Epoch 80, Loss: 0.1838
Epoch 90, Loss: 0.1433

Learned word vectors:
The: [ 0.8771341  0.6165877 -1.2195442  0.0255321 -1.1571873  3.2178388
 -1.4898465  0.1290686 -0.4132271  1.014745 ]
on: [ 0.04589905  0.24310425 -0.11267089 -0.24023066 -2.347304   -0.18171461
 -1.3725036  -0.13852957  0.8322103  -0.47539952]
cat: [ 0.27115706  1.0187441   0.611658   -1.5865067   0.87147933  0.5577066
  0.62968713 -0.3058953   0.62642515  0.26221675]
mat: [-1.1161038  -0.7870448   1.2830974   2.3960016  -0.24700347  2.0023594
 -1.1109082  -2.7715046  -0.09689652  1.1125424 ]
sits: [-0.89836186  0.99905235  0.7934982  -0.5487706   0.22069451  0.47899535
  0.6576656  -1.7366772  -2.9801214  -0.66734016]
the: [ 2.212858   -1.3035347   1.319773    1.3719721   0.6784411  -0.11297243
  1.7667874   2.1787608  -1.1562164   1.2

In [14]:
# 获取上下文词的索引
context_words = ['sits', 'on']
context_idxs = [word2idx[word] for word in context_words]
context_tensor = torch.tensor([context_idxs]).to(device)

# 使用模型进行预测
with torch.no_grad():
    scores = model(context_tensor.squeeze())
    predicted_center_idx = torch.argmax(scores, dim=1).item()
    predicted_center_word = idx2word[predicted_center_idx]

print(f'Given context words: {context_words}')
print(f'Predicted center word: {predicted_center_word}')

Given context words: ['sits', 'on']
Predicted center word: cat
