In [1]:
# Item2Vec是一种基于Word2Vec的物品嵌入学习方法，主要用于推荐系统中学习物品的低维向量表示。
# 核心思想是将物品看作单词，用户的交互序列看作句子，然后使用Word2Vec来学习物品之间的关系。

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# 1. Simulated user interaction data.
#    Each inner list is one interaction sequence (treated as a "sentence"),
#    and each string in it is an item (treated as a "word").
corpus = [
    ["iPhone", "MacBook", "AirPods"],
    ["PS5", "Switch", "Xbox"],
    ["MacBook", "iPad", "Magic Keyboard"],
    ["iPhone", "iPad", "Apple Watch"],
    ["Xbox", "GamePass", "Halo"]
]

In [4]:
# 2. Build the vocabulary.
# dict.fromkeys keeps only the first occurrence of every item while preserving
# the order in which items are first seen across the corpus, so each item gets
# the same integer id as with a manual "assign next free index" loop.
vocabulary = list(dict.fromkeys(item for session in corpus for item in session))

word2idx = {item: position for position, item in enumerate(vocabulary)}  # item -> id
idx2word = dict(enumerate(vocabulary))                                   # id -> item
idx = len(vocabulary)  # next free id (one past the last assigned index)

vocab_size = len(word2idx)
window_size = 1  # only directly adjacent items count as context

In [5]:
# 3. Generate Skip-gram training pairs.
# For every position in every sequence, pair the item at that position (center)
# with each item at most `window_size` steps away (context).
skipgram_pairs = []
for sentence in corpus:
    indices = [word2idx[word] for word in sentence]
    sequence_length = len(indices)
    for center_idx, center_word in enumerate(indices):
        # Clamp the window to the sequence bounds instead of testing each offset.
        window_start = max(0, center_idx - window_size)
        window_stop = min(sequence_length, center_idx + window_size + 1)
        for context_idx in range(window_start, window_stop):
            if context_idx != center_idx:  # the center is never its own context
                skipgram_pairs.append((center_word, indices[context_idx]))  # (center, context)

training_data = torch.tensor(skipgram_pairs)
training_data

tensor([[ 0,  1],
        [ 1,  0],
        [ 1,  2],
        [ 2,  1],
        [ 3,  4],
        [ 4,  3],
        [ 4,  5],
        [ 5,  4],
        [ 1,  6],
        [ 6,  1],
        [ 6,  7],
        [ 7,  6],
        [ 0,  6],
        [ 6,  0],
        [ 6,  8],
        [ 8,  6],
        [ 5,  9],
        [ 9,  5],
        [ 9, 10],
        [10,  9]])

In [6]:
# 4. Define the Item2Vec (Skip-gram) model
class Item2Vec(nn.Module):
    """Skip-gram model with separate center ("input") and context ("output") embeddings.

    Args:
        vocab_size: Number of distinct items (rows of each embedding table).
        embedding_dim: Size of each item vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Center-item embedding table, shape (vocab_size, embedding_dim).
        self.WI = nn.Embedding(vocab_size, embedding_dim)
        # Context-item embedding table, shape (vocab_size, embedding_dim).
        self.WO = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_idx):
        """Return unnormalized scores of every vocabulary item as context.

        Args:
            center_idx: Integer tensor of center-item ids.

        Returns:
            Dot products between each center vector and all context vectors;
            for a (batch_size,) input the result is (batch_size, vocab_size).
        """
        center_vectors = self.WI(center_idx)
        return center_vectors @ self.WO.weight.T

embedding_dim = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device. Without this, the parameters stay on
# the CPU while the training inputs are sent to `device`, which raises a
# device-mismatch error as soon as CUDA is available.
model = Item2Vec(vocab_size, embedding_dim).to(device)
# Build the optimizer after the move so it tracks the on-device parameters.
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Cross-entropy over the vocabulary-sized score vector (full softmax).
criterion = nn.CrossEntropyLoss()

In [7]:
# 5. Train the model
num_epochs = 100

# Feed inputs on whatever device the model's parameters actually live on, so
# inputs and weights can never end up on different devices.
model_device = next(model.parameters()).device
# Move all (center, context) pairs once instead of once per sample per epoch.
pairs_on_device = training_data.to(model_device)

for epoch in range(num_epochs):
    total_loss = 0
    for center, context in pairs_on_device:
        optimizer.zero_grad()
        scores = model(center.unsqueeze(0))  # add a batch dimension
        loss = criterion(scores, context.unsqueeze(0))  # target is the context item id
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    if epoch % 10 == 0:
        # Reported value is the sum of per-pair losses over the epoch.
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# Print the learned center-item vectors
print("\nLearned word vectors:")
for word, idx in word2idx.items():
    # .cpu() is required before .numpy() when the weights are on a GPU.
    print(f"{word}: {model.WI.weight[idx].detach().cpu().numpy()}")


Epoch 0, Loss: 101.7523
Epoch 10, Loss: 21.5140
Epoch 20, Loss: 16.8474
Epoch 30, Loss: 16.1054
Epoch 40, Loss: 15.8451
Epoch 50, Loss: 15.7187
Epoch 60, Loss: 15.6448
Epoch 70, Loss: 15.5957
Epoch 80, Loss: 15.5602
Epoch 90, Loss: 15.5326

Learned word vectors:
iPhone: [ 0.10257793  2.2753115   1.089999    1.1591762   0.78177196 -0.73553616
  0.02159385 -0.02105704 -0.20757978  1.1321979 ]
MacBook: [ 1.0998651  -0.428761   -0.7472727  -0.02003135  1.7789153   0.68190455
  1.84901    -0.5654099  -0.37160477 -0.14102161]
AirPods: [-1.392579    0.07051965  1.8836502   1.2164317   0.73689806  0.5570067
  0.13493632 -2.1850996  -1.3617554   0.5889098 ]
PS5: [-0.1674017   1.8469607   2.5034945  -2.3784306  -0.8938814   0.19561696
  0.5336491   2.4609773  -0.68164474 -0.14437471]
Switch: [-0.15312459  1.8753009  -0.57060003 -1.8458759  -1.3570073   0.30829924
 -1.2800428  -1.4402593   0.29729474 -0.29711217]
Xbox: [ 1.5762222   2.3378627   0.2958379  -0.59735054  1.2043834   0.01723105
 -0.4

In [8]:
def get_similar_items(item, top_k=5):
    """Return the items whose center embeddings are most similar to `item`.

    Similarity is the cosine similarity between rows of `model.WI.weight`.

    Args:
        item: Name of the query item; looked up in `word2idx`.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of `(other_item, similarity)` tuples sorted by descending
        similarity, excluding `item` itself. If `item` is not in `word2idx`,
        a message string is returned instead, so callers should check the type.
    """
    if item not in word2idx:
        return f"Item '{item}' not found in the vocabulary."

    # Pull the embedding table out once; .cpu() makes this work when the model
    # is on a GPU, where .numpy() would otherwise raise.
    embeddings = model.WI.weight.detach().cpu().numpy()
    item_vector = embeddings[word2idx[item]].reshape(1, -1)

    # One call scores the query against every item instead of one call per item.
    scores = cosine_similarity(item_vector, embeddings)[0]

    similarities = {
        other_item: scores[other_idx]
        for other_item, other_idx in word2idx.items()
        if other_item != item
    }

    similar_items = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return similar_items

# Try a few items
test_items = ["iPhone", "PS5", "MacBook"]
for item in test_items:
    print(f"Items similar to '{item}':")
    similar_items = get_similar_items(item)
    if isinstance(similar_items, str):
        # get_similar_items returns a message string for unknown items;
        # unpacking it as (item, similarity) pairs would raise ValueError.
        print(f"  {similar_items}")
    else:
        for similar_item, similarity in similar_items:
            print(f"  {similar_item}: {similarity:.4f}")
    print()

Items similar to 'iPhone':
  Xbox: 0.3999
  AirPods: 0.3779
  PS5: 0.2122
  Apple Watch: 0.1910
  iPad: 0.1637

Items similar to 'PS5':
  Xbox: 0.5001
  iPhone: 0.2122
  Switch: 0.1921
  GamePass: 0.0297
  Magic Keyboard: -0.0643

Items similar to 'MacBook':
  iPad: 0.3937
  Apple Watch: 0.2109
  Magic Keyboard: 0.1723
  Xbox: 0.1485
  AirPods: 0.0510

