In [21]:
# 假设我们有以下电影评论
reviews = [
    "The movie was fantastic!",
    "I loved the plot and the acting.",
    "The film was a disaster.",
    "The worst movie I've ever seen."
]

# 分词
tokenized_reviews = [review.split() for review in reviews]

# 构建词汇表
vocab = set(word for sentence in tokenized_reviews for word in sentence)
vocab_size = len(vocab)
print("构建的词汇表大小：", vocab_size)
print(vocab)

构建的词汇表大小： 17
{'plot', 'loved', 'film', 'a', 'I', 'disaster.', 'seen.', 'fantastic!', 'was', 'movie', 'worst', "I've", 'ever', 'The', 'and', 'acting.', 'the'}


In [22]:
import torch
import torch.nn as nn

embedding_dim = 5

# 创建嵌入层
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

In [23]:
# 将词汇表中的单词映射到索引 ( 哈希表 )
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
print("把词汇表中的单词映射到索引 :")
print(word_to_idx)

# 将一个评论转换为索引序列
def text_to_indices(text):
    return [word_to_idx[word] for word in text.split()]

# 将索引序列转换为嵌入向量
def get_embedding(text):
    indices = text_to_indices(text)
    print("转化评论为向量组 :")
    print("评论 : ", text)
    print("索引序列 : ", indices)

    indices_tensor = torch.tensor(indices, dtype=torch.long)
    print("转化序列为张量 : ", indices_tensor.shape)

    # 获取到了代表关联程度的嵌入向量
    embedding_vector = embedding(indices_tensor)

    return embedding_vector

# 测试一个评论
test_review = "The movie was fantastic!"
embedding_vector = get_embedding(test_review)
# 4 个单词，每个单词对应一个 5 维的嵌入向量
print("嵌入向量 : ", embedding_vector)

把词汇表中的单词映射到索引 :
{'plot': 0, 'loved': 1, 'film': 2, 'a': 3, 'I': 4, 'disaster.': 5, 'seen.': 6, 'fantastic!': 7, 'was': 8, 'movie': 9, 'worst': 10, "I've": 11, 'ever': 12, 'The': 13, 'and': 14, 'acting.': 15, 'the': 16}
转化评论为向量组 :
评论 :  The movie was fantastic!
索引序列 :  [13, 9, 8, 7]
转化序列为张量 :  torch.Size([4])
嵌入向量 :  tensor([[-1.2346,  1.1077, -0.4591, -1.4099, -0.0587],
        [-0.5881,  0.1115, -0.0706,  1.9341, -1.1581],
        [-1.0547,  2.1699,  1.5870, -0.8900, -0.4487],
        [-0.3993, -1.9752,  0.1455,  0.8750, -0.1282]],
       grad_fn=<EmbeddingBackward0>)
