In [2]:
## 词表

from collections import defaultdict, Counter

class Vocab:
    def __init__(self, tokens=None):
        self.idx_to_token = list()
        self.token_to_idx = dict()

        if tokens is not None:
            if "<unk>" not in tokens:
                tokens = tokens + ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx['<unk>']

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items() \
                        if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, token):
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]


def save_vocab(vocab, path):
    with open(path, 'w') as writer:
        writer.write("\n".join(vocab.idx_to_token))


def read_vocab(path):
    with open(path, 'r') as f:
        tokens = f.read().split('\n')
    return Vocab(tokens)

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader

# Constants
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"
BOW_TOKEN = "<bow>"
EOW_TOKEN = "<eow>"

WEIGHT_INIT_RANGE = 0.1

def load_reuters():
    from nltk.corpus import reuters
    text = reuters.sents()
    # lowercase (optional)
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]

    return corpus, vocab

def save_pretrained(vocab, embeds, save_path):
    """
    Save pretrained token vectors in a unified format, where the first line
    specifies the `number_of_tokens` and `embedding_dim` followed with all
    token vectors, one token per line.
    """
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")
    print(f"Pretrained embeddings saved to: {save_path}")

def load_pretrained(load_path):
    with open(load_path, "r") as fin:
        # Optional: depending on the specific format of pretrained vector file
        n, d = map(int, fin.readline().split())
        tokens = []
        embeds = []
        for line in fin:
            line = line.rstrip().split(' ')
            token, embed = line[0], list(map(float, line[1:]))
            tokens.append(token)
            embeds.append(embed)
        vocab = Vocab(tokens)
        embeds = torch.tensor(embeds, dtype=torch.float)
    return vocab, embeds

def get_loader(dataset, batch_size, shuffle=True):
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate_fn,
        shuffle=shuffle
    )
    return data_loader

def init_weights(model):
    for name, param in model.named_parameters():
        if "embedding" not in name:
            torch.nn.init.uniform_(
                param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE
            )

## CBOW模型

In [9]:
## 数据
class CbowDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]

            # 如果句子长度不足以构建（上下文+目标词）训练样本，则跳过
            if len(sentence) < context_size * 2 + 1:
                continue
            
            for i in range(context_size, len(sentence) - context_size):
                # 模型输入：左右各分别取context_size长度的上下文
                context = sentence[i-context_size:i] + sentence[i+1:i+context_size+1]
                # 模型输出：当前词
                target = sentence[i]
                self.data.append((context, target))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        inputs = torch.tensor([ex[0] for ex in examples])
        targets = torch.tensor([ex[1] for ex in examples])
        return (inputs, targets)

In [10]:
## 模型
class CbowModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CbowModel, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim) # 词向量层
        self.output = nn.Linear(embedding_dim, vocab_size, bias=False) # 输出层
    
    
    def forward(self, inputs):
        
        embeds = self.embeddings(inputs)
        hidden = embeds.mean(dim=1) # 计算隐含层：对上下文词向量求平均
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

In [11]:
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10

# 读取文本数据，构建CBOW模型训练数据集
corpus, vocab = load_reuters()
dataset = CbowDataset(corpus, vocab, context_size=context_size)
data_loader = get_loader(dataset, batch_size)

nll_loss = nn.NLLLoss() # torch.nn.CrossEntropyLoss相当于softmax + log + nllloss
# 构建CBOW模型，并加载至device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CbowModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# 保存词向量（model.embeddings）
save_pretrained(vocab, model.embeddings.weight.data, "cbow.vec")

Dataset Construction:   0%|          | 0/54711 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 10278.96


Training Epoch 1:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 8293.46


Training Epoch 2:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 7738.86


Training Epoch 3:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 7404.24


Training Epoch 4:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 7165.17


Training Epoch 5:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 6978.62


Training Epoch 6:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 6825.71


Training Epoch 7:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 6696.30


Training Epoch 8:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 6584.16


Training Epoch 9:   0%|          | 0/1574 [00:00<?, ?it/s]

Loss: 6485.60
Pretrained embeddings saved to: cbow.vec


## SKip-gram模型

In [12]:
class SkipGramDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence)-1):
                # 模型输入：当前词
                w = sentence[i]
                # 模型输出：一定窗口大小内的上下文
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1]
                self.data.extend([(w, c) for c in context])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        inputs = torch.tensor([ex[0] for ex in examples])
        targets = torch.tensor([ex[1] for ex in examples])
        return (inputs, targets)

In [13]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)
        init_weights(self)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        output = self.output(embeds)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

In [14]:
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10

# 读取文本数据，构建Skip-gram模型训练数据集
corpus, vocab = load_reuters()
dataset = SkipGramDataset(corpus, vocab, context_size=context_size)
data_loader = get_loader(dataset, batch_size)

nll_loss = nn.NLLLoss()
# 构建Skip-gram模型，并加载至device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SkipGramModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# 保存词向量（model.embeddings）
save_pretrained(vocab, model.embeddings.weight.data, "skipgram.vec")

Dataset Construction:   0%|          | 0/54711 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 43176.32


Training Epoch 1:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 40015.46


Training Epoch 2:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 39360.70


Training Epoch 3:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 38967.35


Training Epoch 4:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 38692.05


Training Epoch 5:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 38484.48


Training Epoch 6:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 38317.85


Training Epoch 7:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 38181.46


Training Epoch 8:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 38065.71


Training Epoch 9:   0%|          | 0/6616 [00:00<?, ?it/s]

Loss: 37965.89
Pretrained embeddings saved to: skipgram.vec


## 基于负采样的Skip-gram模型

In [15]:
class SGNSDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2, n_negatives=5, ns_dist=None):

        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]

        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence) - 1):
                # 模型输入：(w, context) ；输出为0/1，表示context是否为负样本
                w = sentence[i]
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1]
                context += [self.pad] * (2 * context_size - len(context))
                self.data.append((w, context))

        # 负样本数量
        self.n_negatives = n_negatives
        # 负采样分布：若参数ns_dist为None，则使用uniform分布
        self.ns_dist = ns_dist if ns_dist is not None else torch.ones(len(vocab))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        words = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
        contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
        batch_size, context_size = contexts.shape
        neg_contexts = []

        # 对batch内的样本分别进行负采样
        for i in range(batch_size):
            # 保证负样本不包含当前样本中的context
            ns_dist = self.ns_dist.index_fill(0, contexts[i], .0) # 填0防止抽到自己
            neg_contexts.append(torch.multinomial(ns_dist, self.n_negatives * context_size, replacement=True)) # 有放回的采样，input中值为0的元素永远不会被抽到
        neg_contexts = torch.stack(neg_contexts, dim=0) # list包tensor，变为tensor包list
        return words, contexts, neg_contexts

In [16]:
class SGNSModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SGNSModel, self).__init__()
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim) # 词嵌入
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim) # 上下文嵌入

    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        return w_embeds

    def forward_c(self, contexts):
        c_embeds = self.c_embeddings(contexts)
        return c_embeds

In [17]:
def get_unigram_distribution(corpus, vocab_size):
    # 从给定语料中统计unigram概率分布
    token_counts = torch.tensor([0] * vocab_size)
    total_count = 0
    for sentence in corpus:
        total_count += len(sentence)
        for token in sentence:
            token_counts[token] += 1
    unigram_dist = torch.div(token_counts.float(), total_count) # torch.div 张量和标量做逐元素除法
    return unigram_dist

In [20]:
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10
n_negatives = 10

# 读取文本数据
corpus, vocab = load_reuters()
# 计算unigram概率分布
unigram_dist = get_unigram_distribution(corpus, len(vocab))

In [21]:
# 根据unigram分布计算负采样分布: p(w)**0.75
negative_sampling_dist = unigram_dist ** 0.75
negative_sampling_dist /= negative_sampling_dist.sum()
# 构建SGNS训练数据集
dataset = SGNSDataset(
    corpus,
    vocab,
    context_size=context_size,
    n_negatives=n_negatives,
    ns_dist=negative_sampling_dist
)
data_loader = get_loader(dataset, batch_size)

Dataset Construction:   0%|          | 0/54711 [00:00<?, ?it/s]

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SGNSModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, neg_contexts = [x.to(device) for x in batch]
        optimizer.zero_grad()
        batch_size = words.shape[0]
        # 提取batch内词、上下文以及负样本的向量表示
        word_embeds = model.forward_w(words).unsqueeze(dim=2)
        context_embeds = model.forward_c(contexts)
        neg_context_embeds = model.forward_c(neg_contexts)
        # 正样本的分类（对数）似然
        context_loss = F.logsigmoid(torch.bmm(context_embeds, word_embeds).squeeze(dim=2))
        context_loss = context_loss.mean(dim=1)
        # 负样本的分类（对数）似然
        neg_context_loss = F.logsigmoid(torch.bmm(neg_context_embeds, word_embeds).squeeze(dim=2).neg())
        neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2)
        neg_context_loss = neg_context_loss.mean(dim=1)
        # 损失：负对数似然
        loss = -(context_loss + neg_context_loss).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# 合并词嵌入矩阵与上下文嵌入矩阵，作为最终的预训练词向量
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "sgns.vec")

Training Epoch 0:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 32961.16


Training Epoch 1:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 11422.06


Training Epoch 2:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 7378.01


Training Epoch 3:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 6034.89


Training Epoch 4:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 5376.46


Training Epoch 5:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 4983.83


Training Epoch 6:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 4725.54


Training Epoch 7:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 4541.41


Training Epoch 8:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 4403.10


Training Epoch 9:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 4297.14
Pretrained embeddings saved to: sgns.vec


## GloVe词向量

In [4]:
## 数据
class GloveDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        # 记录词与上下文在给定语料中的共现次数
        self.cooccur_counts = defaultdict(float)
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]

            for i in range(1, len(sentence)-1):
                w = sentence[i]
                left_contexts = sentence[max(0, i - context_size):i]
                right_contexts = sentence[i+1:min(len(sentence), i + context_size)+1]
                # 共现次数随距离衰减: 1/d(w, c)
                for k, c in enumerate(left_contexts[::-1]):
                    self.cooccur_counts[(w, c)] += 1 / (k + 1) # k + 1距中心词的距离
                for k, c in enumerate(right_contexts):
                    self.cooccur_counts[(w, c)] += 1 / (k + 1)
        self.data = [(w, c, count) for (w, c), count in self.cooccur_counts.items()]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        words = torch.tensor([ex[0] for ex in examples])
        contexts = torch.tensor([ex[1] for ex in examples])
        counts = torch.tensor([ex[2] for ex in examples])
        return (words, contexts, counts)

In [25]:
## 模型
class GloveModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(GloveModel, self).__init__()
        # 词嵌入及偏置向量
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.w_biases = nn.Embedding(vocab_size, 1)
        # 上下文嵌入及偏置向量
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.c_biases = nn.Embedding(vocab_size, 1)

    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        w_biases = self.w_biases(words)
        return w_embeds, w_biases

    def forward_c(self, contexts):
        c_embeds = self.c_embeddings(contexts)
        c_biases = self.c_biases(contexts)
        return c_embeds, c_biases

In [6]:
embedding_dim = 64
context_size = 2
batch_size = 1024
num_epoch = 10

# 用以控制样本权重的超参数
m_max = 100
alpha = 0.75
# 从文本数据中构建GloVe训练数据集
corpus, vocab = load_reuters()
dataset = GloveDataset(
    corpus,
    vocab,
    context_size=context_size
)

Dataset Construction:   0%|          | 0/54711 [00:00<?, ?it/s]

In [26]:
data_loader = get_loader(dataset, batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GloveModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, counts = [x.to(device) for x in batch]
        # 提取batch内词、上下文的向量表示及偏置
        word_embeds, word_biases = model.forward_w(words)
        context_embeds, context_biases = model.forward_c(contexts)
        # 回归目标值：必要时可以使用log(counts+1)进行平滑
        log_counts = torch.log(counts)
        # 样本权重
        weight_factor = torch.clamp(torch.pow(counts / m_max, alpha), max=1.0) # clamp 将张量每个元素的范围限制到区间 [min,max]
        optimizer.zero_grad()
        # 计算batch内每个样本的L2损失
        loss = (torch.sum(word_embeds * context_embeds, dim=1) + word_biases + context_biases - log_counts) ** 2
        # 样本加权损失
        wavg_loss = (weight_factor * loss).mean()
        wavg_loss.backward()
        optimizer.step()
        total_loss += wavg_loss.item()
    print(f"Loss: {total_loss:.2f}")

# 合并词嵌入矩阵与上下文嵌入矩阵，作为最终的预训练词向量
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "glove.vec")

Dataset Construction:   0%|          | 0/54711 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 4331.81


Training Epoch 1:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 2996.89


Training Epoch 2:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 2130.97


Training Epoch 3:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 1539.97


Training Epoch 4:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 1128.91


Training Epoch 5:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 841.31


Training Epoch 6:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 638.89


Training Epoch 7:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 494.56


Training Epoch 8:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 388.29


Training Epoch 9:   0%|          | 0/1333 [00:00<?, ?it/s]

Loss: 306.14
Pretrained embeddings saved to: glove.vec


## 词义相关性

In [7]:
# 余弦相似度
def knn(W, x, k):
    # 计算查询向量x与矩阵W中每个行向量之间的余弦相似度
    # 返回相似度最高的k个向量
    similarities = torch.matmul(x, W.transpose(1, 0)) / (torch.norm(W, dim=1) * torch.norm(x) + 1e-9) # norm求范数，默认二范数
    knn = similarities.topk(k=k)
    return knn.values.tolist(), knn.indices.tolist()

# 近义词检索
def find_similar_words(embeds, vocab, query, k=5):
    # 查询词也位于词向量空间中，它与自己的相似度最高(1.0)
    # 取 k+1 个近邻
    knn_values, knn_indices = knn(embeds, embeds[vocab[query]], k + 1)
    knn_words = vocab.convert_ids_to_tokens(knn_indices)
    print(f">>> Query word: {query}")
    for i in range(k):
        print(f"cosine similarity={knn_values[i + 1]:.4f}: {knn_words[i + 1]}")

# 使用glove
word_sim_queries = ["china", "august", "good", "paris"]
vocab, embeds = load_pretrained("glove.vec")
for w in word_sim_queries:
    find_similar_words(embeds, vocab, w)

>>> Query word: china
cosine similarity=0.5030: recognized
cosine similarity=0.4882: remodelling
cosine similarity=0.4292: servotronics
cosine similarity=0.4205: euromarket
cosine similarity=0.4203: calmed
>>> Query word: august
cosine similarity=0.4885: trasaction
cosine similarity=0.4709: withdrfawal
cosine similarity=0.4534: fair
cosine similarity=0.4355: 23rd
cosine similarity=0.4251: maxtec
>>> Query word: good
cosine similarity=0.4810: corrientes
cosine similarity=0.4663: ames
cosine similarity=0.4612: peanut
cosine similarity=0.4582: scenes
cosine similarity=0.4505: pdvsa
>>> Query word: paris
cosine similarity=0.4307: au
cosine similarity=0.4296: inn
cosine similarity=0.4291: transcontinental
cosine similarity=0.4253: kakuei
cosine similarity=0.4238: dreyer


## 类比性

In [8]:
def find_analogy(embeds, vocab, word_a, word_b, word_c):
    vecs = embeds[vocab.convert_tokens_to_ids([word_a, word_b, word_c])]
    x = vecs[2] + vecs[1] - vecs[0]
    knn_values, knn_indices = knn(embeds, x, k=1)
    analogies = vocab.convert_ids_to_tokens(knn_indices)
    print(f">>> Query: {word_a}, {word_b}, {word_c}")
    print(f"{analogies}")

word_analogy_queries = [["brother", "sister", "man"],
                        ["paris", "france", "berlin"]]
vocab, embeds = load_pretrained("glove.vec")
for w_a, w_b, w_c in word_analogy_queries:
    find_analogy(embeds, vocab, w_a, w_b, w_c)

>>> Query: brother, sister, man
['sister']
>>> Query: paris, france, berlin
['berlin']


## Fine-tuning

In [9]:
def load_sentence_polarity():
    from nltk.corpus import sentence_polarity

    vocab = Vocab.build(sentence_polarity.sents())

    train_data = [(vocab.convert_tokens_to_ids(sentence), 0)
                  for sentence in sentence_polarity.sents(categories='pos')[:4000]] \
        + [(vocab.convert_tokens_to_ids(sentence), 1)
            for sentence in sentence_polarity.sents(categories='neg')[:4000]]

    test_data = [(vocab.convert_tokens_to_ids(sentence), 0)
                 for sentence in sentence_polarity.sents(categories='pos')[4000:]] \
        + [(vocab.convert_tokens_to_ids(sentence), 1)
            for sentence in sentence_polarity.sents(categories='neg')[4000:]]

    return train_data, test_data, vocab

In [16]:
class BowDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        return self.data[i]

def collate_fn(examples):
    inputs = [torch.tensor(ex[0]) for ex in examples]
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    offsets = [0] + [i.shape[0] for i in inputs]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    inputs = torch.cat(inputs)
    return inputs, offsets, targets

class MLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(MLP, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activate = F.relu
        self.linear2 = nn.Linear(hidden_dim, num_class)
    def forward(self, inputs, offsets):
        embedding = self.embedding(inputs, offsets)
        hidden = self.activate(self.linear1(embedding))
        outputs = self.linear2(hidden)
        log_probs = F.log_softmax(outputs, dim=1)
        return log_probs

# tqdm是一个Python模块，能以进度条的方式显示迭代的进度
from tqdm.auto import tqdm

# 超参数设置
embedding_dim = 128
hidden_dim = 256
num_class = 2
batch_size = 32
num_epoch = 10

# 加载数据
train_data, test_data, vocab = load_sentence_polarity()
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

# 加载模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MLP(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device) # 将模型加载到CPU或GPU设备

#训练过程
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) # 使用Adam优化器

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, offsets, targets = [x.to(device) for x in batch]
        log_probs = model(inputs, offsets)
        loss = nll_loss(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# 测试过程
acc = 0
for batch in tqdm(test_data_loader, desc=f"Testing"):
    inputs, offsets, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = model(inputs, offsets)
        acc += (output.argmax(dim=1) == targets).sum().item()

# 输出在测试集上的准确率
print(f"Acc: {acc / len(test_data_loader):.2f}")

Training Epoch 0:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 166.03


Training Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 139.55


Training Epoch 2:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 103.97


Training Epoch 3:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 72.53


Training Epoch 4:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 46.50


Training Epoch 5:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 27.99


Training Epoch 6:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 15.25


Training Epoch 7:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 7.74


Training Epoch 8:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 3.91


Training Epoch 9:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 1.83


Testing:   0%|          | 0/2662 [00:00<?, ?it/s]

Acc: 0.73


In [17]:
class BowDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        return self.data[i]

def collate_fn(examples):
    inputs = [torch.tensor(ex[0]) for ex in examples]
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    offsets = [0] + [i.shape[0] for i in inputs]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    inputs = torch.cat(inputs)
    return inputs, offsets, targets

class MLP(nn.Module):
    def __init__(self, vocab, pt_vocab, pt_embeddings, hidden_dim, num_class):
        super(MLP, self).__init__()

        embedding_dim = pt_embeddings.shape[1] # 与预训练词向量维度保持一致
        vocab_size = len(vocab)
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim) # 词向量层
        self.embedding.weight.data.uniform_(-0.1, 0.1)

        for idx, token in enumerate(vocab.idx_to_token):
            pt_idx = pt_vocab[token]
            # 只初始化预训练词典中存在的词
            # 对于未出现在预训练词典中的词，保持其随机初始化数量
            if pt_idx != pt_vocab.unk:
                self.embedding.weight[idx].data.copy_(pt_embeddings[pt_idx])

        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activate = F.relu
        self.linear2 = nn.Linear(hidden_dim, num_class)
    def forward(self, inputs, offsets):
        embedding = self.embedding(inputs, offsets)
        hidden = self.activate(self.linear1(embedding))
        outputs = self.linear2(hidden)
        log_probs = F.log_softmax(outputs, dim=1)
        return log_probs

# tqdm是一个Python模块，能以进度条的方式显示迭代的进度
from tqdm.auto import tqdm

# 超参数设置
embedding_dim = 128
hidden_dim = 256
num_class = 2
batch_size = 32
num_epoch = 10

# 加载数据
train_data, test_data, vocab = load_sentence_polarity()
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

# 加载模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pt_vocab, pt_embeddings = load_pretrained("glove.vec")
model = MLP(vocab, pt_vocab, pt_embeddings, hidden_dim, num_class)
model.to(device) # 将模型加载到CPU或GPU设备

#训练过程
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) # 使用Adam优化器

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, offsets, targets = [x.to(device) for x in batch]
        log_probs = model(inputs, offsets)
        loss = nll_loss(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# 测试过程
acc = 0
for batch in tqdm(test_data_loader, desc=f"Testing"):
    inputs, offsets, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = model(inputs, offsets)
        acc += (output.argmax(dim=1) == targets).sum().item()

# 输出在测试集上的准确率
print(f"Acc: {acc / len(test_data_loader):.2f}")

Training Epoch 0:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 169.17


Training Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 143.06


Training Epoch 2:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 111.22


Training Epoch 3:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 83.90


Training Epoch 4:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 61.05


Training Epoch 5:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 42.61


Training Epoch 6:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 28.06


Training Epoch 7:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 17.66


Training Epoch 8:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 10.21


Training Epoch 9:   0%|          | 0/250 [00:00<?, ?it/s]

Loss: 5.37


Testing:   0%|          | 0/2662 [00:00<?, ?it/s]

Acc: 0.73
