## CS310 Natural Language Processing
## Lab 4 (part 2): Data preparation for implementing word2vec

skipgram architecture and negative sampling method

In [55]:
from typing import List
from pprint import pprint
from utils import CorpusReader
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [56]:
# Load the corpus file through the project's CorpusReader helper.
# We set min_count=1 to include all words in the corpus
corpus = CorpusReader(inputFileName="lunyu_20chapters.txt", min_count=1)

Total vocabulary: 1352


In [57]:
# Sanity-check the vocabulary mappings: word -> id, id -> word, and the vocabulary size.
print(corpus.word2id["子"])
print(corpus.id2word[1])
print(len(corpus.id2word))

1
子
1352


## Efficient way for negative sampling

In the `utils.CorpusReader` class, we have implemented a method `initTableNegatives`. It creates a list of word indices (`self.negatives`) with a size of about 1e8. The size is set to a large value so that the method scales to very large corpora.

The list contains the index of each word in the vocabulary. Each index is repeated so that its share of the list (i.e., its sampling probability) is proportional to the word's original frequency count raised to the power of 0.75.



In [58]:
# This is a simulation of how initTableNegatives works
# The impl. in utils.py is a bit different, but the idea is the same
# Toy vocabulary: word -> raw frequency count (word indices follow insertion order)
word_frequency = {"a": 1, "b": 2, "c": 3, "d": 4}

# the scaled sum of frequencies Z = 1**0.75 + 2**0.75 + 3**0.75 + 4**0.75 = 7.7897270
# then the scaled probability of a = 1**0.75 / Z = 0.12837420128374202
# the scaled probability of b = 2**0.75 / Z = 0.21589881215898812
# the scaled probability of c = 3**0.75 / Z = 0.29262990292629903
# the scaled probability of d = 4**0.75 / Z = 0.3630970836309708

def initTableNegatives(frequencies=None, table_size=None):
    """Build a shuffled table of word indices for negative sampling.

    Each word index is repeated in proportion to its frequency count raised
    to the power 0.75, so drawing uniformly from the table approximates
    sampling from the smoothed unigram distribution.

    Args:
        frequencies: mapping from word to raw frequency count. Word indices
            follow the mapping's iteration order. Defaults to the
            module-level ``word_frequency``.
        table_size: target number of entries in the table. Defaults to
            ``CorpusReader.NEGATIVE_TABLE_SIZE``.

    Returns:
        A shuffled 1-D numpy array of word indices. Each word's share is
        rounded independently, so the length can differ slightly from
        ``table_size``.
    """
    if frequencies is None:
        frequencies = word_frequency
    if table_size is None:
        table_size = CorpusReader.NEGATIVE_TABLE_SIZE

    # Raise every frequency count to the power 0.75.
    pow_frequency = np.array(list(frequencies.values())) ** 0.75
    # Scaled probability of each word.
    ratio = pow_frequency / pow_frequency.sum()
    # Number of slots each word occupies in the table.
    count = np.round(ratio * table_size).astype(np.int64)
    # np.repeat builds the table directly as an array; concatenating Python
    # lists would first materialize ~table_size boxed ints.
    negatives = np.repeat(np.arange(len(count)), count)
    np.random.shuffle(negatives)
    return negatives

# Build the simulated negative-sampling table for the toy vocabulary.
negatives = initTableNegatives()

In [59]:
# Inspect the table: its length, the distinct indices it holds, and the
# empirical share of each index (compare with the scaled probabilities).
print(len(negatives))
print(set(negatives)) # the word indices: a -> 0, b -> 1, c -> 2, d -> 3
print(np.sum(negatives == 0) / len(negatives)) # should be the scaled probability of a
print(np.sum(negatives == 1) / len(negatives)) # should be the scaled probability of b
print(np.sum(negatives == 2) / len(negatives)) # should be the scaled probability of c
print(np.sum(negatives == 3) / len(negatives)) # should be the scaled probability of d

99999999
{0, 1, 2, 3}
0.12837420128374202
0.21589881215898812
0.29262990292629903
0.3630970836309708


Next, the `getNegatives` method returns the negative samples for a target word. The idea is to chop off a segment of given `size` from the `negatives` list. 

If the segment contains the target word, it is discarded and a new segment is taken. This prevents the target word itself from being sampled as a negative.

In [60]:
# Test some examples
corpus.getNegatives(target=1, size=5)
# Returns `size` negative samples taken from the pre-built negative table;
# if the chosen segment contains the target word, that segment is discarded and a new one is taken

array([207, 344, 447,   0,  26])

## T1. Generate data for training

Now we are going to implement the sliding window to generate center, outside, and negative words for each position in a sentence.

- It takes a list of words as input and goes through each word as a center word.
- For each center word, both the left and right `window_size` words are considered as outside words. This number is smaller near the two ends of the sentence.
- Call `corpus.getNegatives` to get negative samples for each center word.

In [61]:
def generate_data(words: List[str], window_size: int, k: int, corpus: CorpusReader):
    """Yield skip-gram training triples for one tokenized sentence.

    Every position in ``words`` is used once as the center word. Each other
    position within ``window_size`` steps to the left or right (fewer near
    the ends of the sentence) contributes one outside word, and a fresh set
    of negative samples is requested for every (center, outside) pair.

    Args:
        words: the tokens of the sentence; each one is looked up in
            ``corpus.word2id``
        window_size: the number of positions considered on each side of the
            center word
        k: the number of negative samples per (center, outside) pair
        corpus: the corpus object, providing ``word2id`` and ``getNegatives``

    Yields:
        ``(center_id, outside_id, negative_samples)``, where
        ``negative_samples`` is the value returned by
        ``corpus.getNegatives(target=center_id, size=k)``.
    """
    ### START YOUR CODE ###
    # Map every token to its vocabulary id.
    token_ids = []
    for token in words:
        token_ids.append(corpus.word2id[token])
    total = len(token_ids)

    for center_pos in range(total):
        center_id = token_ids[center_pos]
        # Clip the context window to the sentence boundaries.
        first = max(0, center_pos - window_size)
        stop = min(total, center_pos + window_size + 1)  # exclusive bound

        for ctx_pos in range(first, stop):
            if ctx_pos == center_pos:
                continue  # the center word is not its own context
            negatives = corpus.getNegatives(target=center_id, size=k)
            # One training example: (center_word, outside_word, negative_samples)
            yield (center_id, token_ids[ctx_pos], negatives)
    ### END YOUR CODE ###

In [62]:
# Test generate_data
# Split the sentence into single-character tokens and show their vocabulary ids.
text = "学而时习之"
words = list(text)
print('words:', words)
print('word ids:', [corpus.word2id[word] for word in words])

# first center word is 学
print()
print(f'When window size is 3, for center word 学 -> {corpus.word2id["学"]}')
print(f'the outside words are: ')
print(f'而 -> {corpus.word2id["而"]}')
print(f'时 -> {corpus.word2id["时"]}')
print(f'习 -> {corpus.word2id["习"]}')

# Materialize the generator and show the first three (center, outside, negatives) triples.
print()
print('output from generate_data:')
data = list(generate_data(list(text), window_size=3, k=5, corpus=corpus))
pprint(data[:3])


### You are expected to see the following output:
### Note that the negative samples are random, so you may see different numbers
# words: ['学', '而', '时', '习', '之']
# word ids: [46, 8, 224, 544, 5]

# When window size is 3, for center word 学 -> 46
# the outside words are: 
# 而 -> 8
# 时 -> 224
# 习 -> 544

# output from generate_data:
# [(46, 8, array([354,   3, 831, 570,  27])),
#  (46, 224, array([1077, 1095,   89,  340,   92])),
#  (46, 544, array([ 49, 488,   4, 269,  30]))]

words: ['学', '而', '时', '习', '之']
word ids: [46, 8, 224, 544, 5]

When window size is 3, for center word 学 -> 46
the outside words are: 
而 -> 8
时 -> 224
习 -> 544

output from generate_data:
[(46, 8, array([173,  17, 814, 185, 230])),
 (46, 224, array([  41,  273, 1165,  111,  457])),
 (46, 544, array([  0, 433, 444, 230,   2]))]


However, the above data are not batched. We want all center words to be batched into a tensor of dimension `batch_size`; the same goes for the outside words and negative samples.

For example, in "学而时习之", if `batch_size` is 4, then the returned batch[0] will contain three tensors. 
- The first tensor contains center words, i.e., 3 "学" plus 1 "而" => [46, 46, 46, 8]
- The second tensor contains the corresponding outside words, i.e., "而", "时", and "习" for "学"; "学" for "而" => [8, 224, 544,  46]
- The third tensor contains the negative samples, whose dimension is `batch_size` $\times$ `k`
  
The data type of the tensors is `torch.long`.

In [63]:
def batchify(data: List, batch_size: int):
    """Group training examples into fixed-size batches of torch tensors.

    Args:
        data: a list of ``(center_id, outside_id, negative_ids)`` tuples, e.g.
            the materialized output of ``generate_data``. It must support
            ``len()`` and slicing, so a bare generator is not accepted.
        batch_size: the number of examples per batch; must be smaller than
            ``len(data)``.

    Yields:
        a tuple of three ``torch.long`` tensors ``(center, outside, negative)``
        with shapes ``(batch_size,)``, ``(batch_size,)`` and
        ``(batch_size, k)``.
    """
    assert batch_size < len(data) # data should be long enough
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        # The last batch may be short; pad it with examples from the beginning.
        shortfall = batch_size - len(batch)
        if shortfall > 0:
            batch = batch + data[:shortfall]

        ### START YOUR CODE ###
        # Transpose the list of triples into three parallel sequences.
        centers, outsides, negatives = zip(*batch)

        center_tensor = torch.tensor(centers, dtype=torch.long)
        outside_tensor = torch.tensor(outsides, dtype=torch.long)
        # Stack the per-example arrays into one 2-D ndarray first: building a
        # tensor straight from a list of ndarrays is the slow path that
        # PyTorch warns about.
        negative_tensor = torch.tensor(np.array(negatives), dtype=torch.long)

        yield (center_tensor, outside_tensor, negative_tensor)
        ### END YOUR CODE ###

In [64]:
# Test batchify

# Build the full list of training triples for one sentence.
text = "学而时习之"
words = list(text)
data = list(generate_data(words, window_size=3, k=5, corpus=corpus))

# Group the triples into batches of 4 and show the first batch.
batches = list(batchify(data, batch_size=4))
print(batches[0])


### You are expected to see the following output:
### Note that the negative samples are random, so you may see different numbers
# (tensor([46, 46, 46,  8]), tensor([  8, 224, 544,  46]), tensor([[  85,    3,   72,   26,   35],
#         [   7,    1,  487,   20,    4],
#         [  12,  227,    2,   25,  639],
#         [ 582,  148,   15, 1203,   85]]))

(tensor([46, 46, 46,  8]), tensor([  8, 224, 544,  46]), tensor([[  75,   44,  134,  267,   58],
        [ 137,   84,    0,  505,    4],
        [   2,  357,    1,  209,  559],
        [  49,   19,    3,   78, 1085]]))


## T2. Implement the SkipGram class

`SkipGram` is a subclass of `nn.Module`. The two key components are:
- `__init__`: initialize the embeddings
  - Two `nn.Embedding` objects are created: `self.emb_v` for center words; `self.emb_u` for outside words and negative samples.
  - Each `nn.Embedding` is created with `vocab_size` and `emb_dim` as input arguments. 
  - `self.emb_v` is initialized with uniform distribution; `self.emb_u` is initialized with zeros.
- `forward`: given input tensors, return the loss of the model
  - Takes three tensors as input: center words, outside words, and negative samples. They are the output from the previously defined `batchify` function.
  - Compute the loss using the formula: $-\log\sigma(v_c \cdot u_o) - \sum_{k=1}^K \log\sigma(-v_c \cdot u_k)$

*Hint*:
- For the $\log\sigma$ function, you can use `F.logsigmoid` in PyTorch. See the imported module: `import torch.nn.functional as F`
- If the input to `F.logsigmoid` is too large, it will return 0, which is not good for training. You can use `torch.clamp` to limit the input to a certain range. For example, `torch.clamp(x, min=-10, max=10)` will limit the input to be in the range of $[-10, 10]$.

SkipGram模型是一个用于训练词向量的经典模型，通常在Word2Vec算法中使用。其目标是通过给定的中心词预测周围词（即上下文词）。它的核心思想是，给定一个中心词，模型通过预测其上下文词来学习词嵌入（word embeddings）。

In [None]:
class SkipGram(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two embedding tables of shape ``(vocab_size, emb_size)``:
    ``emb_v`` for center words and ``emb_u`` for outside words and negative
    samples.
    """

    def __init__(self, vocab_size, emb_size):
        """Create and initialize the two embedding tables.

        Args:
            vocab_size: the number of words in the vocabulary
            emb_size: the dimension of each embedding vector
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        # Embedding table for center words
        self.emb_v = nn.Embedding(vocab_size, emb_size, sparse=True)
        # Embedding table for outside words and negative samples
        self.emb_u = nn.Embedding(vocab_size, emb_size, sparse=True)

        initrange = 1.0 / self.emb_size # empirical rule of thumb for the init range
        # emb_v starts from a uniform distribution over [-initrange, initrange]
        nn.init.uniform_(self.emb_v.weight.data, -initrange, initrange) # same outcome as self.emb_v.weight.data.uniform_(-initrange, initrange)
        # emb_u starts from all zeros
        nn.init.constant_(self.emb_u.weight.data, 0) # same outcome as self.emb_u.weight.data.zero_()

    def forward(self, center, outside, negative):
        """Compute the negative-sampling loss for every example in the batch.

        Args:
            center: the center word indices (B, ), where B is the batch size
            outside: the outside word indices (B, )
            negative: the negative word indices (B, k)

        Returns:
            A tensor of shape (B, ) holding, per example,
            ``-log sigmoid(v_c . u_o) - sum_k log sigmoid(-v_c . u_k)``.
            The loss is not reduced over the batch.
        """
        v_c = self.emb_v(center)   # (B, emb_size)
        u_o = self.emb_u(outside)  # (B, emb_size)
        u_n = self.emb_u(negative) # (B, k, emb_size)

        ### START YOUR CODE ###
        # Positive score: dot product of the center and outside vectors.
        # Element-wise product followed by a sum over the embedding axis -> (B, )
        positive_score = torch.sum(v_c * u_o, dim=1)
        # Clamp the score to [-10, 10] before applying logsigmoid.
        positive_score = torch.clamp(positive_score, min=-10, max=10)
        positive_loss = -F.logsigmoid(positive_score)

        # Negative scores: dot product of the center vector with each negative vector.
        # (B, 1, emb_size) @ (B, emb_size, k) -> (B, 1, k); squeeze(1) -> (B, k)
        negative_score = torch.bmm(v_c.unsqueeze(1), u_n.transpose(1, 2)).squeeze(1)
        negative_score = torch.clamp(negative_score, min=-10, max=10)
        # Sum the k negative terms of each example -> (B, )
        negative_loss = -F.logsigmoid(-negative_score).sum(dim=1)
        loss = positive_loss + negative_loss
        ### END YOUR CODE ###

        return loss

    def save_embedding(self, id2word, file_name):
        """Write the center-word embeddings (``emb_v``) to a text file.

        The first line is ``"<number of words> <emb_size>"``; every following
        line is a word followed by its space-separated vector components.

        Args:
            id2word: mapping from word id to word
            file_name: the path of the output file
        """
        embedding = self.emb_v.weight.detach().cpu().numpy()
        # Write as UTF-8 explicitly: the vocabulary contains non-ASCII words
        # and the platform default encoding may not be able to represent them.
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write('%d %d\n' % (len(id2word), self.emb_size))
            for wid, w in id2word.items():
                e = ' '.join(map(str, embedding[wid]))
                f.write('%s %s\n' % (w, e))

In [80]:
# Test the model
vacob_size =len(corpus.id2word)
emb_size = 32
model = SkipGram(vacob_size, emb_size)

# Deterministic weights: every component of row i equals 0.01 + 0.01 * i.
weight = torch.empty(vacob_size, emb_size)
start_value = 0.01
for i in range(vacob_size):
    weight[i] = start_value + i * 0.01

# Overwrite both embedding tables so the loss below is reproducible.
model.emb_v.weight.data.copy_(weight)
model.emb_u.weight.data.copy_(weight)

# Run a forward pass on a fixed batch of 5 examples with 5 negatives each
center = torch.tensor([0, 1, 2, 3, 4])
outside = torch.tensor([0, 1, 2, 3, 4])
negative = torch.tensor([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]])
with torch.no_grad():
    loss = model(center, outside, negative)
print(loss)


### You are expected to see the following output:
# tensor([4.1814, 4.2008, 4.2172, 4.2307, 4.2414])

tensor([4.1814, 4.2008, 4.2172, 4.2307, 4.2414])
