In [7]:
import torch
import torch.nn as nn
import torch.optim as optim


In [18]:
class Embedding(nn.Module):
    def __init__(self, input_dim, emb_dim):
        """
        初始化 Embedding 层。
        :param input_dim: 输入特征的维度。
        :param emb_dim: 嵌入的维度。
        """
        super(Embedding, self).__init__()
        self.input_matrix = nn.Parameter(torch.randn(input_dim,emb_dim)) #输入特征的嵌入矩阵
        self.output_matrix = nn.Parameter(torch.randn(emb_dim,input_dim)) #输出特征的嵌入矩阵


    def forward(self,x):
        """
        前向传播函数。
        :param x: 输入张量，形状为 (batch_size, input_dim)。
        :return: 返回样本与整个词汇表的点积，形状为 (batch_size, input_dim)。
        """
        embedding = torch.matmul(x,self.input_matrix) #获取输入特征的嵌入表示
        product = torch.matmul(embedding, self.output_matrix) #计算输入特征与整个词汇表的点积
        return torch.softmax(product,-1) #对点积进行softmax归一化 (batch_size, input_dim)

        

# 定义超参数
vocab_size = 3  # 词汇表大小
embedding_dim = 2  # 嵌入维度
batch_size = 2  # 批量大小
seq_len = 3  # 序列长度
epochs = 10  # 训练轮数

# 实例化模型
model = Embedding(input_dim=vocab_size, emb_dim=embedding_dim)
model(torch.tensor([1,0,0],dtype= torch.float))



tensor([0.4133, 0.1791, 0.4077], grad_fn=<SoftmaxBackward0>)

In [76]:
import collections
import numpy as np

def subsample(words, freq_threshold=1e-2):
    """
    子采样函数
    :param words: 单词索引列表
    :param freq_threshold: 频率阈值
    :return: 子采样后的单词索引列表
    """
    # 使用 collections.Counter 统计每个单词的频率
    word_counts = collections.Counter(words)
    total_words = len(words)
    
    # 计算每个单词的频率
    word_freq = {word: count / total_words for word, count in word_counts.items()}
    
    # 计算丢弃概率
    drop_prob = {word: 1 - np.sqrt(freq_threshold / freq) for word, freq in word_freq.items()}
    
    # 子采样
    subsampled_words = [word for word in words if np.random.random() >  drop_prob[word]]
    print(drop_prob)
    
    return subsampled_words

# 示例：子采样
words = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]  # 假设的单词索引列表
subsampled_words = subsample(words)
print("Original words:", words)
print("Subsampled words:", subsampled_words)

{1: 0.6127016653792583, 2: 0.726138721247417, 3: 0.7763932022500211, 4: 0.8063508326896291, 5: 0.8267949192431123}
Original words: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]
Subsampled words: [2, 4, 4, 5]


In [85]:
def get_unigram_distribution(words,power):
    """
    子采样函数
    :param words: 单词索引列表
    :param power: 次方
    :return: 采样分布
    """
    # 使用 collections.Counter 统计每个单词的频率
    word_counts = collections.Counter(words)
    total_words = len(words)
    
    # 计算每个单词的频率
    word_freq_power = {word: np.power(count / total_words, power) for word, count in word_counts.items()}
    word_sum = sum(word_freq_power.values())

    # 计算每个单词的概率
    word_prob = {word: freq_power / word_sum for word, freq_power in word_freq_power.items()}

    return word_prob

def get_negative_samples(word_prob, k = 5):
    """
    从采样分布中随机抽取负样本
    :param word_prob: 采样分布, 字典形式, key为单词索引, value为概率
    :param words: 单词索引列表
    :param num_samples: 每个单词的负样本数量
    :return: 负样本列表
    """
    
    # 从采样分布中随机抽取负样本
    word = list(word_prob.keys())
    prob = list(word_prob.values())

    print(word,prob)
    negative_samples =  np.random.choice(word, size=k, p=prob)
    return negative_samples

words = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]  # 假设的单词索引列表
power = 0.75  # 次方

# 计算 unigram 分布
word_prob = get_unigram_distribution(words, power)

# 采样
num_samples = 3  # 需要采样的单词数量
samples = get_negative_samples(word_prob, num_samples)

print("Original words:", words)
print("Unigram distribution:", word_prob)
print("Sampled words:", samples)

[1, 2, 3, 4, 5] [0.08981959121294449, 0.1510579445410381, 0.2047443920227003, 0.2540481681203067, 0.30032990410301025]
Original words: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]
Unigram distribution: {1: 0.08981959121294449, 2: 0.1510579445410381, 3: 0.2047443920227003, 4: 0.2540481681203067, 5: 0.30032990410301025}
Sampled words: [4 3 5]
