In [3]:
# Import PyTorch under the alias `to` and print the installed version.
import torch as to

print(to.__version__)

1.8.1


# 使用 PyTorch 搭建 skip-gram 模型进行 embedding 转换

- 目的：词汇embedding转换
- 输入：输入文本(text)
- 输出：embedding_lookup矩阵(model.state_dict()["in_embed.weight"])
- 主要步骤：
1. 文本预处理
2. 获取批次训练数据
3. 构建模型及损失函数
4. 模型训练
5. 词向量可视化

## 0.全局变量设置

In [None]:
EMBEDDING_DIM = 2  # dimensionality of each word vector
PRINT_EVERY = 1000  # visualize once every this many training steps
EPOCHS = 1000  # number of training epochs
BATCH_SIZE = 5  # number of input words in each training batch
N_SAMPLES = 3  # number of negative samples
WINDOW_SIZE = 5  # size of the context-word window
FREQ = 0  # threshold for removing low-frequency words
Dropout_WORDS = False  # whether to subsample (drop) high-frequency words

## 1.文本预处理

In [None]:
# Input corpus: a small whitespace-separated token stream used as training text.
# The literal must not contain quote characters of its own: a leading or trailing
# '"' would glue onto the first/last word and create bogus vocabulary entries
# such as '"i' and 'like"' once the text is split on whitespace.
text = (
    "i like dog i like cat i like animal dog cat animal apple cat dog like dog fish milk like dog "
    "cat eyes like i like apple apple i hate apple i movie book music like cat dog hate cat dog like"
)

In [None]:
from collections import Counter
# Text preprocessing
def preprocess(text, FREQ):
    """Lower-case and tokenize ``text``, keeping only sufficiently frequent words.

    Args:
        text: Raw input string; tokens are separated by whitespace.
        FREQ: Frequency threshold. A token is kept only when its number of
            occurrences in ``text`` is strictly greater than this value.

    Returns:
        The list of kept tokens, in their original order (duplicates preserved).
    """
    tokens = text.lower().split()
    occurrences = Counter(tokens)

    kept = []
    for token in tokens:
        if occurrences[token] > FREQ:
            kept.append(token)
    return kept

In [None]:
# Build a one-to-one mapping between words and integer indices
def word_id_mappind(words):
    """Encode a word sequence as a sequence of integer ids.

    Ids are assigned consecutively from 0 in order of first appearance, so the
    first distinct word gets id 0, the second distinct word id 1, and so on.

    Args:
        words: Iterable sequence of hashable tokens (iterated twice, so it
            must be re-iterable, e.g. a list).

    Returns:
        A list with one id per input word, in the same order as ``words``.
        The lookup table itself is local to this function and is not returned.
    """
    word2id = {}
    for word in words:
        if word not in word2id:
            # The next free id equals the number of words registered so far.
            word2id[word] = len(word2id)
    return [word2id[word] for word in words]

In [None]:
# 计算单词的频次及负采样概率（抽样及负采样需要）
import numpy as np
import torch
def figure_freq(id_words):
    """Compute relative word frequencies and the negative-sampling distribution.

    Args:
        id_words: Sequence of word ids (one entry per token of the corpus).

    Returns:
        A tuple ``(word_freqs, noise_probs)``:

        * ``word_freqs`` -- dict mapping each distinct id to its relative
          frequency (occurrences / total number of tokens); values sum to 1.
        * ``noise_probs`` -- 1-D torch tensor holding ``freq ** 0.75``
          normalized to sum to 1. Entries follow the iteration order of
          ``word_freqs``, i.e. the order in which ids first appear in
          ``id_words``.
    """
    cnt_words = Counter(id_words)
    # Normalize by the total number of tokens, not by the vocabulary size,
    # so that the frequencies form a proper probability distribution.
    cnt_total = sum(cnt_words.values())
    word_freqs = {word: cnt / cnt_total for word, cnt in cnt_words.items()}

    word_probs = np.array(list(word_freqs.values()))
    # Raise the unigram distribution to the 3/4 power and renormalize.
    powered = word_probs ** 0.75
    noise_probs = torch.from_numpy(powered / np.sum(powered))

    return word_freqs, noise_probs

In [4]:
# 丢弃部分单词
import random
def dropout_words(id_words, word_freqs=None, enabled=None):
    """Randomly discard frequent words (subsampling) from the training sequence.

    Each word ``w`` is dropped with probability ``1 - sqrt(t / f(w))`` where
    ``f(w)`` is its relative frequency and ``t = 1e-5``; words whose frequency
    is at or below ``t`` are therefore always kept.

    Args:
        id_words: Sequence of word ids.
        word_freqs: Optional dict mapping each id to its relative frequency.
            When omitted, the frequencies are computed from ``id_words``.
        enabled: Whether to subsample. When ``None`` (the default) the
            module-level ``Dropout_WORDS`` flag decides.

    Returns:
        ``id_words`` itself when subsampling is disabled, otherwise a new list
        containing the surviving ids in their original order.

    Raises:
        KeyError: If ``word_freqs`` is given but lacks an id from ``id_words``.
    """
    if enabled is None:
        enabled = Dropout_WORDS
    if not enabled:
        return id_words

    if word_freqs is None:
        counts = Counter(id_words)
        total = sum(counts.values())
        word_freqs = {word: cnt / total for word, cnt in counts.items()}

    t = 1e-5
    # One drop probability per distinct word rather than per token.
    drop_probs = {word: 1 - np.sqrt(t / freq) for word, freq in word_freqs.items()}
    return [word for word in id_words if random.random() > drop_probs[word]]

NameError: name 'Dropout_WORDS' is not defined

In [None]:
# Get the context words surrounding the input word at a given index
def get_targets(words, idx, window_size):
    """Return the words within ``window_size`` positions of ``words[idx]``.

    The window is symmetric: up to ``window_size`` words on the left and up to
    ``window_size`` words on the right, clipped at the sequence boundaries.
    The center word itself is excluded.

    Args:
        words: List of words (or ids); must support slicing and ``+``.
        idx: Position of the center (input) word.
        window_size: Maximum number of context words taken on each side.

    Returns:
        A new list with the left context followed by the right context.
    """
    # Clamp at 0 so a negative start does not wrap around to the end.
    start_idx = max(idx - window_size, 0)
    # Slice ends are exclusive, hence +1 to include the word at idx + window_size.
    end_idx = idx + window_size + 1
    targets = words[start_idx:idx] + words[idx + 1:end_idx]
    return targets

In [None]:
# Generate batches of (input, output) word pairs
def get_batch(words, batch_size, window_size):
    """Yield skip-gram training pairs, one batch of input words at a time.

    The sequence is truncated to a whole number of batches. For every word in
    a batch, its context words are looked up with ``get_targets`` *within that
    batch only*, and the input word is repeated once per context word so that
    inputs and outputs line up element by element.

    Args:
        words: List of words (or ids).
        batch_size: Number of input words per batch.
        window_size: Context window size forwarded to ``get_targets``.

    Yields:
        ``(batch_X, batch_y)`` -- two lists of equal length holding the input
        words and their corresponding context words.
    """
    n_batches = len(words) // batch_size
    # Drop the trailing words that do not fill a complete batch.
    words = words[:n_batches * batch_size]

    for idx in range(0, len(words), batch_size):
        batch_X, batch_y = [], []
        cur_batch = words[idx:idx + batch_size]
        for jdx, x in enumerate(cur_batch):
            y = get_targets(cur_batch, jdx, window_size)
            # One input maps to several outputs; repeat the input so both
            # lists have the same length.
            batch_X.extend([x] * len(y))
            batch_y.extend(y)
        yield batch_X, batch_y