In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math

import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
USE_CUDA = torch.cuda.is_available()

# Seed every RNG in use (Python, NumPy, PyTorch CPU and CUDA) for reproducibility.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)

# Hyperparameters
C = 3  # context window: number of words taken on each side of the center word
K = 100  # number of negative samples drawn for every positive (context) word
NUM_EPOCHS = 2
MAX_VOCAB_SIZE = 30000
BATCH_SIZE = 128
LEARNING_RATE = 0.2
EMBEDDING_SIZE = 100
LOG_FILE = 'word-embedding.log'  # the training loop appends loss / evaluation lines here

In [4]:
torch.__version__

'1.0.1'

# Preprocessing

In [5]:
def word_tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

- 从文本中读取所有的文字，通过这些文本创建一个vocabulary；
- 由于单词数量可能很大，我们选取最常见的MAX_VOCAB_SIZE个单词；
- 我们添加一个UNK单词表示所有不常见的单词；
- 我们需要记录单词到index的mapping，index到单词的mapping，单词的count，单词的(normalized) frequency以及单词总数。

In [6]:
# Load the whole training corpus into memory as one string. The encoding is given
# explicitly so decoding does not depend on the platform's default codec.
with open(file='./text8/text8.train.txt', mode='r', encoding='utf-8') as fin:
    text = fin.read()

In [7]:
# 从文本中读取所有的文字，分词

In [8]:
# Tokenize the corpus. `text` is rebound from a single string to a list of tokens,
# so the corpus must be reloaded before this cell can be run again.
text = list(word_tokenize(text=text))

In [7]:
# 通过这些文本创建一个vocabulary

In [8]:
# Keep the MAX_VOCAB_SIZE - 1 most frequent words (word -> count), most frequent first;
# one slot is left for the '<unk>' entry.
vocab = {word: count for word, count in Counter(text).most_common(MAX_VOCAB_SIZE - 1)}

In [9]:
# Every token outside the kept vocabulary is counted as '<unk>'. An existing '<unk>'
# entry (present when this cell is re-run) is excluded from the sum so that the
# count does not collapse to 0 on a second execution.
vocab['<unk>'] = len(text) - sum(count for word, count in vocab.items() if word != '<unk>')

In [10]:
# 构建词汇表

In [11]:
# 构建词汇的mapping

In [12]:
# Index -> word: the vocabulary words in dict insertion order
idx_to_word = list(vocab)

In [13]:
idx_to_word[:10]

['the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero', 'nine', 'two']

In [14]:
# Word -> index: the inverse of idx_to_word
word_to_idx = dict(zip(idx_to_word, range(len(idx_to_word))))

In [15]:
list(word_to_idx.items())[:10]

[('the', 0),
 ('of', 1),
 ('and', 2),
 ('one', 3),
 ('in', 4),
 ('a', 5),
 ('to', 6),
 ('zero', 7),
 ('nine', 8),
 ('two', 9)]

In [16]:
# 计算每一个单词的frequency

In [17]:
# Raw counts as float32, in the same order as idx_to_word
word_counts = np.array(list(vocab.values()), dtype=np.float32)

In [18]:
# Normalized unigram frequencies (sum to 1)
word_freqs = word_counts / word_counts.sum()

In [19]:
# Negative-sampling distribution: unigram frequency raised to the 3/4 power, then
# renormalized so it sums to 1. It is computed from word_counts (not from the previous
# word_freqs) so re-running this cell does not apply the exponent twice.
word_freqs = (word_counts / np.sum(word_counts)) ** (3./4.)
word_freqs = word_freqs / np.sum(word_freqs)  # used for negative sampling

In [20]:
# 查看预处理之后的词汇数是否有MAX_VOCAB_SIZE那么多

In [21]:
# Vocabulary size after preprocessing; the bare name on the last line displays it
VOCAB_SIZE = len(idx_to_word)
VOCAB_SIZE

30000

# Dataloader

当我们在训练的时候，我们需要一个batch一个batch的数据，我们可以使用Pytorch实现一个Dataloader，需要以下内容：

- 把所有text编码成数字，然后用subsampling预处理这些文字；
- 保存vocabulary，单词count，normalized word frequency；
- 每个iteration sample一个中心词；
- 根据当前的中心词返回context单词；
- 根据当前中心词sample一些negative单词；
- 返回单词的counts

为了使用Dataloader，我们需要定义以下两个function：

- ```__len__``` function需要返回整个数据集中有多少个item；
- ```__getitem__``` function需要根据给定的index返回一个item

    
有了Dataloader之后，我们可以轻松随机打乱整个数据集，拿到一个batch的数据等等。

In [22]:
class WordEmbeddingDataset(tud.Dataset):
    """Skip-gram dataset over an encoded corpus.

    Each item is a (center word, context words, negative samples) triple of
    index tensors. The window size and the number of negatives per context word
    come from the module-level constants ``C`` and ``K``.
    """

    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        """Encode ``text`` as word indices and keep the vocabulary statistics.

        :param text: iterable of tokens
        :param word_to_idx: mapping word -> index; must contain '<unk>'
        :param idx_to_word: sequence mapping index -> word
        :param word_freqs: per-word sampling weights used for negative sampling
        :param word_counts: per-word counts
        """
        super().__init__()
        # Words missing from the mapping are encoded as the '<unk>' index.
        encoded = []
        for token in text:
            encoded.append(word_to_idx.get(token, word_to_idx['<unk>']))
        self.text_encoded = torch.LongTensor(encoded)
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        """Number of items: one per token of the encoded corpus."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        """Return the training triple for the token at position ``idx``.

        - center_word: the encoded word at ``idx``
        - pos_words: the C words before and the C words after it; positions
          wrap around the ends of the corpus
        - neg_words: K indices per context word, drawn with replacement from
          ``word_freqs`` (nothing excludes the center or context words)
        """
        corpus_len = len(self.text_encoded)
        before = range(idx - C, idx)
        after = range(idx + 1, idx + C + 1)
        # Modulo keeps window positions valid at both ends of the corpus.
        context_positions = [pos % corpus_len for pos in (*before, *after)]
        pos_words = self.text_encoded[context_positions]
        # torch.multinomial returns indices into word_freqs, i.e. word indices.
        num_negatives = K * pos_words.shape[0]
        neg_words = torch.multinomial(self.word_freqs, num_negatives, replacement=True)
        center_word = self.text_encoded[idx]
        return center_word, pos_words, neg_words

In [23]:
# 创建Dataset和Dataloader

In [29]:
# Wrap the corpus in the skip-gram dataset and serve it in shuffled batches.
dataset = WordEmbeddingDataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [30]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x1bcf466aeb8>

In [32]:
next(iter(dataloader))

[tensor([  444,  1228,   275, 24695, 29999,    51,     0,     2,    24,    16,
             9,  2120,  2128, 29999,    31,   544,     0,   359,  2225,   641,
            10,  5100,   106,  9018,    14,     0, 20506,  2728,  1015,    33,
          9262,   201,  7064,    80,  1210, 18818,  2131,  3345,    26,  7813,
           980,   257,    19,    24,  5172,  3779,    64,     1,     5,     1,
          1512,   126,    26,    10,     6,   144, 29999,     5,    68,    27,
            20,     0,  3241,   564,   699,     4,    71, 13595,   296, 29999,
           257,  3921,  2047, 29999,     0,  6222,     3,    19,  6167,     4,
            17,     1,   238,  2388,     1, 21971,   372, 29999,   155,  3444,
            26,    14,    71,   452,   359,     9,     5, 29999,  3259,   409,
           610,     5,    49,    71,    35,     3,  5816, 29999,    94,  2243,
            73,  4319,    36,    10,    42,   136,  1500,  2725,   185,     5,
           764,   238,  4301,  8969, 29999,  1009,  

# 定义PyTorch模型

In [12]:
# Define the PyTorch model
class EmbeddingModel(nn.Module):
    """Skip-gram model trained with negative sampling.

    Holds two embedding tables of shape [vocab_size, embed_size]: ``in_embed``
    is looked up for center words, ``out_embed`` for context and negative words.
    """

    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size

        # Both tables start uniformly in [-0.5 / embed_size, 0.5 / embed_size].
        init_range = 0.5 / self.embed_size
        self.in_embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=embed_size, sparse=False)
        self.in_embed.weight.data.uniform_(-init_range, init_range)
        self.out_embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=embed_size)
        self.out_embed.weight.data.uniform_(-init_range, init_range)

    def forward(self, input_labels, pos_labels, neg_lables):
        """Compute the negative-sampling loss for a batch.

        :param input_labels: center word indices, [batch_size]
        :param pos_labels: context word indices, [batch_size, (window_size * 2)]
        :param neg_lables: negative word indices, [batch_size, (window_size * 2 * K)]
            (the misspelled name is kept so keyword callers keep working)
        :return: loss per example, [batch_size]
        """
        input_embedding = self.in_embed(input_labels)  # [batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels)  # [batch_size, (window_size * 2), embed_size]
        neg_embedding = self.out_embed(neg_lables)  # [batch_size, (window_size * 2 * K), embed_size]

        input_embedding = input_embedding.unsqueeze(2)  # [batch_size, embed_size, 1]
        # Squeeze only the trailing singleton produced by bmm. A bare squeeze() would
        # also drop the batch (or context) dimension whenever its size is 1, and the
        # sum over dim 1 below would then fail.
        pos_dot = torch.bmm(pos_embedding, input_embedding).squeeze(2)  # [batch_size, (window_size * 2)]
        # Negating the center vector gives logsigmoid(-u.v) for the negative words.
        neg_dot = torch.bmm(neg_embedding, -input_embedding).squeeze(2)  # [batch_size, (window_size * 2 * K)]

        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(neg_dot).sum(1)
        loss = log_pos + log_neg

        return -loss

    def input_embedding(self):
        """Return the ``in_embed`` weight matrix as a NumPy array on the CPU."""
        return self.in_embed.weight.data.cpu().numpy()

In [None]:
# Build the model and move it to the GPU when CUDA is available
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()


# Evaluate the model
def evaluate(filename, embedding_weights):
    """Score embeddings against a word-similarity benchmark file.

    The file is read with pandas: comma-separated when ``filename`` ends in
    ``.csv``, tab-separated otherwise. The first two columns are taken as the
    word pair and the third as the reference similarity score. Pairs with a
    word missing from the module-level ``word_to_idx`` are skipped.

    :param filename: path of the benchmark file
    :param embedding_weights: 2-D array whose row ``i`` is the vector of word index ``i``
    :return: the result of ``scipy.stats.spearmanr(reference scores, model scores)``
    """
    separator = ',' if filename.endswith('.csv') else '\t'
    data = pd.read_csv(filename, sep=separator)
    human_similarity = []
    model_similarity = []
    # Iterate by position: iloc is positional, so index labels must not be used here.
    for row in range(len(data)):
        word1, word2 = data.iloc[row, 0], data.iloc[row, 1]
        if word1 not in word_to_idx or word2 not in word_to_idx:
            continue
        # Index with a list to keep each vector 2-D (1 x embed_size) for cosine_similarity.
        word1_embed = embedding_weights[[word_to_idx[word1]]]
        word2_embed = embedding_weights[[word_to_idx[word2]]]
        # The result is a 1x1 matrix; take the scalar explicitly, because float() on a
        # non-0-d array is deprecated in recent NumPy.
        model_similarity.append(float(cosine_similarity(word1_embed, word2_embed)[0, 0]))
        human_similarity.append(float(data.iloc[row, 2]))
    return scipy.stats.spearmanr(human_similarity, model_similarity)


def find_nearest(word):
    """Return the 10 vocabulary words with the smallest cosine distance to ``word``.

    Reads the module-level ``word_to_idx``, ``idx_to_word`` and
    ``embedding_weights``. ``word`` itself is not excluded from the result.
    Raises ``KeyError`` when ``word`` is not in ``word_to_idx``.
    """
    query_vector = embedding_weights[word_to_idx[word]]
    distances = np.array([scipy.spatial.distance.cosine(row, query_vector) for row in embedding_weights])
    closest = distances.argsort()[:10]
    return [idx_to_word[j] for j in closest]


# Train the model
# 1. Train for NUM_EPOCHS epochs; every epoch iterates over the batches of `dataloader`,
#    moving each batch to the GPU when CUDA is available.
# 2. Forward pass: the model returns the negative-sampling loss per example, which is
#    averaged over the batch.
# 3. Clear the gradients, run the backward pass and update the parameters with SGD.
# 4. Every 100 iterations log the loss; every 2000 iterations evaluate the input
#    embeddings on the word-similarity benchmarks and log the nearest words to "monster".
# 5. After each epoch save the input embeddings and the model state dict.
# Requires LOG_FILE (path of the text log) to be defined before this cell runs.
optimizer = optim.SGD(params=model.parameters(), lr=LEARNING_RATE)
for e in range(NUM_EPOCHS):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        if USE_CUDA:
            input_labels = input_labels.cuda()
            pos_labels = pos_labels.cuda()
            neg_labels = neg_labels.cuda()

        # The optimisation step must run on CPU as well, so it sits outside the CUDA branch.
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()  # calls forward()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            with open(file=LOG_FILE, mode='a', encoding='UTF-8') as f_out:
                f_out.write('Epoch: {}, Iteration: {}, Loss: {} + \n'.format(e, i, loss.item()))
                print(f'Epoch: {e}, Iteration: {i}, Loss: {loss.item()}')

        if i % 2000 == 0:
            # find_nearest reads the module-level `embedding_weights` assigned here.
            embedding_weights = model.input_embedding()
            sim_simlex = evaluate(filename='simlex-999.txt', embedding_weights=embedding_weights)
            sim_men = evaluate(filename='men.txt', embedding_weights=embedding_weights)
            sim_353 = evaluate(filename='wordsim353.csv', embedding_weights=embedding_weights)
            # Compute the neighbours once; the search scans the whole vocabulary.
            nearest_to_monster = find_nearest(word="monster")
            with open(file=LOG_FILE, mode='a', encoding='UTF-8') as f_out:
                print(f'Epoch: {e}, Iteration: {i}, sim_simlex: {sim_simlex}, sim_men: {sim_men}, sim_353: {sim_353}, nearest to monster: {nearest_to_monster} + \n')
                f_out.write('Epoch: {}, Iteration: {}, sim_simlex: {}, sim_men: {}, sim_353: {}, nearest to monster: {} + \n'.format(
                    e, i, sim_simlex, sim_men, sim_353, nearest_to_monster))

    # Checkpoint at the end of every epoch (later epochs overwrite the same files).
    embedding_weights = model.input_embedding()
    np.save('embedding-{}'.format(EMBEDDING_SIZE), embedding_weights)
    torch.save(model.state_dict(), 'embedding-{}.th'.format(EMBEDDING_SIZE))