In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optimizer
import torch.utils.data as Data

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.FloatTensor

In [4]:
sentences = ["jack like dog", "jack like cat", "jack like animal",
  "dog cat animal", "banana apple cat dog like", "dog fish milk like",
  "dog cat animal like", "jack like apple", "apple like", "jack like banana",
  "apple banana jack movie book music like", "cat dog hate", "cat dog like"]

sentences_list = " ".join(sentences).split()  # ['jack', 'like', 'dog']
vocab = list(set(sentences_list))  #  构建词汇表
word2idx = {w: i for i, w in enumerate(vocab)}
vocab_size = len(vocab)

In [5]:
# model parameter
window_size = 2
batch_size = 8
m = 2  #  每个词用2维表示

In [6]:
skip_grams = []
for idx in range(window_size, len(sentences_list) - window_size):
    center = word2idx[sentences_list[idx]]
    word_background_list = sentences_list[idx - window_size: idx] + sentences_list[idx + 1: idx + window_size + 1]
    context = [word2idx[background_word] for background_word in word_background_list]

    for w in context:
        skip_grams.append([center, w])

In [8]:
def make_data(skip_grams):
    input_data = []
    output_data = []
    for a, b in skip_grams:
        input_data.append(np.eye(vocab_size)[a])
        output_data.append(b)
    return input_data, output_data

In [16]:
input_data, output_data = make_data(skip_grams)
input_data, output_data = torch.Tensor(input_data), torch.LongTensor(output_data)
dataset = Data.TensorDataset(input_data, output_data)
loader = Data.DataLoader(dataset, batch_size, True)

In [17]:
class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.W = nn.Parameter(torch.randn(vocab_size, m).type(dtype))
        self.V = nn.Parameter(torch.randn(m, vocab_size).type(dtype))

    def forward(self, X):
        #  X : [batch_size, vocab_size]
        hidden = torch.mm(X, self.W) #  [batch_size, m]
        output = torch.mm(hidden, self.V) #  [batch_size, vocab_size]
        return output

In [18]:
model = Word2Vec().to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optim = optimizer.Adam(model.parameters(), lr=1e-3)

In [19]:
for epoch in range(2000):
    for i, (batch_x, batch_y) in enumerate(loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        pred = model(batch_x)
        loss = loss_fn(pred, batch_y)

        if (epoch + 1) % 1000 == 0:
            print(epoch + 1, i, loss.item())
        
        optim.zero_grad()
        loss.backward()
        optim.step()   


1000 0 1.9726399183273315
1000 1 2.087526798248291
1000 2 2.381875514984131
1000 3 2.397819757461548
1000 4 1.9717426300048828
1000 5 2.1237902641296387
1000 6 1.9658904075622559
1000 7 2.118483543395996
1000 8 2.0691521167755127
1000 9 1.875338077545166
1000 10 2.274322271347046
1000 11 2.147645950317383
1000 12 2.271571397781372
1000 13 1.7344664335250854
1000 14 2.119657039642334
1000 15 1.4634943008422852
1000 16 2.2701416015625
1000 17 2.069187641143799
1000 18 2.525500774383545
1000 19 1.65653657913208
1000 20 2.248680830001831
2000 0 1.8978071212768555
2000 1 2.14563250541687
2000 2 2.042457342147827
2000 3 1.661184549331665
2000 4 2.083333730697632
2000 5 2.1225814819335938
2000 6 1.989145278930664
2000 7 2.045304298400879
2000 8 1.9921973943710327
2000 9 2.3354737758636475
2000 10 2.1458075046539307
2000 11 2.134782552719116
2000 12 2.226255178451538
2000 13 2.664048433303833
2000 14 2.0355443954467773
2000 15 2.0177648067474365
2000 16 2.258944034576416
2000 17 1.787781834602