In [10]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

In [11]:
#准备预料
corpus = ['he is a king',
          'she is a queen',
          'he is a man',
          'she is a woman',
          'warsaw is poland capital',
          'berlin is germany capital',
          'paris is france capital']
corpus_list = [sentence.split() for sentence in corpus]

In [22]:
print(corpus_list
    )

[['he', 'is', 'a', 'king'], ['she', 'is', 'a', 'queen'], ['he', 'is', 'a', 'man'], ['she', 'is', 'a', 'woman'], ['warsaw', 'is', 'poland', 'capital'], ['berlin', 'is', 'germany', 'capital'], ['paris', 'is', 'france', 'capital']]


In [23]:
#构建词典
word2ix = {}
for sentence in corpus:
    for word in sentence.split():
        if word not in word2ix:
            word2ix[word] = len(word2ix) #为每个词匹配一个索引index
print(word2ix)
        
ix2word = {v:k for k,v in word2ix.items()}#将dict中的key与value互换位置
voc_size = len(word2ix)    


{'he': 0, 'is': 1, 'a': 2, 'king': 3, 'she': 4, 'queen': 5, 'man': 6, 'woman': 7, 'warsaw': 8, 'poland': 9, 'capital': 10, 'berlin': 11, 'germany': 12, 'paris': 13, 'france': 14}


In [31]:
#构建训练对
WINDOWS = 2 # 取左右窗口的词作为context_word
pairs = [] # 存放训练对

for sentence in corpus_list:
    for center_word_index in range(len(sentence)):
        center_word_ix = word2ix[sentence[center_word_index]]
        print(sentence[center_word_index])
        print('center_word_index:',center_word_index)
        print('center_word_ix:',center_word_ix)
        for win in range(-WINDOWS,WINDOWS+1):
            contenx_word_index = center_word_index + win
            if 0<=contenx_word_index<=len(sentence)-1 and contenx_word_index != center_word_index:
                contenx_word_ix = word2ix[sentence[contenx_word_index]]
                pairs.append((center_word_ix,contenx_word_ix))
print(pairs)

he
center_word_index: 0
center_word_ix: 0
is
center_word_index: 1
center_word_ix: 1
a
center_word_index: 2
center_word_ix: 2
king
center_word_index: 3
center_word_ix: 3
she
center_word_index: 0
center_word_ix: 4
is
center_word_index: 1
center_word_ix: 1
a
center_word_index: 2
center_word_ix: 2
queen
center_word_index: 3
center_word_ix: 5
he
center_word_index: 0
center_word_ix: 0
is
center_word_index: 1
center_word_ix: 1
a
center_word_index: 2
center_word_ix: 2
man
center_word_index: 3
center_word_ix: 6
she
center_word_index: 0
center_word_ix: 4
is
center_word_index: 1
center_word_ix: 1
a
center_word_index: 2
center_word_ix: 2
woman
center_word_index: 3
center_word_ix: 7
warsaw
center_word_index: 0
center_word_ix: 8
is
center_word_index: 1
center_word_ix: 1
poland
center_word_index: 2
center_word_ix: 9
capital
center_word_index: 3
center_word_ix: 10
berlin
center_word_index: 0
center_word_ix: 11
is
center_word_index: 1
center_word_ix: 1
germany
center_word_index: 2
center_word_ix: 12
ca

In [17]:
class SkipGram(nn.Module):
    def __init__(self, voc_dim, emb_dim):
        super(SkipGram, self).__init__()
        # 初始化参数
        self.embedding_matrix = nn.Parameter(torch.FloatTensor(emb_dim, voc_dim))
        self.W = nn.Parameter(torch.FloatTensor(voc_dim, emb_dim))
        torch.nn.init.xavier_normal(self.embedding_matrix)
        torch.nn.init.xavier_normal(self.W)

    def forward(self, x):
        emb = torch.matmul(self.embedding_matrix, x)
        h = torch.matmul(self.W, emb)  # [voc_dim]
        log_softmax = F. log_softmax(h)  # [voc_dim]

        return log_softmax


In [18]:
# 提前设置超参数
epoch = 10
lr = 1e-2
embedding_dim = 5

# 模型、优化器、损失
model = SkipGram(voc_size, embedding_dim)
optim = torch.optim.Adam(model.parameters(), lr=lr)
loss_f = torch.nn.NLLLoss()  

# 这是将索引变成词典大小的One-Hot向量的方法
def get_onehot_vector(ix):
    one_hot_vec = torch.zeros(voc_size).float()
    one_hot_vec[ix] = 1.0
    return one_hot_vec

# 迭代
for e in range(epoch):
    epoch_loss = 0

    for i, (center_ix, context_ix) in enumerate(pairs):
        optim.zero_grad()

        # 预处理好数据结构
        one_hot_vec = get_onehot_vector(center_ix)
        y_true = torch.Tensor([context_ix]).long()

        # 前向
        y_pred = model(one_hot_vec)
        loss = loss_f(y_pred.view(1, -1), y_true)

        # 后向
        loss.backward()
        epoch_loss += loss.data.item()

        # 梯度更新
        optim.step()

    if e % 2 == 0:
        print('epoch: %d, loss: %f' % (e, epoch_loss))
  


  import sys
  
  del sys.path[0]


epoch: 0, loss: 193.154262
epoch: 2, loss: 178.086491
epoch: 4, loss: 156.701561
epoch: 6, loss: 141.661043
epoch: 8, loss: 133.716635


In [42]:
# # 3.预测：预测单词的向量并计算相似度
v1 = torch.matmul(model.embedding_matrix, get_onehot_vector((word2ix['he'])))
v2 = torch.matmul(model.embedding_matrix, get_onehot_vector((word2ix['she'])))
v3 = torch.matmul(model.embedding_matrix, get_onehot_vector((word2ix['capital'])))

print(v1)
print(v2)
print(v3)

s_v1_v2 = F.cosine_similarity(v1, v2, dim=0)
s_v1_v3 = F.cosine_similarity(v1, v3, dim=0)
print(s_v1_v2)
print(s_v1_v3)
for i in word2ix:
    v = torch.matmul(model.embedding_matrix, get_onehot_vector((word2ix['he'])))
    print('词：%f,向量：%f'.format( word2ix[i],v))

tensor([-1.5483,  0.7743, -0.1095,  1.4099, -0.0963], grad_fn=<MvBackward>)
tensor([-0.8205,  0.5341,  0.0617,  1.7159, -0.2900], grad_fn=<MvBackward>)
tensor([ 0.1909, -0.1515,  1.2479,  0.3793,  1.1203], grad_fn=<MvBackward>)
tensor(0.9228, grad_fn=<DivBackward0>)
tensor(-0.0316, grad_fn=<DivBackward0>)
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
词：%f,向量：%f
