word2vec改进①

Embedding层
Embedding层是从权重参数中抽取单词ID所对应的行（向量）的层；该层的权重中存放的就是词嵌入（单词的分布式表示）。

In [1]:
# Extract a single row and several rows from a 2-D array.
import numpy as np

W = np.arange(21).reshape((7, 3))  # 7x3 matrix holding the integers 0..20
print(W)

# A scalar index selects exactly one row.
print(W[2])

# An index array gathers several rows at once; a repeated index repeats the row.
idx = np.array([1, 0, 3, 0])
print(W[idx])

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]
 [15 16 17]
 [18 19 20]]
[6 7 8]
[[ 3  4  5]
 [ 0  1  2]
 [ 9 10 11]
 [ 0  1  2]]


In [2]:
# Random sampling with NumPy's np.random.choice.
import numpy as np

# Pick one integer at random from 0..9.
print(np.random.choice(10))

# Pick one element at random from the `words` list.
words = ['you', 'say', 'goodbye', 'I', 'hello', '.']
print(np.random.choice(words))

# Draw 5 samples with replacement (the same word may appear more than once).
print(np.random.choice(words, 5))

# Draw 5 samples without replacement (every drawn word is distinct).
print(np.random.choice(words, 5, replace=False))

# Draw one sample according to an explicit probability distribution;
# p[i] is the probability of drawing words[i].
p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]
print(np.random.choice(words, p=p))

6
hello
['.' 'say' 'you' 'goodbye' 'goodbye']
['I' '.' 'goodbye' 'say' 'you']
you


In [3]:
#实现Embedding层的forward
class Embedding:
    """Layer that extracts the rows of a weight matrix selected by index.

    Attributes:
        params: One-element list holding the weight matrix ``W``.
        grads: One-element list holding the gradient buffer for ``W``
            (same shape and dtype as ``W``).
        idx: Indices passed to the most recent ``forward`` call, kept so
            that ``backward`` knows which rows received gradient.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None  # indices of the rows extracted in forward()

    def forward(self, idx):
        """Return ``W[idx]`` and remember ``idx`` for the backward pass.

        The forward pass only picks rows out of the weight matrix and hands
        them to the next layer unchanged.
        """
        W, = self.params
        self.idx = idx
        return W[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the gradient rows selected in ``forward``.

        Args:
            dout: Upstream gradient with the same shape as the array
                returned by ``forward``.

        Returns:
            None. There is nothing upstream of an embedding lookup to
            propagate to.
        """
        dW, = self.grads
        dW[...] = 0  # reset in place so references held via self.grads stay valid
        # np.add.at performs unbuffered in-place addition, so when an index
        # occurs several times every matching slice of dout is summed into
        # that row. A plain ``dW[self.idx] += dout`` (or ``= dout``) would keep
        # only one contribution per duplicated index.
        np.add.at(dW, self.idx, dout)
        return None

word2vec改进②

用负采样（Negative Sampling）替代Softmax：把多分类问题近似为二分类问题，只对正例和少量采样得到的负例计算损失，使中间层之后的计算量不随词汇量增长，始终保持在较低且恒定的水平

In [4]:
#对正例的学习
class EmbeddingDot:
    """Embedding lookup followed by a row-wise dot product with ``h``.

    Used to score target words: for each sample it gathers one row of ``W``
    and takes its inner product with the matching row of ``h``.
    """

    def __init__(self, W):
        self.embed = Embedding(W)  # the wrapped Embedding layer
        # Share the Embedding layer's lists so callers see the same objects.
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None  # (h, target_W) saved by forward() for backward()

    def forward(self, h, idx):
        """Return one score per sample.

        Args:
            h: Hidden-layer activations, multiplied element-wise with the
                gathered rows and summed over axis 1.
            idx: Row indices into ``W``, one per sample in the mini-batch.
        """
        target_W = self.embed.forward(idx)
        scores = (target_W * h).sum(axis=1)
        self.cache = (h, target_W)
        return scores

    def backward(self, dout):
        """Propagate the score gradient; returns the gradient w.r.t. ``h``."""
        h, target_W = self.cache
        # Turn dout into a column so it broadcasts across each row.
        n_rows = dout.shape[0]
        dout = dout.reshape(n_rows, 1)
        self.embed.backward(dout * h)  # gradient for the gathered rows of W
        return dout * target_W

In [5]:
from negative_sampling_layer import UnigramSampler

In [13]:
#对负例的学习——负采样
class NegativeSamplingLoss:
    """Loss layer for negative sampling: one positive plus sampled negatives.

    Holds ``sample_size + 1`` pairs of (EmbeddingDot, SigmoidWithLoss)
    layers. Pair 0 handles the positive example, the remaining pairs handle
    one negative sample each. All EmbeddingDot layers are built from the
    same ``W``.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        # NOTE(review): corpus and power are only forwarded to UnigramSampler;
        # their exact meaning is defined there.
        self.sampler = UnigramSampler(corpus, power, sample_size)

        # One layer for the positive example and sample_size for the negatives.
        n_layers = sample_size + 1
        self.loss_layers = [SigmoidWithLoss() for _ in range(n_layers)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(n_layers)]

        # Collect the parameters and gradients of every EmbeddingDot layer.
        self.params, self.grads = [], []
        for embed_dot in self.embed_dot_layers:
            self.params.extend(embed_dot.params)
            self.grads.extend(embed_dot.grads)

    def forward(self, h, target):
        """Return the summed loss of the positive and the negative examples.

        Args:
            h: Hidden-layer activations.
            target: Array of positive target word IDs; its first dimension
                is taken as the batch size.
        """
        batch_size = target.shape[0]
        # Column i of negative_sample is used as the i-th negative target.
        negative_sample = self.sampler.get_negative_sample(target)

        # Positive example: label 1 for every sample in the batch.
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # Negative examples: label 0 for every sample in the batch.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            layer_no = i + 1  # layer 0 is reserved for the positive example
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[layer_no].forward(h, negative_target)
            # Total loss = positive loss + the loss of every negative sample.
            loss += self.loss_layers[layer_no].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        """Return the gradient w.r.t. ``h``, summed over all layer pairs."""
        dh = 0
        for loss_layer, embed_dot in zip(self.loss_layers, self.embed_dot_layers):
            dscore = loss_layer.backward(dout)
            # h fans out to every pair in forward(), so its gradients add up.
            dh += embed_dot.backward(dscore)
        return dh

改进版word2vec的学习

In [14]:
class CBOW:
    """CBOW model built from Embedding layers and a negative-sampling loss.

    In this improved version the output-side weight has the same shape as
    the input-side weight: both are (vocab_size, hidden_size).
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        # Initialise the weights with small random float32 values.
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # One Embedding layer per context position (2 * window_size in total),
        # all built from the same W_in.
        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        # Negative-sampling loss layer on the output side.
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # Gather every layer's weights and gradients into flat lists.
        self.params, self.grads = [], []
        for layer in [*self.in_layers, self.ns_loss]:
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

        # Expose the distributed word representations as a member variable.
        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Return the loss for a mini-batch.

        Column ``i`` of ``contexts`` is fed to the i-th input layer; the
        results are averaged to form the hidden layer.
        """
        h = 0
        for position, layer in enumerate(self.in_layers):
            h += layer.forward(contexts[:, position])
        h *= 1 / len(self.in_layers)  # average over the context positions
        return self.ns_loss.forward(h, target)

    def backward(self, dout=1):
        """Back-propagate through the loss layer and every input layer."""
        dh = self.ns_loss.backward(dout)
        dh *= 1 / len(self.in_layers)  # gradient of the averaging in forward()
        for layer in self.in_layers:
            layer.backward(dh)
        return None

CBOW模型的学习代码

In [15]:
import numpy as np
from common import config
import pickle
from common.trainer import Trainer
from common.optimizer import Adam
from common.util import create_contexts_target,to_cpu,to_gpu
from dataset import ptb

# Hyper-parameters.
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the PTB training split.
# NOTE(review): presumably the corpus plus word<->id lookup tables, judging
# by the names it is unpacked into; verify against dataset.ptb.
ptb_train = ptb.load_data('train')
corpus, word_to_id, id_to_word = ptb_train


In [16]:
from common.layers import SigmoidWithLoss

vocab_size = len(word_to_id)

# Build the (contexts, target) training pairs from the corpus.
contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts = to_gpu(contexts)
    target = to_gpu(target)

# Build the model, optimizer and trainer.
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Train, then plot the trainer's results.
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save what later cells need: the word vectors and both vocabulary tables.
word_vecs = to_cpu(model.word_vecs) if config.GPU else model.word_vecs
params = {
    'word_vecs': word_vecs.astype(np.float16),  # stored as float16
    'word_to_id': word_to_id,
    'id_to_word': id_to_word,
}
pkl_file = 'cbow_params.pkl'  # pickle serialises Python objects to a file
with open(pkl_file, 'wb') as f:
    # A negative protocol number selects pickle.HIGHEST_PROTOCOL.
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)

| epoch 1 |  iter 1 / 9295 | time 0[s] | loss 4.16
| epoch 1 |  iter 21 / 9295 | time 0[s] | loss 4.16
| epoch 1 |  iter 41 / 9295 | time 1[s] | loss 4.15
| epoch 1 |  iter 61 / 9295 | time 2[s] | loss 4.12
| epoch 1 |  iter 81 / 9295 | time 3[s] | loss 4.05
| epoch 1 |  iter 101 / 9295 | time 4[s] | loss 3.92
| epoch 1 |  iter 121 / 9295 | time 5[s] | loss 3.77
| epoch 1 |  iter 141 / 9295 | time 6[s] | loss 3.63
| epoch 1 |  iter 161 / 9295 | time 7[s] | loss 3.48
| epoch 1 |  iter 181 / 9295 | time 7[s] | loss 3.36
| epoch 1 |  iter 201 / 9295 | time 8[s] | loss 3.23
| epoch 1 |  iter 221 / 9295 | time 9[s] | loss 3.15
| epoch 1 |  iter 241 / 9295 | time 10[s] | loss 3.10
| epoch 1 |  iter 261 / 9295 | time 11[s] | loss 3.02
| epoch 1 |  iter 281 / 9295 | time 12[s] | loss 2.97
| epoch 1 |  iter 301 / 9295 | time 13[s] | loss 2.91
| epoch 1 |  iter 321 / 9295 | time 13[s] | loss 2.87
| epoch 1 |  iter 341 / 9295 | time 14[s] | loss 2.85
| epoch 1 |  iter 361 / 9295 | time 15[s] | lo

KeyboardInterrupt: 

CBOW模型的评价

In [17]:
from common.util import most_similar
import pickle

pkl_file = 'cbow_params.pkl'

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(pkl_file, 'rb') as f:
    params = pickle.load(f)

# Unpack the saved word vectors and vocabulary tables.
word_vecs = params['word_vecs']
word_to_id = params['word_to_id']
id_to_word = params['id_to_word']

# Run most_similar with top=5 for each query word.
querys = ['my', 'simple', 'dark', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)



[query] my
 your: 0.52197265625
 someone: 0.4853515625
 her: 0.464599609375
 his: 0.454833984375
 anybody: 0.450927734375

[query] simple
 word: 0.4677734375
 comfort: 0.44775390625
 fish: 0.42333984375
 whole: 0.41796875
 comic: 0.41748046875

[query] dark
 deck: 0.5244140625
 script: 0.5244140625
 walls: 0.51513671875
 demonstrators: 0.495361328125
 language: 0.4951171875

[query] toyota
 ford: 0.55078125
 instrumentation: 0.509765625
 mazda: 0.49365234375
 bethlehem: 0.47509765625
 nissan: 0.474853515625
