# 4章コードの補足

In [51]:
import sys
sys.path.append('..')
import numpy as np
import collections
from common.trainer import Trainer
from common.optimizer import Adam
from common.layers import Embedding, SigmoidWithLoss
from common.util import create_contexts_target
from cbow import CBOW
from negative_sampling_layer import EmbeddingDot, UnigramSampler, NegativeSamplingLoss
from dataset import ptb

# negative_sampling_layer.py

## EmbeddingDot
- CBOWモデルの多値分類を二値分類で近似するときに，中間層->出力層の処理を担うレイヤ．
- 正解単語のEmbedding，その単語ベクトルと中間層の値の内積を実行する．

### 初期化 init
- 引数として重みWを受け取る

In [3]:
W_out = np.random.rand(10, 3)  # 出力層側の重み. 語彙数10, word_vec_size=3の想定
embed = Embedding(W_out)  # Embeddingレイヤを生成
grads = embed.grads  # Embeddingレイヤの勾配を保持
cache = None  # backwardで使う値をfoward時に保持する変数

### forward
- 引数の h は中間層のニューロン，idx は正解単語IDの配列

In [4]:
h = np.random.rand(5, 3)  # 中間層のニューロン. batch_size=5, word_vec_size=3 の想定．
idx = np.array([0, 1, 2, 0, 5])  # 正解の単語ID
print(f'中間層 h: \n {h}')
print(f'正解単語ID idx: \n {idx}')

中間層 h: 
 [[0.83411958 0.80611499 0.08915327]
 [0.10749059 0.09967435 0.74493453]
 [0.15294117 0.97012438 0.59760582]
 [0.1137854  0.01545975 0.06701543]
 [0.18291014 0.34758461 0.28766537]]
正解単語ID idx: 
 [0 1 2 0 5]


In [5]:
target_W_out = embed.forward(idx)  # 正解単語の重みのみを抜き出す
print(f'W_out: \n {W_out}')
print(f'target_W_out: \n {target_W_out}')

W_out: 
 [[0.99611508 0.03277631 0.7585744 ]
 [0.98676654 0.60130009 0.09334825]
 [0.9419258  0.17370758 0.84883158]
 [0.51508839 0.79259116 0.11031159]
 [0.5756207  0.97809488 0.29981932]
 [0.34321374 0.82751926 0.78379805]
 [0.17074958 0.17908684 0.59317549]
 [0.45425244 0.51411358 0.31710341]
 [0.43557202 0.50100548 0.08788333]
 [0.81053023 0.71991485 0.88399548]]
target_W_out: 
 [[0.99611508 0.03277631 0.7585744 ]
 [0.98676654 0.60130009 0.09334825]
 [0.9419258  0.17370758 0.84883158]
 [0.99611508 0.03277631 0.7585744 ]
 [0.34321374 0.82751926 0.78379805]]


In [6]:
out = np.sum(target_W_out * h, axis=1)  # 正解単語の重みと中間層の内積計算
print(f'out: \n {out}')

out: 
 [0.92492996 0.23554065 0.81984389 0.16468626 0.57588179]


In [7]:
cache = (h, target_W_out)  # backward用

### backward
- 勾配 dout を受け取る

In [8]:
dout = np.random.rand(*out.shape)
print(f'dout: \n {dout}')

dout: 
 [0.65575287 0.60481655 0.28277463 0.68526189 0.59587463]


In [9]:
h, target_W_out = cache

In [10]:
dout = dout.reshape(dout.shape[0], 1)  # 二次元に変換
print(f'reshaped dout: \n {dout}')

reshaped dout: 
 [[0.65575287]
 [0.60481655]
 [0.28277463]
 [0.68526189]
 [0.59587463]]


In [11]:
dtarget_W_out = dout * h  # 内積の逆伝播
print(f'dtarget_W_out: \n {dtarget_W_out}')

dtarget_W_out: 
 [[0.54697631 0.52861222 0.05846251]
 [0.06501209 0.0602847  0.45054873]
 [0.04324788 0.27432656 0.16898776]
 [0.0779728  0.01059398 0.04592312]
 [0.10899152 0.20711685 0.1714125 ]]


In [12]:
print(f'grads: \n {grads}', end='\n\n')
embed.backward(dtarget_W_out)  # Embeddingレイヤの逆伝播．勾配を更新．
print(f'updated grads: \n {grads}')

grads: 
 [array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])]

updated grads: 
 [array([[0.62494911, 0.53920619, 0.10438564],
       [0.06501209, 0.0602847 , 0.45054873],
       [0.04324788, 0.27432656, 0.16898776],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.10899152, 0.20711685, 0.1714125 ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ]])]


***
## UnigramSampler
- CBOWモデルの高速化の後半部分．
- 負例をランダムに抽出して学習させる際の選び方．
- コーパス中の単語の出現確率に従ってサンプリングする．

### 初期化 init

In [13]:
corpus = np.array([0, 1, 2, 3, 4, 5, 1, 0, 3, 4, 5, 0, 6])  # コーパスは単語IDのリスト
power = 0.75  # 確率に1未満で累乗し，低頻度の単語に下駄をはかせる
sample_size = 3  # サンプリングする数

# Counterでコーパス中の単語の出現回数をカウントできる
counts = collections.Counter()
for word_id in corpus:
    counts[word_id] += 1
print(counts)
print(counts[0])

Counter({0: 3, 1: 2, 3: 2, 4: 2, 5: 2, 2: 1, 6: 1})
3


In [14]:
vocab_size = len(counts)  # 語彙数 = countsの長さ

p = np.zeros(vocab_size)  # 語彙数と同じ要素数の配列で確率を保持する

# 各単語IDの出現回数を格納
for i in range(vocab_size):
    p[i] = counts[i]

# 出現回数を0.75乗して稀な単語の確率に少し下駄をはかせる
word_p = np.power(p, power)
print(f'original p: {p}')
print(f'powerd word_p: {word_p}', end='\n\n')

# np.sum(p) = 単語数 で割って確率にする
p /= np.sum(p)
word_p /= np.sum(word_p)
print(f'p_out: {p}')
print(f'word_p_out: {word_p}')

original p: [3. 2. 1. 2. 2. 2. 1.]
powerd word_p: [2.27950706 1.68179283 1.         1.68179283 1.68179283 1.68179283
 1.        ]

p_out: [0.23076923 0.15384615 0.07692308 0.15384615 0.15384615 0.15384615
 0.07692308]
word_p_out: [0.20710218 0.15279749 0.09085393 0.15279749 0.15279749 0.15279749
 0.09085393]


### get_negative_sample

In [15]:
# コンテキストとターゲットを作る
window_size = 1
contexts, target = create_contexts_target(corpus, window_size)

In [16]:
batch_size = target.shape[0]

In [17]:
negative_sample = np.zeros((batch_size, sample_size), dtype=np.int32)
print(negative_sample)

[[0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]


In [18]:
for i in range(batch_size):
    p = word_p.copy()  # 確率を取得
    target_idx = target[i]  # ターゲットを保持
    p[target_idx] = 0  # ターゲットの確率は0
    p /= p.sum()  # ターゲットを除いて確率を再計算
    negative_sample[i, :] = np.random.choice(vocab_size, size=sample_size, replace=True, p=word_p)

In [19]:
print(negative_sample)

[[0 0 0]
 [5 3 0]
 [0 6 1]
 [3 6 0]
 [0 3 0]
 [1 5 5]
 [3 1 4]
 [0 3 3]
 [3 0 1]
 [4 5 1]
 [3 0 4]]


***
## NegativeSamplingLoss
- EmbeddingDotレイヤ，UnigramSamplerレイヤ，SigmoidWithLossレイヤの組み合わせ
- forwardでは中間層のニューロンとターゲットから損失関数を計算する

### 初期化 init
- EmbeddingDotレイヤに使う重みW，UnigramSamplerで確率計算に使うcorpus，指数powerとsample_sizeを引数として受け取る

In [20]:
print('=== 初期化で使う引数 ===')
print(f'W_out: \n {W_out}', end='\n\n')  # 出力側の単語ベクトルとなる重み．語彙数 x word_vec_size
print(f'corpus: {corpus}', end='\n\n')  # corpusは単語IDの配列
power = 0.75
print(f'power: {power}', end='\n\n')
sample_size = 3
print(f'sample_size: {sample_size}', end='\n\n')

print('=========================')
print(f'contexts: \n {contexts}', end='\n\n')  # contextsは単語IDの二次元配列
print(f'target: \n {target}', end='\n\n')  # targetは単語IDの配列
batch_size = target.shape[0]
h = np.random.rand(batch_size, 3)   # 中間層のニューロン
print(f'h: \n {h}')

=== 初期化で使う引数 ===
W_out: 
 [[0.99611508 0.03277631 0.7585744 ]
 [0.98676654 0.60130009 0.09334825]
 [0.9419258  0.17370758 0.84883158]
 [0.51508839 0.79259116 0.11031159]
 [0.5756207  0.97809488 0.29981932]
 [0.34321374 0.82751926 0.78379805]
 [0.17074958 0.17908684 0.59317549]
 [0.45425244 0.51411358 0.31710341]
 [0.43557202 0.50100548 0.08788333]
 [0.81053023 0.71991485 0.88399548]]

corpus: [0 1 2 3 4 5 1 0 3 4 5 0 6]

power: 0.75

sample_size: 3

contexts: 
 [[0 2]
 [1 3]
 [2 4]
 [3 5]
 [4 1]
 [5 0]
 [1 3]
 [0 4]
 [3 5]
 [4 0]
 [5 6]]

target: 
 [1 2 3 4 5 1 0 3 4 5 0]

h: 
 [[0.78355813 0.5455146  0.36335794]
 [0.57311126 0.59320848 0.45831957]
 [0.97060573 0.48938365 0.3253681 ]
 [0.14599264 0.17278998 0.63636172]
 [0.0391553  0.84676493 0.80237627]
 [0.93328339 0.7330196  0.04026047]
 [0.48768335 0.69470129 0.32374179]
 [0.04389739 0.68299985 0.71384733]
 [0.53738338 0.27286231 0.87685294]
 [0.99973435 0.8017193  0.88656996]
 [0.70103432 0.117336   0.70193148]]


In [21]:
sampler = UnigramSampler(corpus, power, sample_size)  # UnigramSampler初期化
loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要
embed_dot_layers = [EmbeddingDot(W_out) for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要

In [22]:
# embed_dot_layersのパラメータと勾配をリストにまとめる
params, grads = [], []
for layer in embed_dot_layers:
    params.append(layer.params)
    grads.append(layer.grads)

In [23]:
print('重みと勾配はEmbeddingDotレイヤの数 = 負例 + 1(正例)', end='\n\n')
print(f'length of params: {len(params)}')
print(f'length of grads : {len(grads)}')

重みと勾配はEmbeddingDotレイヤの数 = 負例 + 1(正例)

length of params: 4
length of grads : 4


### forward

In [24]:
batch_size = target.shape[0]
print(f'target: \n {target}')
print(f'batch_size: \n {batch_size}')

target: 
 [1 2 3 4 5 1 0 3 4 5 0]
batch_size: 
 11


In [25]:
# 負例(targetではない単語)をコーパス中の確率に応じてサンプリングする
negative_sample = sampler.get_negative_sample(target)
print(f'negative_sample: \n {negative_sample}')

negative_sample: 
 [[3 4 5]
 [1 3 0]
 [2 1 4]
 [5 2 0]
 [0 3 4]
 [0 3 2]
 [5 6 4]
 [2 6 0]
 [2 5 1]
 [3 6 0]
 [2 6 3]]


In [26]:
# 正例のフォワード
# embed_dot_layersとloss_layersの0番目を正例のレイヤとしている
score = embed_dot_layers[0].forward(h, target)  # EmbeddingDotレイヤのforward
correct_label = np.ones(batch_size, dtype=np.int32)  # 正例は正解ラベルとして1を渡す
loss = loss_layers[0].forward(score, correct_label)  # SigmoidWithLossレイヤ

In [27]:
# 負例のフォワード
# embed_dot_layersとloss_layersの1番目以降を負例のレイヤとしている
negative_label = np.zeros(batch_size, dtype=np.int32)  # 負例は正解ラベルとして0を渡す
for i in range(sample_size):
    negative_target = negative_sample[:, i]  # サンプルサイズの回数ループ．バッチ処理．
    score = embed_dot_layers[i+1].forward(h, negative_target)  # EmbeddingDotレイヤのforward
    loss += loss_layers[i+1].forward(score, negative_label)  # SigmoidWithLossレイヤのforward

In [28]:
print(f'loss: \n {loss}')

loss: 
 4.129987417790073


### backward

In [29]:
dout = 1

In [30]:
# dhを初期化
dh = 0

# SigmoidWithLossレイヤ->EmbeddingDotレイヤの順にbackward
for l0, l1 in zip(loss_layers, embed_dot_layers):
    dscore = l0.backward(dout)
    dh += l1.backward(dscore)  # hの順伝播はリピートノードなので，逆伝播では足し合わせる
    
print(f'dh: \n {dh}')
print(f'h : \n {h}')

dh: 
 [[ 0.07328004  0.15925165  0.07759551]
 [ 0.14001219  0.08753607  0.04238783]
 [ 0.16345309  0.10194805  0.08473833]
 [ 0.11686795  0.02790531  0.13357955]
 [ 0.12483508  0.10237117  0.05823222]
 [ 0.14578711  0.05632051  0.11295732]
 [ 0.04194403  0.12972226  0.08248242]
 [ 0.11079692 -0.00178703  0.12810253]
 [ 0.13630181  0.07941869  0.11162662]
 [ 0.1189172   0.05826809  0.09272676]
 [ 0.08590326  0.06740834  0.08539176]]
h : 
 [[0.78355813 0.5455146  0.36335794]
 [0.57311126 0.59320848 0.45831957]
 [0.97060573 0.48938365 0.3253681 ]
 [0.14599264 0.17278998 0.63636172]
 [0.0391553  0.84676493 0.80237627]
 [0.93328339 0.7330196  0.04026047]
 [0.48768335 0.69470129 0.32374179]
 [0.04389739 0.68299985 0.71384733]
 [0.53738338 0.27286231 0.87685294]
 [0.99973435 0.8017193  0.88656996]
 [0.70103432 0.117336   0.70193148]]


***
# cbow.py
### CBOW
- 改良版CBOWモデル．
- EmbeddingレイヤとNegativeSamplingLossレイヤを使う．

### 初期化 init

In [31]:
hidden_size = 3  # embeddingした単語ベクトルの次元
window_size = 2  # targetの両側いくつか？

print('===引数===')
print(f'vocab_size: {vocab_size}')
print(f'hidden_size: {hidden_size}')
print(f'corpus: {corpus}')
print(f'window_size: {window_size}')

===引数===
vocab_size: 7
hidden_size: 3
corpus: [0 1 2 3 4 5 1 0 3 4 5 0 6]
window_size: 2


In [32]:
# 重みの初期化
V, H = vocab_size, hidden_size
W_in = 0.01 * np.random.randn(V, H).astype('f')
W_out = 0.01 * np.random.randn(V, H).astype('f')

In [33]:
# レイヤの生成
in_layers = []
for i in range(window_size*2):
    layer = Embedding(W_in)
    in_layers.append(layer)
ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=2)

In [34]:
# 重みと勾配をリストで管理する
layers = in_layers + [ns_loss]
params, grads = [], []
for layer in layers:
    params.append(layer.params)
    grads.append(layer.grads)

In [35]:
# インスタンス変数に単語の分散表現を設定
word_vecs = W_in

### forward

In [36]:
contexts, targets = create_contexts_target(corpus, window_size)
print('contexts')
print(contexts)
print('targets')
print(targets)

contexts
[[0 1 3 4]
 [1 2 4 5]
 [2 3 5 1]
 [3 4 1 0]
 [4 5 0 3]
 [5 1 3 4]
 [1 0 4 5]
 [0 3 5 0]
 [3 4 0 6]]
targets
[2 3 4 5 1 0 3 4 5]


In [37]:
h = 0
for i, layer in enumerate(in_layers):
    h += layer.forward(contexts[:, i])  # Embeddingレイヤの順伝播
h *= 1 / len(in_layers)  # コンテキスト全体の平均を中間層のニューロンとする
loss = ns_loss.forward(h, targets)
print(loss)

2.0794656011793347


### backward

In [38]:
print('========= before backward =========')
print('grad of ns_loss')
print(grads[-1])

print('grad of in_layer_0')
print(grads[0])

dout = 1
dout = ns_loss.backward(dout)
dout *= 1 / len(in_layers)  # コンテキスト一つ分の損失にする
for layer in in_layers:
    layer.backward(dout)
    
print('========= after backward =========')
print('grad of ns_loss')
print(grads[-1])

print('grad of in_layer_0')
print(grads[0])

grad of ns_loss
[array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32), array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32), array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)]
grad of in_layer_0
[array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)]
grad of ns_loss
[array([[ 3.9458770e-04, -2.0558155e-05, -1.4033174e-04],
       [ 3.7436336e-04, -3.0306150e-05,  1.3142229e-04],
       [ 1.8102820e-04, -5.7447000e-05, -1.7221703e-04],
       [ 3.0923515e-04,  1.4187662e-04, -4.2570848e-04],
       [ 2.4472128e-04,  5.3241127e-04,  7.4741989e-04],
     

***
# train.py
- 改良版CBOWモデルの学習コード

In [64]:
# ハイパーパラメータ設定
window_size = 1
hidden_size = 10
batch_size = 10
max_epoch = 5

In [65]:
# データ読み込み
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)

In [66]:
# モデル生成
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

In [74]:
print(type(model))
print(f'num of in_layers: {len(model.in_layers)}')  # window_size=2なのでin_layerは2つ
print(f'num of params of a in_layer: {len(model.in_layers[0].params)}')  # Embeddingレイヤは重み1つ
# NegativeSamplingLossレイヤのEmbeddingDotレイヤに重みが1つ
# sample_sizeのdefault値5+1(正例分)
print(f'num of params of ns_loss: {len(model.ns_loss.params)}')
print(f'num of params: {len(model.params)}')  # 計8つのパラメータ

<class 'cbow.CBOW'>
num of in_layers: 2
num of params of a in_layer: 1
num of params of ns_loss: 6
num of params: 8
