# 4章コードの補足

In [175]:
import sys
sys.path.append('..')
import numpy as np
import collections
from common.layers import Embedding, SigmoidWithLoss
from common.util import create_contexts_target
from negative_sampling_layer import EmbeddingDot, UnigramSampler, NegativeSamplingLoss

# negative_sampling_layer.py

## EmbeddingDot
- CBOWモデルの多値分類を二値分類で近似するときに，中間層->出力層の処理を担うレイヤ．
- 正解単語のEmbedding，その単語ベクトルと中間層の値の内積を実行する．

### 初期化 init
- 引数として重みWを受け取る

In [176]:
W_out = np.random.rand(10, 3)  # 出力層側の重み. 語彙数10, word_vec_size=3の想定
embed = Embedding(W_out)  # Embeddingレイヤを生成
grads = embed.grads  # Embeddingレイヤの勾配を保持
cache = None  # backwardで使う値をfoward時に保持する変数

### forward
- 引数の h は中間層のニューロン，idx は正解単語IDの配列

In [177]:
h = np.random.rand(5, 3)  # 中間層のニューロン. batch_size=5, word_vec_size=3 の想定．
idx = np.array([0, 1, 2, 0, 5])  # 正解の単語ID
print(f'中間層 h: \n {h}')
print(f'正解単語ID idx: \n {idx}')

中間層 h: 
 [[0.80350185 0.60052725 0.05609662]
 [0.28917538 0.37431915 0.48310407]
 [0.70374215 0.34036285 0.4686944 ]
 [0.67850496 0.20096729 0.07690045]
 [0.56121228 0.84712113 0.66834159]]
正解単語ID idx: 
 [0 1 2 0 5]


In [178]:
target_W_out = embed.forward(idx)  # 正解単語の重みのみを抜き出す
print(f'W_out: \n {W_out}')
print(f'target_W_out: \n {target_W_out}')

W_out: 
 [[0.42033671 0.80675672 0.62130127]
 [0.80137914 0.64169277 0.5595805 ]
 [0.21548923 0.56463616 0.48042361]
 [0.29726869 0.96336842 0.20021747]
 [0.30847581 0.08188968 0.28270171]
 [0.21747332 0.45931022 0.29034645]
 [0.74353172 0.64695558 0.88687812]
 [0.79125092 0.65129791 0.23946311]
 [0.80483524 0.46378042 0.27779781]
 [0.19622793 0.01250582 0.83777769]]
target_W_out: 
 [[0.42033671 0.80675672 0.62130127]
 [0.80137914 0.64169277 0.5595805 ]
 [0.21548923 0.56463616 0.48042361]
 [0.42033671 0.80675672 0.62130127]
 [0.21747332 0.45931022 0.29034645]]


In [179]:
out = np.sum(target_W_out * h, axis=1)  # 正解単語の重みと中間層の内積計算
print(f'out: \n {out}')

out: 
 [0.85707363 0.74227263 0.56900188 0.4951106  0.7051907 ]


In [180]:
cache = (h, target_W_out)  # backward用

### backward
- 勾配 dout を受け取る

In [181]:
dout = np.random.rand(*out.shape)
print(f'dout: \n {dout}')

dout: 
 [0.49593873 0.1242032  0.36082813 0.26037497 0.45331597]


In [182]:
h, target_W_out = cache

In [183]:
dout = dout.reshape(dout.shape[0], 1)  # 二次元に変換
print(f'reshaped dout: \n {dout}')

reshaped dout: 
 [[0.49593873]
 [0.1242032 ]
 [0.36082813]
 [0.26037497]
 [0.45331597]]


In [184]:
dtarget_W_out = dout * h  # 内積の逆伝播
print(f'dtarget_W_out: \n {dtarget_W_out}')

dtarget_W_out: 
 [[0.39848769 0.29782472 0.02782049]
 [0.03591651 0.04649164 0.06000307]
 [0.25392997 0.12281249 0.16911812]
 [0.17666571 0.05232685 0.02002295]
 [0.25440649 0.38401354 0.30296992]]


In [185]:
print(f'grads: \n {grads}', end='\n\n')
embed.backward(dtarget_W_out)  # Embeddingレイヤの逆伝播．勾配を更新．
print(f'updated grads: \n {grads}')

grads: 
 [array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])]

updated grads: 
 [array([[0.57515339, 0.35015157, 0.04784344],
       [0.03591651, 0.04649164, 0.06000307],
       [0.25392997, 0.12281249, 0.16911812],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.25440649, 0.38401354, 0.30296992],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ]])]


***
## UnigramSampler
- CBOWモデルの高速化の後半部分．
- 負例をランダムに抽出して学習させる際の選び方．
- コーパス中の単語の出現確率に従ってサンプリングする．

### 初期化 init

In [186]:
corpus = np.array([0, 1, 2, 3, 4, 5, 1, 0, 3, 4, 5, 0, 6])  # コーパスは単語IDのリスト
power = 0.75  # 確率に1未満で累乗し，低頻度の単語に下駄をはかせる
sample_size = 3  # サンプリングする数

# Counterでコーパス中の単語の出現回数をカウントできる
counts = collections.Counter()
for word_id in corpus:
    counts[word_id] += 1
print(counts)
print(counts[0])

Counter({0: 3, 1: 2, 3: 2, 4: 2, 5: 2, 2: 1, 6: 1})
3


In [187]:
vocab_size = len(counts)  # 語彙数 = countsの長さ

p = np.zeros(vocab_size)  # 語彙数と同じ要素数の配列で確率を保持する

# 各単語IDの出現回数を格納
for i in range(vocab_size):
    p[i] = counts[i]

# 出現回数を0.75乗して稀な単語の確率に少し下駄をはかせる
word_p = np.power(p, power)
print(f'original p: {p}')
print(f'powerd word_p: {word_p}', end='\n\n')

# np.sum(p) = 単語数 で割って確率にする
p /= np.sum(p)
word_p /= np.sum(word_p)
print(f'p_out: {p}')
print(f'word_p_out: {word_p}')

original p: [3. 2. 1. 2. 2. 2. 1.]
powerd word_p: [2.27950706 1.68179283 1.         1.68179283 1.68179283 1.68179283
 1.        ]

p_out: [0.23076923 0.15384615 0.07692308 0.15384615 0.15384615 0.15384615
 0.07692308]
word_p_out: [0.20710218 0.15279749 0.09085393 0.15279749 0.15279749 0.15279749
 0.09085393]


### get_negative_sample

In [188]:
# コンテキストとターゲットを作る
window_size = 1
contexts, target = create_contexts_target(corpus, window_size)

In [189]:
batch_size = target.shape[0]

In [190]:
negative_sample = np.zeros((batch_size, sample_size), dtype=np.int32)
print(negative_sample)

[[0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]


In [191]:
for i in range(batch_size):
    p = word_p.copy()  # 確率を取得
    target_idx = target[i]  # ターゲットを保持
    p[target_idx] = 0  # ターゲットの確率は0
    p /= p.sum()  # ターゲットを除いて確率を再計算
    negative_sample[i, :] = np.random.choice(vocab_size, size=sample_size, replace=True, p=word_p)

In [192]:
print(negative_sample)

[[0 0 5]
 [4 4 1]
 [6 2 3]
 [0 1 1]
 [1 4 6]
 [0 5 3]
 [2 6 6]
 [2 1 0]
 [0 5 3]
 [1 4 5]
 [0 4 6]]


***
## NegativeSamplingLoss
- EmbeddingDotレイヤ，UnigramSamplerレイヤ，SigmoidWithLossレイヤの組み合わせ
- forwardでは中間層のニューロンとターゲットから損失関数を計算する

### 初期化 init
- EmbeddingDotレイヤに使う重みW，UnigramSamplerで確率計算に使うcorpus，指数powerとsample_sizeを引数として受け取る

In [193]:
print('=== 初期化で使う引数 ===')
print(f'W_out: \n {W_out}', end='\n\n')  # 出力側の単語ベクトルとなる重み．語彙数 x word_vec_size
print(f'corpus: {corpus}', end='\n\n')  # corpusは単語IDの配列
power = 0.75
print(f'power: {power}', end='\n\n')
sample_size = 3
print(f'sample_size: {sample_size}', end='\n\n')

print('=========================')
print(f'contexts: \n {contexts}', end='\n\n')  # contextsは単語IDの二次元配列
print(f'target: \n {target}', end='\n\n')  # targetは単語IDの配列
batch_size = target.shape[0]
h = np.random.rand(batch_size, 3)   # 中間層のニューロン
print(f'h: \n {h}')

=== 初期化で使う引数 ===
W_out: 
 [[0.42033671 0.80675672 0.62130127]
 [0.80137914 0.64169277 0.5595805 ]
 [0.21548923 0.56463616 0.48042361]
 [0.29726869 0.96336842 0.20021747]
 [0.30847581 0.08188968 0.28270171]
 [0.21747332 0.45931022 0.29034645]
 [0.74353172 0.64695558 0.88687812]
 [0.79125092 0.65129791 0.23946311]
 [0.80483524 0.46378042 0.27779781]
 [0.19622793 0.01250582 0.83777769]]

corpus: [0 1 2 3 4 5 1 0 3 4 5 0 6]

power: 0.75

sample_size: 3

contexts: 
 [[0 2]
 [1 3]
 [2 4]
 [3 5]
 [4 1]
 [5 0]
 [1 3]
 [0 4]
 [3 5]
 [4 0]
 [5 6]]

target: 
 [1 2 3 4 5 1 0 3 4 5 0]

h: 
 [[0.20492465 0.75936608 0.6827763 ]
 [0.34191755 0.45416866 0.37110143]
 [0.52786969 0.4027141  0.71930598]
 [0.1190902  0.00528928 0.49815719]
 [0.33127124 0.32771372 0.32955939]
 [0.70687755 0.22116059 0.45107849]
 [0.73416129 0.46468047 0.36414281]
 [0.55719554 0.14669425 0.06622113]
 [0.98160422 0.67501499 0.89157764]
 [0.92863165 0.42049922 0.31162513]
 [0.4942497  0.45484971 0.91184724]]


In [194]:
sampler = UnigramSampler(corpus, power, sample_size)  # UnigramSampler初期化
loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要
embed_dot_layers = [EmbeddingDot(W_out) for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要

In [195]:
# embed_dot_layersのパラメータと勾配をリストにまとめる
params, grads = [], []
for layer in embed_dot_layers:
    params.append(layer.params)
    grads.append(layer.grads)

In [196]:
print('重みと勾配はEmbeddingDotレイヤの数 = 負例 + 1(正例)', end='\n\n')
print(f'length of params: {len(params)}')
print(f'length of grads : {len(grads)}')

重みと勾配はEmbeddingDotレイヤの数 = 負例 + 1(正例)

length of params: 4
length of grads : 4


### forward

In [197]:
batch_size = target.shape[0]
print(f'target: \n {target}')
print(f'batch_size: \n {batch_size}')

target: 
 [1 2 3 4 5 1 0 3 4 5 0]
batch_size: 
 11


In [198]:
# 負例(targetではない単語)をコーパス中の確率に応じてサンプリングする
negative_sample = sampler.get_negative_sample(target)
print(f'negative_sample: \n {negative_sample}')

negative_sample: 
 [[0 4 3]
 [1 4 0]
 [0 4 2]
 [0 3 5]
 [6 3 0]
 [2 3 0]
 [5 4 1]
 [1 5 2]
 [1 0 3]
 [4 3 0]
 [4 3 6]]


In [199]:
# 正例のフォワード
# embed_dot_layersとloss_layersの0番目を正例のレイヤとしている
score = embed_dot_layers[0].forward(h, target)  # EmbeddingDotレイヤのforward
correct_label = np.ones(batch_size, dtype=np.int32)  # 正例は正解ラベルとして1を渡す
loss = loss_layers[0].forward(score, correct_label)  # SigmoidWithLossレイヤ

In [200]:
# 負例のフォワード
# embed_dot_layersとloss_layersの1番目以降を負例のレイヤとしている
negative_label = np.zeros(batch_size, dtype=np.int32)  # 負例は正解ラベルとして0を渡す
for i in range(sample_size):
    negative_target = negative_sample[:, i]  # サンプルサイズの回数ループ．バッチ処理．
    score = embed_dot_layers[i+1].forward(h, negative_target)  # EmbeddingDotレイヤのforward
    loss += loss_layers[i+1].forward(score, negative_label)  # SigmoidWithLossレイヤのforward

In [201]:
print(f'loss: \n {loss}')

loss: 
 3.7561602905079527


### backward

In [202]:
dout = 1

In [203]:
# dhを初期化
dh = 0

# SigmoidWithLossレイヤ->EmbeddingDotレイヤの順にbackward
for l0, l1 in zip(loss_layers, embed_dot_layers):
    dscore = l0.backward(dout)
    dh += l1.backward(dscore)  # hの順伝播はリピートノードなので，逆伝播では足し合わせる
    
print(f'dh: \n {dh}')
print(f'h : \n {h}')

dh: 
 [[0.04532614 0.1071281  0.05719736]
 [0.0841118  0.07447839 0.07108188]
 [0.04866971 0.06285227 0.07957215]
 [0.03496314 0.10941099 0.04568705]
 [0.07903256 0.12396893 0.09147545]
 [0.03491494 0.12043171 0.06286656]
 [0.07233327 0.05278868 0.05334319]
 [0.05714568 0.05223116 0.06388118]
 [0.10358789 0.17297228 0.09408668]
 [0.0549831  0.10041509 0.0581471 ]
 [0.08127711 0.09442739 0.07999488]]
h : 
 [[0.20492465 0.75936608 0.6827763 ]
 [0.34191755 0.45416866 0.37110143]
 [0.52786969 0.4027141  0.71930598]
 [0.1190902  0.00528928 0.49815719]
 [0.33127124 0.32771372 0.32955939]
 [0.70687755 0.22116059 0.45107849]
 [0.73416129 0.46468047 0.36414281]
 [0.55719554 0.14669425 0.06622113]
 [0.98160422 0.67501499 0.89157764]
 [0.92863165 0.42049922 0.31162513]
 [0.4942497  0.45484971 0.91184724]]


***
# cbow.py
### CBOW
- 改良版CBOWモデル．
- EmbeddingレイヤとNegativeSamplingLossレイヤを使う．

### 初期化 init

In [204]:
hidden_size = 3  # embeddingした単語ベクトルの次元
window_size = 2  # targetの両側いくつか？

print('===引数===')
print(f'vocab_size: {vocab_size}')
print(f'hidden_size: {hidden_size}')
print(f'corpus: {corpus}')
print(f'window_size: {window_size}')

===引数===
vocab_size: 7
hidden_size: 3
corpus: [0 1 2 3 4 5 1 0 3 4 5 0 6]
window_size: 2


In [205]:
# 重みの初期化
V, H = vocab_size, hidden_size
W_in = 0.01 * np.random.randn(V, H).astype('f')
W_out = 0.01 * np.random.randn(V, H).astype('f')

In [206]:
# レイヤの生成
in_layers = []
for i in range(window_size*2):
    layer = Embedding(W_in)
    in_layers.append(layer)
ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=2)

In [207]:
# 重みと勾配をリストで管理する
layers = in_layers + [ns_loss]
params, grads = [], []
for layer in layers:
    params.append(layer.params)
    grads.append(layer.grads)

In [208]:
# インスタンス変数に単語の分散表現を設定
word_vecs = W_in

### forward

In [209]:
contexts, targets = create_contexts_target(corpus, window_size)
print('contexts')
print(contexts)
print('targets')
print(targets)

contexts
[[0 1 3 4]
 [1 2 4 5]
 [2 3 5 1]
 [3 4 1 0]
 [4 5 0 3]
 [5 1 3 4]
 [1 0 4 5]
 [0 3 5 0]
 [3 4 0 6]]
targets
[2 3 4 5 1 0 3 4 5]


In [210]:
h = 0
for i, layer in enumerate(in_layers):
    h += layer.forward(contexts[:, i])  # Embeddingレイヤの順伝播
h *= 1 / len(in_layers)  # コンテキスト全体の平均を中間層のニューロンとする
loss = ns_loss.forward(h, targets)
print(loss)

2.079430050320095


### backward

In [211]:
print('========= before backward =========')
print('grad of ns_loss')
print(grads[-1])

print('grad of in_layer_0')
print(grads[0])

dout = 1
dout = ns_loss.backward(dout)
dout *= 1 / len(in_layers)  # コンテキスト一つ分の損失にする
for layer in in_layers:
    layer.backward(dout)
    
print('========= after backward =========')
print('grad of ns_loss')
print(grads[-1])

print('grad of in_layer_0')
print(grads[0])

grad of ns_loss
[array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32), array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32), array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)]
grad of in_layer_0
[array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)]
grad of ns_loss
[array([[ 5.0659892e-05, -1.6392178e-05,  3.8957217e-04],
       [-2.2350028e-04,  3.9279266e-04,  3.1018417e-04],
       [-1.0725167e-04, -1.1440896e-05,  1.2945100e-04],
       [ 4.1869315e-04, -1.7548105e-04,  7.1119406e-04],
       [ 2.1207710e-04,  3.1660034e-04, -1.5085253e-04],
     