# 4章コードの補足

In [1]:
import sys
sys.path.append('..')
import numpy as np
import collections
from common.layers import Embedding, SigmoidWithLoss
from common.util import create_contexts_target
from negative_sampling_layer import EmbeddingDot, UnigramSampler, NegativeSamplingLoss

# negative_sampling_layer

***
## EmbeddingDot
- CBOWモデルの多値分類を二値分類で近似するときに，中間層->出力層の処理を担うレイヤ．
- 正解単語のEmbedding，その単語ベクトルと中間層の値の内積を実行する．

### 初期化 init
- 引数として重みWを受け取る

In [2]:
W_out = np.random.rand(10, 3)  # 出力層側の重み. 語彙数10, word_vec_size=3の想定
embed = Embedding(W_out)  # Embeddingレイヤを生成
grads = embed.grads  # Embeddingレイヤの勾配を保持
cache = None  # backwardで使う値をfoward時に保持する変数

### forward
- 引数の h は中間層のニューロン，idx は正解単語IDの配列

In [3]:
h = np.random.rand(5, 3)  # 中間層のニューロン. batch_size=5, word_vec_size=3 の想定．
idx = np.array([0, 1, 2, 0, 5])  # 正解の単語ID
print(f'中間層 h: \n {h}')
print(f'正解単語ID idx: \n {idx}')

中間層 h: 
 [[0.69131206 0.12199617 0.83288524]
 [0.30376944 0.90889519 0.86545502]
 [0.42530876 0.15244514 0.45250241]
 [0.01374559 0.8685118  0.1072651 ]
 [0.76555321 0.96973996 0.70623601]]
正解単語ID idx: 
 [0 1 2 0 5]


In [4]:
target_W_out = embed.forward(idx)  # 正解単語の重みのみを抜き出す
print(f'W_out: \n {W_out}')
print(f'target_W_out: \n {target_W_out}')

W_out: 
 [[0.57095902 0.88578706 0.16310808]
 [0.28086643 0.98137859 0.06785615]
 [0.97002936 0.6671     0.31561775]
 [0.60742155 0.42754972 0.10767568]
 [0.13168318 0.67683056 0.09379592]
 [0.83004069 0.80925757 0.4974298 ]
 [0.08318498 0.57717464 0.72581702]
 [0.52045138 0.3135339  0.77926451]
 [0.98103537 0.25710681 0.93057164]
 [0.31986886 0.64133897 0.36636446]]
target_W_out: 
 [[0.57095902 0.88578706 0.16310808]
 [0.28086643 0.98137859 0.06785615]
 [0.97002936 0.6671     0.31561775]
 [0.57095902 0.88578706 0.16310808]
 [0.83004069 0.80925757 0.4974298 ]]


In [5]:
out = np.sum(target_W_out * h, axis=1)  # 正解単語の重みと中間層の内積計算
print(f'out: \n {out}')

out: 
 [0.6386238  1.03601536 0.65707593 0.79466049 1.77151255]


In [6]:
cache = (h, target_W_out)  # backward用

### backward
- 勾配 dout を受け取る

In [7]:
dout = np.random.rand(*out.shape)
print(f'dout: \n {dout}')

dout: 
 [0.85448046 0.82235243 0.18498158 0.72358886 0.38614443]


In [8]:
h, target_W_out = cache

In [9]:
dout = dout.reshape(dout.shape[0], 1)  # 二次元に変換
print(f'reshaped dout: \n {dout}')

reshaped dout: 
 [[0.85448046]
 [0.82235243]
 [0.18498158]
 [0.72358886]
 [0.38614443]]


In [10]:
dtarget_W_out = dout * h  # 内積の逆伝播
print(f'dtarget_W_out: \n {dtarget_W_out}')

dtarget_W_out: 
 [[0.59071265 0.10424335 0.71168416]
 [0.24980554 0.74743217 0.71170904]
 [0.07867429 0.02819954 0.08370461]
 [0.00994615 0.62844546 0.07761583]
 [0.2956141  0.37445968 0.2727091 ]]


In [11]:
print(f'grads: \n {grads}', end='\n\n')
embed.backward(dtarget_W_out)  # Embeddingレイヤの逆伝播．勾配を更新．
print(f'updated grads: \n {grads}')

grads: 
 [array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])]

updated grads: 
 [array([[0.6006588 , 0.73268881, 0.78929999],
       [0.24980554, 0.74743217, 0.71170904],
       [0.07867429, 0.02819954, 0.08370461],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.2956141 , 0.37445968, 0.2727091 ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ]])]


***
## UnigramSampler
- CBOWモデルの高速化の後半部分．
- 負例をランダムに抽出して学習させる際の選び方．
- コーパス中の単語の出現確率に従ってサンプリングする．

### 初期化 init

In [12]:
corpus = np.array([1, 1, 3, 2, 1, 1, 0, 3, 4, 5, 0, 0])  # コーパスは単語IDのリスト
power = 0.75  # 確率に1未満で累乗し，低頻度の単語に下駄をはかせる
sample_size = 3  # サンプリングする数

# Counterでコーパス中の単語の出現回数をカウントできる
counts = collections.Counter()
for word_id in corpus:
    counts[word_id] += 1
print(counts)
print(counts[0])

Counter({1: 4, 0: 3, 3: 2, 2: 1, 4: 1, 5: 1})
3


In [13]:
vocab_size = len(counts)  # 語彙数 = countsの長さ

p = np.zeros(vocab_size)  # 語彙数と同じ要素数の配列で確率を保持する

# 各単語IDの出現回数を格納
for i in range(vocab_size):
    p[i] = counts[i]

# 出現回数を0.75乗して稀な単語の確率に少し下駄をはかせる
word_p = np.power(p, power)
print(f'original p: {p}')
print(f'powerd word_p: {word_p}', end='\n\n')

# np.sum(p) = 単語数 で割って確率にする
p /= np.sum(p)
word_p /= np.sum(word_p)
print(f'p_out: {p}')
print(f'word_p_out: {word_p}')

original p: [3. 4. 1. 2. 1. 1.]
powerd word_p: [2.27950706 2.82842712 1.         1.68179283 1.         1.        ]

p_out: [0.25       0.33333333 0.08333333 0.16666667 0.08333333 0.08333333]
word_p_out: [0.23284685 0.28891787 0.10214789 0.1717916  0.10214789 0.10214789]


### get_negative_sample

In [14]:
# コンテキストとターゲットを作る
window_size = 1
contexts, target = create_contexts_target(corpus, window_size)

In [15]:
batch_size = target.shape[0]

In [16]:
negative_sample = np.zeros((batch_size, sample_size), dtype=np.int32)
print(negative_sample)

[[0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]


In [17]:
for i in range(batch_size):
    p = word_p.copy()  # 確率を取得
    target_idx = target[i]  # ターゲットを保持
    p[target_idx] = 0  # ターゲットの確率は0
    p /= p.sum()  # ターゲットを除いて確率を再計算
    negative_sample[i, :] = np.random.choice(vocab_size, size=sample_size, replace=True, p=word_p)

In [18]:
print(negative_sample)

[[0 3 3]
 [0 2 1]
 [4 1 1]
 [1 1 0]
 [1 1 5]
 [3 1 0]
 [1 3 3]
 [0 0 1]
 [1 4 3]
 [4 1 1]]


***
## NegativeSamplingLoss
- EmbeddingDotレイヤ，UnigramSamplerレイヤ，SigmoidWithLossレイヤの組み合わせ
- forwardでは中間層のニューロンとターゲットから損失関数を計算する

### 初期化 init
- EmbeddingDotレイヤに使う重みW，UnigramSamplerで確率計算に使うcorpus，指数powerとsample_sizeを引数として受け取る

In [67]:
print(f'W_out: \n {W_out}', end='\n\n')  # 出力側の単語ベクトルとなる重み．語彙数 x word_vec_size
print(f'corpus: \n {corpus}', end='\n\n')  # corpusは単語IDの配列
print(f'contexts: \n {contexts}', end='\n\n')  # contextsは単語IDの二次元配列
print(f'target: \n {target}', end='\n\n')  # targetは単語IDの配列
h = np.random.rand(10, 3)   # 中間層のニューロン
print(f'h: \n {h}')

W_out: 
 [[0.57095902 0.88578706 0.16310808]
 [0.28086643 0.98137859 0.06785615]
 [0.97002936 0.6671     0.31561775]
 [0.60742155 0.42754972 0.10767568]
 [0.13168318 0.67683056 0.09379592]
 [0.83004069 0.80925757 0.4974298 ]
 [0.08318498 0.57717464 0.72581702]
 [0.52045138 0.3135339  0.77926451]
 [0.98103537 0.25710681 0.93057164]
 [0.31986886 0.64133897 0.36636446]]

corpus: 
 [1 1 3 2 1 1 0 3 4 5 0 0]

contexts: 
 [[1 3]
 [1 2]
 [3 1]
 [2 1]
 [1 0]
 [1 3]
 [0 4]
 [3 5]
 [4 0]
 [5 0]]

target: 
 [1 3 2 1 1 0 3 4 5 0]

h: 
 [[0.57639247 0.13477406 0.72099445]
 [0.88410564 0.75463007 0.35209841]
 [0.05596927 0.21241603 0.46759997]
 [0.28323451 0.63438585 0.92653796]
 [0.42567917 0.55794762 0.0647181 ]
 [0.57546497 0.06877354 0.64367143]
 [0.60777673 0.20228706 0.72870647]
 [0.71337789 0.62738251 0.84823134]
 [0.49099186 0.60245709 0.39911593]
 [0.26432596 0.49271598 0.0830015 ]]


In [59]:
power = 0.75  # UnigramSamplerの確率計算に使用．稀な単語の確率に下駄をはかせる．
sample_size = 3  # 負例として3単語をサンプリングする設定
sampler = UnigramSampler(corpus, power, sample_size)  # UnigramSampler初期化
loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要
embed_dot_layers = [EmbeddingDot(W_out) for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要

In [60]:
# embed_dot_layersのパラメータと勾配をリストにまとめる
params, grads = [], []
for layer in embed_dot_layers:
    params.append(layer.params)
    grads.append(layer.grads)

In [61]:
# 重みと勾配はEmbeddingDotレイヤの数だけある
print(f'length of params: {len(params)}')
print(f'length of grads : {len(grads)}')

length of params: 4
length of grads : 4


### forward

In [62]:
batch_size = target.shape[0]
print(batch_size)

10


In [63]:
# 負例(targetではない単語)をコーパス中の確率に応じてサンプリングする
negative_sample = sampler.get_negative_sample(target)
print(negative_sample)

[[3 0 2]
 [0 5 1]
 [5 1 0]
 [0 3 5]
 [0 2 3]
 [1 2 3]
 [4 0 1]
 [3 5 2]
 [1 4 3]
 [4 2 5]]


In [68]:
# 正例のフォワード
score = embed_dot_layers[0].forward(h, target)  # EmbeddingDotレイヤのforward
correct_label = np.ones(batch_size, dtype=np.int32)  # SigmoidWithLossに入れる正解ラベル
loss = loss_layers[0].forward(score, correct_label)

0.4310707531655287