# 4章コードの補足

In [22]:
import sys
sys.path.append('..')
import numpy as np
import collections
from common.layers import Embedding, SigmoidWithLoss
from common.util import create_contexts_target
from negative_sampling_layer import EmbeddingDot, UnigramSampler, NegativeSamplingLoss

# negative_sampling_layer

***
## EmbeddingDot
- CBOWモデルの多値分類を二値分類で近似するときに，中間層->出力層の処理を担うレイヤ．
- 正解単語のEmbedding，その単語ベクトルと中間層の値の内積を実行する．

### 初期化 init
- 引数として重みWを受け取る

In [2]:
W_out = np.random.rand(10, 3)  # 出力層側の重み. 語彙数10, word_vec_size=3の想定
embed = Embedding(W_out)  # Embeddingレイヤを生成
grads = embed.grads  # Embeddingレイヤの勾配を保持
cache = None  # backwardで使う値をfoward時に保持する変数

### forward
- 引数の h は中間層のニューロン，idx は正解単語IDの配列

In [3]:
h = np.random.rand(5, 3)  # 中間層のニューロン. batch_size=5, word_vec_size=3 の想定．
idx = np.array([0, 1, 2, 0, 5])  # 正解の単語ID
print(f'中間層 h: \n {h}')
print(f'正解単語ID idx: \n {idx}')

中間層 h: 
 [[0.58604158 0.9520668  0.2263102 ]
 [0.61805397 0.38801019 0.73331188]
 [0.68606143 0.72504169 0.40785919]
 [0.34843246 0.83562609 0.16880705]
 [0.58770282 0.23997507 0.64947012]]
正解単語ID idx: 
 [0 1 2 0 5]


In [4]:
target_W_out = embed.forward(idx)  # 正解単語の重みのみを抜き出す
print(f'W_out: \n {W_out}')
print(f'target_W_out: \n {target_W_out}')

W_out: 
 [[0.43108747 0.21280531 0.29782549]
 [0.80190639 0.67634727 0.38809073]
 [0.92393452 0.72890279 0.52477641]
 [0.8280952  0.61218108 0.89187467]
 [0.72089756 0.5200182  0.29476372]
 [0.51307117 0.84545686 0.83864454]
 [0.84172767 0.01887939 0.86639175]
 [0.57131253 0.53254022 0.65238355]
 [0.22800543 0.30880185 0.74129552]
 [0.46369241 0.50884812 0.18668578]]
target_W_out: 
 [[0.43108747 0.21280531 0.29782549]
 [0.80190639 0.67634727 0.38809073]
 [0.92393452 0.72890279 0.52477641]
 [0.43108747 0.21280531 0.29782549]
 [0.51307117 0.84545686 0.83864454]]


In [5]:
out = np.sum(target_W_out * h, axis=1)  # 正解単語の重みと中間層の内積計算
print(f'out: \n {out}')

out: 
 [0.522641   1.0426426  1.37639564 0.37830558 1.0490965 ]


In [6]:
cache = (h, target_W_out)  # backward用

### backward
- 勾配 dout を受け取る

In [7]:
dout = np.random.rand(*out.shape)
print(f'dout: \n {dout}')

dout: 
 [0.31748045 0.29540609 0.99564754 0.7983897  0.30694812]


In [8]:
h, target_W_out = cache

In [9]:
dout = dout.reshape(dout.shape[0], 1)  # 二次元に変換
print(f'reshaped dout: \n {dout}')

reshaped dout: 
 [[0.31748045]
 [0.29540609]
 [0.99564754]
 [0.7983897 ]
 [0.30694812]]


In [10]:
dtarget_W_out = dout * h  # 内積の逆伝播
print(f'dtarget_W_out: \n {dtarget_W_out}')

dtarget_W_out: 
 [[0.18605674 0.30226259 0.07184906]
 [0.18257691 0.11462057 0.2166248 ]
 [0.68307538 0.72188598 0.406084  ]
 [0.27818489 0.66715526 0.13477381]
 [0.18039428 0.0736599  0.19935363]]


In [11]:
print(f'grads: \n {grads}', end='\n\n')
embed.backward(dtarget_W_out)  # Embeddingレイヤの逆伝播．勾配を更新．
print(f'updated grads: \n {grads}')

grads: 
 [array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])]

updated grads: 
 [array([[0.46424163, 0.96941786, 0.20662288],
       [0.18257691, 0.11462057, 0.2166248 ],
       [0.68307538, 0.72188598, 0.406084  ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.18039428, 0.0736599 , 0.19935363],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ]])]


***
## UnigramSampler
- CBOWモデルの高速化の後半部分．
- 負例をランダムに抽出して学習させる際の選び方．
- コーパス中の単語の出現確率に従ってサンプリングする．

### 初期化 init

In [12]:
corpus = np.array([1, 1, 3, 2, 1, 1, 0, 3, 4, 5, 0, 0])  # コーパスは単語IDのリスト
power = 0.75  # 確率に1未満で累乗し，低頻度の単語に下駄をはかせる
sample_size = 3  # サンプリングする数

# Counterでコーパス中の単語の出現回数をカウントできる
counts = collections.Counter()
for word_id in corpus:
    counts[word_id] += 1
print(counts)
print(counts[0])

Counter({1: 4, 0: 3, 3: 2, 2: 1, 4: 1, 5: 1})
3


In [13]:
vocab_size = len(counts)  # 語彙数 = countsの長さ

p = np.zeros(vocab_size)  # 語彙数と同じ要素数の配列で確率を保持する

# 各単語IDの出現回数を格納
for i in range(vocab_size):
    p[i] = counts[i]

# 出現回数を0.75乗して稀な単語の確率に少し下駄をはかせる
word_p = np.power(p, power)
print(f'original p: {p}')
print(f'powerd word_p: {word_p}', end='\n\n')

# np.sum(p) = 単語数 で割って確率にする
p /= np.sum(p)
word_p /= np.sum(word_p)
print(f'p_out: {p}')
print(f'word_p_out: {word_p}')

original p: [3. 4. 1. 2. 1. 1.]
powerd word_p: [2.27950706 2.82842712 1.         1.68179283 1.         1.        ]

p_out: [0.25       0.33333333 0.08333333 0.16666667 0.08333333 0.08333333]
word_p_out: [0.23284685 0.28891787 0.10214789 0.1717916  0.10214789 0.10214789]


### get_negative_sample

In [14]:
# コンテキストとターゲットを作る
window_size = 1
contexts, target = create_contexts_target(corpus, window_size)

In [15]:
batch_size = target.shape[0]

In [16]:
negative_sample = np.zeros((batch_size, sample_size), dtype=np.int32)
print(negative_sample)

[[0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]


In [17]:
for i in range(batch_size):
    p = word_p.copy()  # 確率を取得
    target_idx = target[i]  # ターゲットを保持
    p[target_idx] = 0  # ターゲットの確率は0
    p /= p.sum()  # ターゲットを除いて確率を再計算
    negative_sample[i, :] = np.random.choice(vocab_size, size=sample_size, replace=True, p=word_p)

In [18]:
print(negative_sample)

[[0 0 1]
 [1 0 4]
 [3 1 3]
 [3 3 2]
 [3 0 3]
 [4 4 2]
 [1 4 1]
 [0 1 2]
 [5 1 1]
 [4 1 5]]


***
## NegativeSamplingLoss
- EmbeddingDotレイヤ，UnigramSamplerレイヤ，SigmoidWithLossレイヤの組み合わせ
- forwardでは中間層のニューロンとターゲットから損失関数を計算する

### 初期化 init
- EmbeddingDotレイヤに使う重みW，UnigramSamplerで確率計算に使うcorpus，指数powerとsample_sizeを引数として受け取る

In [24]:
print(f'W_out: \n {W_out}', end='\n\n')  # 出力側の単語ベクトルとなる重み．語彙数 x word_vec_size
print(f'corpus: \n {corpus}', end='\n\n')  # corpusは単語IDの配列

W_out: 
 [[0.43108747 0.21280531 0.29782549]
 [0.80190639 0.67634727 0.38809073]
 [0.92393452 0.72890279 0.52477641]
 [0.8280952  0.61218108 0.89187467]
 [0.72089756 0.5200182  0.29476372]
 [0.51307117 0.84545686 0.83864454]
 [0.84172767 0.01887939 0.86639175]
 [0.57131253 0.53254022 0.65238355]
 [0.22800543 0.30880185 0.74129552]
 [0.46369241 0.50884812 0.18668578]]

corpus: 
 [1 1 3 2 1 1 0 3 4 5 0 0]



In [27]:
power = 0.75
sample_size = 5  # 負例として5単語をサンプリング
sampler = UnigramSampler(corpus, power, sample_size)  # UnigramSampler初期化
loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけLossレイヤがいる
embed_dot_layers = [EmbeddingDot(W_out) for _ in range(sample_size + 1)]