# 4章コードの補足

In [82]:
import sys
sys.path.append('..')
import numpy as np
import collections
import pickle
from common.trainer import Trainer
from common.optimizer import Adam
from common.layers import Embedding, SigmoidWithLoss
from common.util import create_contexts_target, most_similar, analogy
from cbow import CBOW
from negative_sampling_layer import EmbeddingDot, UnigramSampler, NegativeSamplingLoss
from dataset import ptb

# negative_sampling_layer.py

## EmbeddingDot
- CBOWモデルの多値分類を二値分類で近似するときに，中間層->出力層の処理を担うレイヤ．
- 正解単語のEmbedding，その単語ベクトルと中間層の値の内積を実行する．

### 初期化 init
- 引数として重みWを受け取る

In [2]:
W_out = np.random.rand(10, 3)  # 出力層側の重み. 語彙数10, word_vec_size=3の想定
embed = Embedding(W_out)  # Embeddingレイヤを生成
grads = embed.grads  # Embeddingレイヤの勾配を保持
cache = None  # backwardで使う値をfoward時に保持する変数

### forward
- 引数の h は中間層のニューロン，idx は正解単語IDの配列

In [3]:
h = np.random.rand(5, 3)  # 中間層のニューロン. batch_size=5, word_vec_size=3 の想定．
idx = np.array([0, 1, 2, 0, 5])  # 正解の単語ID
print(f'中間層 h: \n {h}')
print(f'正解単語ID idx: \n {idx}')

中間層 h: 
 [[0.82217646 0.54067442 0.05173243]
 [0.16698862 0.89809902 0.58427625]
 [0.568241   0.73566202 0.40047943]
 [0.23597886 0.20087999 0.04115954]
 [0.82927398 0.94082707 0.86763458]]
正解単語ID idx: 
 [0 1 2 0 5]


In [4]:
target_W_out = embed.forward(idx)  # 正解単語の重みのみを抜き出す
print(f'W_out: \n {W_out}')
print(f'target_W_out: \n {target_W_out}')

W_out: 
 [[0.00148998 0.6283421  0.99988542]
 [0.97660643 0.39624079 0.41144376]
 [0.9095247  0.63001992 0.55401669]
 [0.57841941 0.81483218 0.95675048]
 [0.42541412 0.23011426 0.43545684]
 [0.93596477 0.93308896 0.47673628]
 [0.32848788 0.70686374 0.83672024]
 [0.82526638 0.08024615 0.76952678]
 [0.26080114 0.77877569 0.39817015]
 [0.04410811 0.45218073 0.27521224]]
target_W_out: 
 [[0.00148998 0.6283421  0.99988542]
 [0.97660643 0.39624079 0.41144376]
 [0.9095247  0.63001992 0.55401669]
 [0.00148998 0.6283421  0.99988542]
 [0.93596477 0.93308896 0.47673628]]


In [5]:
out = np.sum(target_W_out * h, axis=1)  # 正解単語の重みと中間層の内積計算
print(f'out: \n {out}')

out: 
 [0.39268003 0.75934244 1.20218324 0.16772778 2.06767947]


In [6]:
cache = (h, target_W_out)  # backward用

### backward
- 勾配 dout を受け取る

In [7]:
dout = np.random.rand(*out.shape)
print(f'dout: \n {dout}')

dout: 
 [0.60246215 0.60145433 0.97832741 0.38467313 0.86984746]


In [8]:
h, target_W_out = cache

In [9]:
dout = dout.reshape(dout.shape[0], 1)  # 二次元に変換
print(f'reshaped dout: \n {dout}')

reshaped dout: 
 [[0.60246215]
 [0.60145433]
 [0.97832741]
 [0.38467313]
 [0.86984746]]


In [10]:
dtarget_W_out = dout * h  # 内積の逆伝播
print(f'dtarget_W_out: \n {dtarget_W_out}')

dtarget_W_out: 
 [[0.4953302  0.32573588 0.03116683]
 [0.10043603 0.54016555 0.35141548]
 [0.55592574 0.71971832 0.3918    ]
 [0.09077473 0.07727313 0.01583297]
 [0.72134187 0.81837604 0.75470974]]


In [11]:
print(f'grads: \n {grads}', end='\n\n')
embed.backward(dtarget_W_out)  # Embeddingレイヤの逆伝播．勾配を更新．
print(f'updated grads: \n {grads}')

grads: 
 [array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])]

updated grads: 
 [array([[0.58610492, 0.40300901, 0.0469998 ],
       [0.10043603, 0.54016555, 0.35141548],
       [0.55592574, 0.71971832, 0.3918    ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.72134187, 0.81837604, 0.75470974],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ]])]


***
## UnigramSampler
- CBOWモデルの高速化の後半部分．
- 負例をランダムに抽出して学習させる際の選び方．
- コーパス中の単語の出現確率に従ってサンプリングする．

### 初期化 init

In [12]:
corpus = np.array([0, 1, 2, 3, 4, 5, 1, 0, 3, 4, 5, 0, 6])  # コーパスは単語IDのリスト
power = 0.75  # 確率に1未満で累乗し，低頻度の単語に下駄をはかせる
sample_size = 3  # サンプリングする数

# Counterでコーパス中の単語の出現回数をカウントできる
counts = collections.Counter()
for word_id in corpus:
    counts[word_id] += 1
print(counts)
print(counts[0])

Counter({0: 3, 1: 2, 3: 2, 4: 2, 5: 2, 2: 1, 6: 1})
3


In [13]:
vocab_size = len(counts)  # 語彙数 = countsの長さ

p = np.zeros(vocab_size)  # 語彙数と同じ要素数の配列で確率を保持する

# 各単語IDの出現回数を格納
for i in range(vocab_size):
    p[i] = counts[i]

# 出現回数を0.75乗して稀な単語の確率に少し下駄をはかせる
word_p = np.power(p, power)
print(f'original p: {p}')
print(f'powerd word_p: {word_p}', end='\n\n')

# np.sum(p) = 単語数 で割って確率にする
p /= np.sum(p)
word_p /= np.sum(word_p)
print(f'p_out: {p}')
print(f'word_p_out: {word_p}')

original p: [3. 2. 1. 2. 2. 2. 1.]
powerd word_p: [2.27950706 1.68179283 1.         1.68179283 1.68179283 1.68179283
 1.        ]

p_out: [0.23076923 0.15384615 0.07692308 0.15384615 0.15384615 0.15384615
 0.07692308]
word_p_out: [0.20710218 0.15279749 0.09085393 0.15279749 0.15279749 0.15279749
 0.09085393]


### get_negative_sample

In [14]:
# コンテキストとターゲットを作る
window_size = 1
contexts, target = create_contexts_target(corpus, window_size)

In [15]:
batch_size = target.shape[0]

In [16]:
negative_sample = np.zeros((batch_size, sample_size), dtype=np.int32)
print(negative_sample)

[[0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]


In [17]:
for i in range(batch_size):
    p = word_p.copy()  # 確率を取得
    target_idx = target[i]  # ターゲットを保持
    p[target_idx] = 0  # ターゲットの確率は0
    p /= p.sum()  # ターゲットを除いて確率を再計算
    negative_sample[i, :] = np.random.choice(vocab_size, size=sample_size, replace=True, p=word_p)

In [18]:
print(negative_sample)

[[1 1 4]
 [0 3 4]
 [0 3 5]
 [0 3 5]
 [1 0 4]
 [3 0 0]
 [4 0 1]
 [4 0 5]
 [0 1 4]
 [0 5 4]
 [0 1 6]]


***
## NegativeSamplingLoss
- EmbeddingDotレイヤ，UnigramSamplerレイヤ，SigmoidWithLossレイヤの組み合わせ
- forwardでは中間層のニューロンとターゲットから損失関数を計算する

### 初期化 init
- EmbeddingDotレイヤに使う重みW，UnigramSamplerで確率計算に使うcorpus，指数powerとsample_sizeを引数として受け取る

In [19]:
print('=== 初期化で使う引数 ===')
print(f'W_out: \n {W_out}', end='\n\n')  # 出力側の単語ベクトルとなる重み．語彙数 x word_vec_size
print(f'corpus: {corpus}', end='\n\n')  # corpusは単語IDの配列
power = 0.75
print(f'power: {power}', end='\n\n')
sample_size = 3
print(f'sample_size: {sample_size}', end='\n\n')

print('=========================')
print(f'contexts: \n {contexts}', end='\n\n')  # contextsは単語IDの二次元配列
print(f'target: \n {target}', end='\n\n')  # targetは単語IDの配列
batch_size = target.shape[0]
h = np.random.rand(batch_size, 3)   # 中間層のニューロン
print(f'h: \n {h}')

=== 初期化で使う引数 ===
W_out: 
 [[0.00148998 0.6283421  0.99988542]
 [0.97660643 0.39624079 0.41144376]
 [0.9095247  0.63001992 0.55401669]
 [0.57841941 0.81483218 0.95675048]
 [0.42541412 0.23011426 0.43545684]
 [0.93596477 0.93308896 0.47673628]
 [0.32848788 0.70686374 0.83672024]
 [0.82526638 0.08024615 0.76952678]
 [0.26080114 0.77877569 0.39817015]
 [0.04410811 0.45218073 0.27521224]]

corpus: [0 1 2 3 4 5 1 0 3 4 5 0 6]

power: 0.75

sample_size: 3

contexts: 
 [[0 2]
 [1 3]
 [2 4]
 [3 5]
 [4 1]
 [5 0]
 [1 3]
 [0 4]
 [3 5]
 [4 0]
 [5 6]]

target: 
 [1 2 3 4 5 1 0 3 4 5 0]

h: 
 [[0.84685668 0.72092516 0.9344035 ]
 [0.24708944 0.84804312 0.03429725]
 [0.14894109 0.43675012 0.90935181]
 [0.49722102 0.36995068 0.73048223]
 [0.85934985 0.49575536 0.14605255]
 [0.14829678 0.58799207 0.70057522]
 [0.74568939 0.80730954 0.83528947]
 [0.95127202 0.48539179 0.5365676 ]
 [0.92476386 0.26496706 0.27334077]
 [0.75132238 0.24873723 0.28398542]
 [0.70844142 0.87689463 0.60357869]]


In [20]:
sampler = UnigramSampler(corpus, power, sample_size)  # UnigramSampler初期化
loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要
embed_dot_layers = [EmbeddingDot(W_out) for _ in range(sample_size + 1)]  # 負例 + 1(正例)だけ必要

In [21]:
# embed_dot_layersのパラメータと勾配をリストにまとめる
params, grads = [], []
for layer in embed_dot_layers:
    params.append(layer.params)
    grads.append(layer.grads)

In [22]:
print('重みと勾配はEmbeddingDotレイヤの数 = 負例 + 1(正例)', end='\n\n')
print(f'length of params: {len(params)}')
print(f'length of grads : {len(grads)}')

重みと勾配はEmbeddingDotレイヤの数 = 負例 + 1(正例)

length of params: 4
length of grads : 4


### forward

In [23]:
batch_size = target.shape[0]
print(f'target: \n {target}')
print(f'batch_size: \n {batch_size}')

target: 
 [1 2 3 4 5 1 0 3 4 5 0]
batch_size: 
 11


In [24]:
# 負例(targetではない単語)をコーパス中の確率に応じてサンプリングする
negative_sample = sampler.get_negative_sample(target)
print(f'negative_sample: \n {negative_sample}')

negative_sample: 
 [[2 0 3]
 [5 3 6]
 [1 0 6]
 [2 3 0]
 [4 0 2]
 [0 5 2]
 [1 5 2]
 [0 1 5]
 [0 3 6]
 [4 2 6]
 [2 3 1]]


In [25]:
# 正例のフォワード
# embed_dot_layersとloss_layersの0番目を正例のレイヤとしている
score = embed_dot_layers[0].forward(h, target)  # EmbeddingDotレイヤのforward
correct_label = np.ones(batch_size, dtype=np.int32)  # 正例は正解ラベルとして1を渡す
loss = loss_layers[0].forward(score, correct_label)  # SigmoidWithLossレイヤ

In [26]:
# 負例のフォワード
# embed_dot_layersとloss_layersの1番目以降を負例のレイヤとしている
negative_label = np.zeros(batch_size, dtype=np.int32)  # 負例は正解ラベルとして0を渡す
for i in range(sample_size):
    negative_target = negative_sample[:, i]  # サンプルサイズの回数ループ．バッチ処理．
    score = embed_dot_layers[i+1].forward(h, negative_target)  # EmbeddingDotレイヤのforward
    loss += loss_layers[i+1].forward(score, negative_label)  # SigmoidWithLossレイヤのforward

In [27]:
print(f'loss: \n {loss}')

loss: 
 4.542997205249307


### backward

In [28]:
dout = 1

In [29]:
# dhを初期化
dh = 0

# SigmoidWithLossレイヤ->EmbeddingDotレイヤの順にbackward
for l0, l1 in zip(loss_layers, embed_dot_layers):
    dscore = l0.backward(dout)
    dh += l1.backward(dscore)  # hの順伝播はリピートノードなので，逆伝播では足し合わせる
    
print(f'dh: \n {dh}')
print(f'h : \n {h}')

dh: 
 [[0.10039006 0.15288929 0.18511488]
 [0.09386252 0.13985399 0.12838103]
 [0.07063495 0.10046531 0.13340828]
 [0.0896187  0.1349287  0.1577672 ]
 [0.07001426 0.07434188 0.11020016]
 [0.09120379 0.13332064 0.12259665]
 [0.21395567 0.13836856 0.09084342]
 [0.13135079 0.12508738 0.11285017]
 [0.04479538 0.12477201 0.15600041]
 [0.0824505  0.07557974 0.10052784]
 [0.18200571 0.12429364 0.12251918]]
h : 
 [[0.84685668 0.72092516 0.9344035 ]
 [0.24708944 0.84804312 0.03429725]
 [0.14894109 0.43675012 0.90935181]
 [0.49722102 0.36995068 0.73048223]
 [0.85934985 0.49575536 0.14605255]
 [0.14829678 0.58799207 0.70057522]
 [0.74568939 0.80730954 0.83528947]
 [0.95127202 0.48539179 0.5365676 ]
 [0.92476386 0.26496706 0.27334077]
 [0.75132238 0.24873723 0.28398542]
 [0.70844142 0.87689463 0.60357869]]


***
# cbow.py
### CBOW
- 改良版CBOWモデル．
- EmbeddingレイヤとNegativeSamplingLossレイヤを使う．

### 初期化 init

In [30]:
hidden_size = 3  # embeddingした単語ベクトルの次元
window_size = 2  # targetの両側いくつか？

print('===引数===')
print(f'vocab_size: {vocab_size}')
print(f'hidden_size: {hidden_size}')
print(f'corpus: {corpus}')
print(f'window_size: {window_size}')

===引数===
vocab_size: 7
hidden_size: 3
corpus: [0 1 2 3 4 5 1 0 3 4 5 0 6]
window_size: 2


In [31]:
# 重みの初期化
V, H = vocab_size, hidden_size
W_in = 0.01 * np.random.randn(V, H).astype('f')
W_out = 0.01 * np.random.randn(V, H).astype('f')

In [32]:
# レイヤの生成
in_layers = []
for i in range(window_size*2):
    layer = Embedding(W_in)
    in_layers.append(layer)
ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=2)

In [33]:
# 重みと勾配をリストで管理する
layers = in_layers + [ns_loss]
params, grads = [], []
for layer in layers:
    params.append(layer.params)
    grads.append(layer.grads)

In [34]:
# インスタンス変数に単語の分散表現を設定
word_vecs = W_in

### forward

In [35]:
contexts, targets = create_contexts_target(corpus, window_size)
print('contexts')
print(contexts)
print('targets')
print(targets)

contexts
[[0 1 3 4]
 [1 2 4 5]
 [2 3 5 1]
 [3 4 1 0]
 [4 5 0 3]
 [5 1 3 4]
 [1 0 4 5]
 [0 3 5 0]
 [3 4 0 6]]
targets
[2 3 4 5 1 0 3 4 5]


In [36]:
h = 0
for i, layer in enumerate(in_layers):
    h += layer.forward(contexts[:, i])  # Embeddingレイヤの順伝播
h *= 1 / len(in_layers)  # コンテキスト全体の平均を中間層のニューロンとする
loss = ns_loss.forward(h, targets)
print(loss)

2.079440434773763


### backward

In [37]:
print('========= before backward =========')
print('grad of ns_loss')
print(grads[-1])

print('grad of in_layer_0')
print(grads[0])

dout = 1
dout = ns_loss.backward(dout)
dout *= 1 / len(in_layers)  # コンテキスト一つ分の損失にする
for layer in in_layers:
    layer.backward(dout)
    
print('========= after backward =========')
print('grad of ns_loss')
print(grads[-1])

print('grad of in_layer_0')
print(grads[0])

grad of ns_loss
[array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32), array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32), array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)]
grad of in_layer_0
[array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)]
grad of ns_loss
[array([[-0.00021695,  0.00056938,  0.00012028],
       [-0.00034354,  0.00038117, -0.00036443],
       [-0.00011008,  0.00057971, -0.00013255],
       [-0.00053953,  0.00043156,  0.00031197],
       [-0.00072975,  0.00023441, -0.00102891],
       [-0.00018531,  0.00103815, -0.00053802],
  

***
# train.py
- 改良版CBOWモデルの学習コード

In [63]:
# ハイパーパラメータ設定
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

In [64]:
# データ読み込み
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)

In [65]:
# モデル生成
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

In [66]:
print(f'model: {type(model)}')
print(f'num of in_layers: {len(model.in_layers)}')  # window_size=5なのでin_layerは10個
print(f'num of params of a in_layer: {len(model.in_layers[0].params)}')  # Embeddingレイヤは重み1個
# NegativeSamplingLossレイヤのEmbeddingDotレイヤに重みが1個
# sample_sizeのdefault値5+1(正例分)
print(f'num of params of ns_loss: {len(model.ns_loss.params)}')
print(f'num of params: {len(model.params)}')  # 計16個のパラメータ

model: <class 'cbow.CBOW'>
num of in_layers: 10
num of params of a in_layer: 1
num of params of ns_loss: 6
num of params: 16


In [68]:
# trainer.fit(contexts, target, max_epoch, batch_size)
# 学習はgpu環境でないとキツイ

***
# eval.py
- CBOWモデルによる単語の分散表現の評価
- 学習済みパラメータを読み込んで使用

In [70]:
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'rb') as f:
    params = pickle.load(f)
    
word_vecs = params['word_vecs']
id_to_word = params['id_to_word']
word_to_id = params['word_to_id']

In [80]:
# 近い単語
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)


[query] you
 we: 0.6103515625
 someone: 0.59130859375
 i: 0.55419921875
 something: 0.48974609375
 anyone: 0.47314453125

[query] year
 month: 0.71875
 week: 0.65234375
 spring: 0.62744140625
 summer: 0.6259765625
 decade: 0.603515625

[query] car
 luxury: 0.497314453125
 arabia: 0.47802734375
 auto: 0.47119140625
 disk-drive: 0.450927734375
 travel: 0.4091796875

[query] toyota
 ford: 0.55078125
 instrumentation: 0.509765625
 mazda: 0.49365234375
 bethlehem: 0.47509765625
 nissan: 0.474853515625


In [84]:
# アナロジー問題
analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs, top=5)


[analogy] king:man = queen:?
 woman: 5.16015625
 veto: 4.9296875
 ounce: 4.69140625
 earthquake: 4.6328125
 successor: 4.609375


In [85]:
analogy('take', 'took', 'go', word_to_id, id_to_word, word_vecs, top=5)


[analogy] take:took = go:?
 went: 4.55078125
 points: 4.25
 began: 4.09375
 comes: 3.98046875
 oct.: 3.90625


In [88]:
analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs, top=5)


[analogy] car:cars = child:?
 children: 5.21875
 average: 4.7265625
 yield: 4.20703125
 cattle: 4.1875
 priced: 4.1796875


In [87]:
analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs, top=5)


[analogy] good:better = bad:?
 more: 6.6484375
 less: 6.0625
 rather: 5.21875
 slower: 4.734375
 greater: 4.671875


In [91]:
analogy('small', 'big', 'short', word_to_id, id_to_word, word_vecs, top=5)


[analogy] small:big = short:?
 ual: 4.87109375
 board: 4.48046875
 trading: 4.44140625
 dow: 4.125
 nasdaq: 3.8203125


In [None]:
analogy('small', 'big', 'short', word_to_id, id_to_word, word_vecs, top=5)