* Neural Network Language Model (NNLM)
* See
    * https://arxiv.org/abs/1509.00685
    * Sec. 5 in https://www.amazon.co.jp/%E6%B7%B1%E5%B1%A4%E5%AD%A6%E7%BF%92%E3%81%AB%E3%82%88%E3%82%8B%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86-%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%83%97%E3%83%AD%E3%83%95%E3%82%A7%E3%83%83%E3%82%B7%E3%83%A7%E3%83%8A%E3%83%AB%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA-%E5%9D%AA%E4%BA%95-%E7%A5%90%E5%A4%AA/dp/4061529242

In [3]:
import datetime
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L

In [5]:
class NNLM(chainer.Chain):
    
    def __init__(self, vocab_size, embed_size, hidden_size, C):
        
        super(NNLM, self).__init__(
            xe = L.EmbedID(vocab_size, embed_size, ignore_label=-1),
            hh = L.Linear(C * embed_size, hidden_size),
            hy = L.Linear(hidden_size, vocab_size)
        )
        self.C = C
        
    def __call__(self, x):
        
        x = x[-self.C:] # 予測中の文字C個
        es = self.xe(chainer.Variable(np.array([x[0]], dtype="int32"))) # １番目の文字をベクトル化
        
        for i in range(1, len(x)): # 残りのC-1個の文字を順番にベクトル化
            e = self.xe(chainer.Variable(np.array([x[i]], dtype="int32")))
            es = np.hstack((es.data, e.data)) # ベクトル化した予測中の文字を連結していく
            
        # 以降は普通の順伝播
        h = F.tanh(self.hh(chainer.Variable(es)))
        y = self.hy(h)
        return y
    
class NNLM4ENC(chainer.Chain):
    
    def __init__(self, vocab_size, embed_size, C):
        
        super(NNLM4ENC, self).__init__(
            xe = L.EmbedID(vocab_size, embed_size, ignore_label=-1),
            hh = L.Linear(C * embed_size, embed_size)
        )
        self.C = C
    
    def __call__(self, x):
        
        x = x[-self.C:] # 予測中の文字C個
        es = self.xe(chainer.Variable(np.array([x[0]], dtype="int32"))) # １番目の文字をベクトル化
        
        for i in range(1, len(x)): # 残りのC-1個の文字を順番にベクトル化
            
            e = self.xe(chainer.Variable(np.array([x[i]], dtype="int32")))
            es = np.hstack((es.data, e.data)) # ベクトル化した予測中の文字を連結していく
            
        h = F.tanh(self.hh(chainer.Variable(es)))
        return h
    
class ENC(chainer.Chain):
    
    def __init__(self, vocab_size, embed_size, C, Q):
        
        super(ENC, self).__init__(
            nnlm4enc = NNLM4ENC(vocab_size=vocab_size, embed_size=embed_size, C=C), # 予測中の文字を埋め込んで、連結したベクトルを返すところまで
            xe = L.EmbedID(vocab_size, embed_size, ignore_label=-1), # 入力文の方を読み込む
            ey = L.Linear(embed_size, vocab_size)
            
        )
        self.embed_size = embed_size
        self.C = C
        self.Q = Q
    
    def __call__(self, x):
        
        # 入力側の埋め込みベクトルのリストを計算する
        query = x[:-self.C]
        query_vs = [] # 埋め込みベクトルのリスト
        
        for q in query:
            q_v = self.xe(chainer.Variable(np.array([q], dtype="int32")))
            query_vs.append(q_v.data)
            
        query_vs = np.array(query_vs, dtype="float32")
        # 出力側の埋め込みベクトルの連結ベクトルを計算する
        response_v = self.nnlm4enc(x).data # 連結された埋め込みベクトル
        # 2つを使ってAttentionを計算する
        query_vs = query_vs.reshape(len(query), self.embed_size)
        w = F.matmul(query_vs, F.transpose(response_v)) # 内積を計算、これを確率に正規化する
        ws = [] # ウエイトを記録する配列
        sum_w = 0 # 合計ウエイト
        
        for w_ in w:
            w_ = F.exp(w_) # ソフトマックスで正規化する
            ws.append(w_)
            sum_w += w_
            
        prob = [] # 確率を記録する配列
        for w_ in ws:
            w_ /= sum_w # 確率に変換
            prob.append(w_.data)
            
        prob = np.array(prob, dtype="float32")
        # （省略）入力文ベクトルは平均化する
        h = F.matmul(F.transpose(prob), query_vs)
        y = self.ey(h)
        return y
    
class ABS(chainer.Chain):
    
    def __init__(self, vocab_size, embed_size, hidden_size, C, Q):
        
        super(ABS, self).__init__(
            nnlm = NNLM(vocab_size=vocab_size, embed_size=embed_size, hidden_size=hidden_size, C=C),
            enc = ENC(vocab_size=vocab_size, embed_size=embed_size, C=C, Q=Q)
        )
        
    def __call__(self, x, t=None, train=False):
        
        y = self.nnlm(x) + self.enc(x)
        if train:
            t = chainer.Variable(np.array(t, dtype='int32'))
            loss = F.softmax_cross_entropy(y, t) # 正解単語と予測単語を照らし合わせて損失を計算
            return loss # 損失
        else:
            return np.argmax(y.data) # 予測値（次の文字）
        
    def reset(self):
        
        self.zerograds()

In [6]:
train_data = [
    [["ABCD"], ["EFGHI"]]
]
train_data_ids = [
    [[0,1,2,3], [4,5,6,7,8]]
]

EPOCH_NUM = 200
EMBED_SIZE = 10
HIDDEN_SIZE =10
BATCH_SIZE = 5
N = 5
C = 3
Q = 2
vocab_size = 9 # ABCDEFGHI

# x => t
# [0,1,2,3]+[] => 4
# [0,1,2,3]+[4] => 5
# [0,1,2,3]+[4,5] => 6
# [0,1,2,3]+[4,5,6] => 7
# [0,1,2,3]+[5,6,7] => 8

train_x = [
    [0,1,2,3,-1,-1,-1],
    [0,1,2,3,-1,-1,4],
    [0,1,2,3,-1,4,5],
    [0,1,2,3,4,5,6],
    [0,1,2,3,5,6,7]
]
train_t = [ # 予測する出力文の次の文字
    [4],
    [5],
    [6],
    [7],
    [8]
]
train_x = np.array(train_x, dtype="int32")
train_t = np.array(train_t, dtype="int32")

model = ABS(vocab_size=vocab_size, embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, C=C, Q=Q)

In [8]:
st = datetime.datetime.now()
for epoch in range(EPOCH_NUM):
    
    opt = chainer.optimizers.Adam()
    opt.setup(model)
    total_loss = 0
    
    for x, t in zip(train_x, train_t): # オンライン学習
        model.reset()
        loss = model(x=x, t=t, train=True)
        loss.backward()
        loss.unchain_backward()
        total_loss += loss.data
        opt.update()
        #opt.zero_grads()
        
    if (epoch+1)%50 == 0:
        ed = datetime.datetime.now()
        print("epoch:\t{}\ttotal loss:\t{}\ttime:\t{}".format(epoch+1, total_loss, ed-st))
        st = datetime.datetime.now()

epoch:	50	total loss:	5.27961802482605	time:	0:00:01.301282
epoch:	100	total loss:	3.247352123260498	time:	0:00:01.374284
epoch:	150	total loss:	1.960340142250061	time:	0:00:01.375270
epoch:	200	total loss:	1.1147116422653198	time:	0:00:01.375921


In [9]:
for x in train_x:
    y = model(x=x)
    print(x, y)

[ 0  1  2  3 -1 -1 -1] 4
[ 0  1  2  3 -1 -1  4] 5
[ 0  1  2  3 -1  4  5] 6
[0 1 2 3 4 5 6] 7
[0 1 2 3 5 6 7] 8
