# Chainerを用いた日英翻訳機


- 参考: http://qiita.com/odashi_t/items/a1be7c4964fbea6a116e

In [None]:
%matplotlib inline
import sys
import codecs
import pickle
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import MeCab
mt = MeCab.Tagger('-Owakati')

import six
from sklearn.datasets import fetch_mldata
from chainer import computational_graph as c
from chainer import cuda, Variable, FunctionSet, optimizers
import chainer.functions as F

# Recurrent Neural Network

## 一般的なmodel
$\begin{align}
{\bf h}_n & = \tanh \bigl( W_{xh} \cdot {\bf x}_n + W_{hh} \cdot {\bf h}_{n-1} \bigr), \\
{\bf y}_n & = {\rm softmax} \bigl( W_{hy} \cdot {\bf h}_n \bigr)
\end{align}$

## 今回実装するmodel
$\begin{align}
{\bf i}_n & = \tanh \bigl( W_{xi} \cdot {\bf x}_n \bigr), \\
{\bf p}_n & = {\rm LSTM} \bigl( W_{ip} \cdot {\bf i}_n + W_{pp} \cdot {\bf p}_{n-1} \bigr), \\
{\bf q}_1 & = {\rm LSTM} \bigl( W_{pq} \cdot {\bf p}_{|{\bf w}|} \bigr), \\
{\bf q}_m & = {\rm LSTM} \bigl( W_{yq} \cdot {\bf y}_{m-1} + W_{qq} \cdot {\bf q}_{m-1} \bigr), \\
{\bf j}_m & = \tanh \bigl( W_{qj} \cdot {\bf q}_m \bigr), \\
{\bf y}_m & = {\rm softmax} \bigl( W_{jy} \cdot {\bf j}_m \bigr).
\end{align}
$

## Log-probability of a sentence (word sequence)

$ \begin{align}
\log {\rm Pr} \bigl( {\bf w} \bigr) & = \sum_{n=1}^{|{\bf w}|} \log {\rm Pr} \bigl( w_n \ \big| \ w_1, w_2, \cdots, w_{n-1} \bigr) \\
& = \sum_{n=1}^{|{\bf w}|} \log {\bf y}_n\big[ {\rm index} \bigl( w_n \bigr) \big]
\end{align} $


# load data

In [None]:
# Read the parallel corpora. The Japanese files are decoded as UTF-8; the
# English files are read with the plain built-in open(). `with` guarantees
# the handles are closed even if read() raises.
with codecs.open('/home/hitoshi/train1000.ja', 'r', 'utf_8') as f:
  ja_data = f.read()
with open("/home/hitoshi/train1000.en") as f:
  en_data = f.read()
with codecs.open('/home/hitoshi/test10.ja', 'r', 'utf_8') as f:
  ja_test = f.read()
with open("/home/hitoshi/test10.en") as f:
  en_test = f.read()


def _wakati(ja_sentence):
  # Segment one Japanese sentence into words with the MeCab tagger `mt`.
  # Spaces are stripped first; the last element of the split output is dropped
  # (presumably MeCab's trailing newline token -- verify against the tagger output).
  parsed = mt.parse(ja_sentence.replace(" ", "").encode('utf-8')).decode('utf-8')
  return parsed.split(" ")[0:-1]


# English is split on single spaces, one sentence per line.
en_sentences = [en_sentence.split(" ") for en_sentence in en_data.split("\n")]
en_sentences_test = [en_sentence.split(" ") for en_sentence in en_test.split("\n")]
# Vocabulary = every word seen in train or test, minus the empty token.
# A set comprehension flattens in linear time (sum(lists, []) is quadratic).
en_words_set = {word for sentence in en_sentences + en_sentences_test for word in sentence}
en_words_set.discard('')

ja_sentences = [_wakati(ja_sentence) for ja_sentence in ja_data.split("\n")]
ja_sentences_test = [_wakati(ja_sentence) for ja_sentence in ja_test.split("\n")]
ja_words_set = {word for sentence in ja_sentences + ja_sentences_test for word in sentence}
ja_words_set.discard('')

# Assign consecutive integer IDs to the words, then invert those same tables so
# that the word->id and id->word mappings agree by construction.
ja_word_to_id = dict(zip(ja_words_set, range(len(ja_words_set))))
en_word_to_id = dict(zip(en_words_set, range(len(en_words_set))))
id_to_ja_word = dict((word_id, word) for word, word_id in ja_word_to_id.items())
id_to_en_word = dict((word_id, word) for word, word_id in en_word_to_id.items())

# Raw (untokenized) test sentences, one string per line.
ja_test_sentences = ja_test.split("\n")
en_test_sentences = en_test.split("\n")

In [None]:
# Source side: one extra ID beyond the known words; forward() uses the last ID
# (SRC_VOCAB_SIZE - 1) for out-of-vocabulary words. The embedding width is set
# equal to the vocabulary size.
SRC_VOCAB_SIZE = SRC_EMBED_SIZE = len(ja_word_to_id) + 1
HIDDEN_SIZE = 100
# Target side: two extra IDs -- END_OF_SENTENCE (== len(en_word_to_id)) and the
# out-of-vocabulary ID (TRG_VOCAB_SIZE - 1).
TRG_VOCAB_SIZE = TRG_EMBED_SIZE = len(en_word_to_id) + 2
END_OF_SENTENCE = len(en_word_to_id)

# Links feeding F.lstm are sized 4 * HIDDEN_SIZE.
model = FunctionSet(
  # input layer (one-hot) -> input embedding layer
  w_xi=F.EmbedID(SRC_VOCAB_SIZE, SRC_EMBED_SIZE),
  # input embedding layer -> encoder hidden layer
  w_ip=F.Linear(SRC_EMBED_SIZE, 4 * HIDDEN_SIZE),
  # encoder hidden layer -> encoder hidden layer (recurrence)
  w_pp=F.Linear(HIDDEN_SIZE, 4 * HIDDEN_SIZE),
  # encoder hidden layer -> decoder hidden layer
  w_pq=F.Linear(HIDDEN_SIZE, 4 * HIDDEN_SIZE),
  # output layer (one-hot) -> decoder hidden layer
  w_yq=F.EmbedID(TRG_VOCAB_SIZE, 4 * HIDDEN_SIZE),
  # decoder hidden layer -> decoder hidden layer (recurrence)
  w_qq=F.Linear(HIDDEN_SIZE, 4 * HIDDEN_SIZE),
  # decoder hidden layer -> output embedding layer
  w_qj=F.Linear(HIDDEN_SIZE, TRG_EMBED_SIZE),
  # output embedding layer -> output layer (unnormalized scores)
  w_jy=F.Linear(TRG_EMBED_SIZE, TRG_VOCAB_SIZE),
)

# src_sentence: word sequence to translate, e.g. ['彼', 'は', '走る']
# trg_sentence: word sequence of the reference translation, e.g. ['he', 'runs']
def forward(src_sentence, trg_sentence, model, training):
  """Encode src_sentence, then either score trg_sentence or decode greedily.

  Args:
    src_sentence: iterable of source (Japanese) words.
    trg_sentence: iterable of target (English) words; it only contributes to
      the result when training is true.
    model: FunctionSet providing the w_xi, w_ip, w_pp, w_pq, w_yq, w_qq, w_qj
      and w_jy links.
    training: if true return the loss, otherwise return a translation.

  Returns:
    When training: the sum of the softmax cross-entropy losses over the target
    words followed by END_OF_SENTENCE. Otherwise: the list of generated target
    words (at most 100), without the end-of-sentence symbol.
  """
  # Words outside the vocabulary map to the last, reserved ID of each side.
  src_unknown = SRC_VOCAB_SIZE - 1
  trg_unknown = TRG_VOCAB_SIZE - 1
  src_ids = [ja_word_to_id.get(word, src_unknown) for word in src_sentence]
  trg_ids = [en_word_to_id.get(word, trg_unknown) for word in trg_sentence] + [END_OF_SENTENCE]

  c = Variable(np.zeros((1, HIDDEN_SIZE), dtype=np.float32))  # initial LSTM cell state

  # --- encoder ---
  # The start marker must be a valid row of the *source* embedding w_xi.
  # END_OF_SENTENCE is a target-vocabulary ID and can exceed SRC_VOCAB_SIZE - 1,
  # so the reserved source ID is used instead.
  x = Variable(np.array([src_unknown], dtype=np.int32))
  i = F.tanh(model.w_xi(x))
  c, p = F.lstm(c, model.w_ip(i))
  # The source sentence is fed in reverse order.
  for word in reversed(src_ids):
    x = Variable(np.array([[word]], dtype=np.int32))  # next input layer
    i = F.tanh(model.w_xi(x))
    c, p = F.lstm(c, model.w_ip(i) + model.w_pp(p))

  # --- encoder -> decoder ---
  c, q = F.lstm(c, model.w_pq(p))

  # --- decoder ---
  if training:
    # Teacher forcing: the reference word is both the prediction target and
    # the next decoder input.
    accum_loss = np.zeros((), dtype=np.float32)
    for word in trg_ids:
      j = F.tanh(model.w_qj(q))
      y = model.w_jy(j)
      t = Variable(np.array([word], dtype=np.int32))
      accum_loss = accum_loss + F.softmax_cross_entropy(y, t)
      c, q = F.lstm(c, model.w_yq(t) + model.w_qq(q))
    return accum_loss

  # Prediction: the word generated at each step becomes the next input.
  # The highest-scoring word is picked directly from y; softmax is not needed
  # because it does not change the argmax.
  hyp_sentence = []
  while len(hyp_sentence) < 100:  # never generate more than 100 words
    j = F.tanh(model.w_qj(q))
    y = model.w_jy(j)
    word = y.data.argmax(1)[0]
    if word == END_OF_SENTENCE:
      break  # end-of-sentence symbol generated
    # The reserved unknown-word ID has no entry in id_to_en_word; emit a
    # placeholder instead of raising KeyError.
    hyp_sentence.append(id_to_en_word.get(word, '<unk>'))
    s_y = Variable(np.array([word], dtype=np.int32))
    c, q = F.lstm(c, model.w_yq(s_y) + model.w_qq(q))
  return hyp_sentence
def train(ja_sentences, en_sentences, model):
  """Make one pass over the paired sentences, updating model after each pair.

  ja_sentences and en_sentences are walked in lockstep with zip(); each pair
  is passed to forward() in training mode and the resulting loss is
  back-propagated.
  """
  # A fresh SGD optimizer is created on every call (the original author notes
  # that optimizers.Adam() is also a good choice).
  sgd = optimizers.SGD()
  sgd.setup(model)
  for src_words, trg_words in zip(ja_sentences, en_sentences):
    sgd.zero_grads()  # reset the gradients
    loss = forward(src_words, trg_words, model, training=True)  # forward pass
    loss.backward()  # backpropagation
    sgd.clip_grads(10)  # suppress overly large gradients
    sgd.update()

In [None]:
for i in range(0,10):
  print i
  train(ja_sentences,en_sentences,model)
  hyp_sentence = forward(ja_test_sentences[0],en_test_sentences[0],model, training = False)
  text = ""
  for w in ja_test_sentences[0]:
    text = text + w
  print "=====問題======",text
  print "=====正解======",en_test_sentences[0]
  print "=====予測======",hyp_sentence

In [None]:
for i in range(0,15):
  print i
  train(ja_sentences,en_sentences,model)
  hyp_sentence = forward(ja_test_sentences[1],en_test_sentences[1],model, training = False)
  text = ""
  for w in ja_test_sentences[1]:
    text = text + w
  print "=====問題======",text
  print "=====正解======",en_test_sentences[1]
  print "=====予測======",hyp_sentence