# 第11回講義 宿題

## 課題. RNN Encoder-Decoderで中英翻訳のモデルを実装せよ

### 注意

- homework関数を完成させて提出してください
    - 訓練データのtrain_X, train_yのみが与えられます
    - train_X, train_yを (train_X, train_y) と (valid_X, valid_y) に分けるなどしてモデルを学習させてください
    - **test関数を戻り値**としてください (下に書いてあります)
- **test_X, test_yに対する交差エントロピー(負の対数尤度)の平均で評価**します
- 全体の実行時間がiLect上で60分を超えないようにしてください
- homework関数の外には何も書かないでください

次のような内容のコードが事前に実行されます

```python
from __future__ import division
from collections import OrderedDict, Counter
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

# NumPy random generator with a fixed seed (42).
rng = np.random.RandomState(42)
# Theano MRG random stream with a fixed seed (42).
trng = RandomStreams(42)

def build_vocab(file_path):
    """Collect source/target vocabularies from a parallel corpus file.

    Every line is split on ``|||`` into a source side and a target side.
    Each side is whitespace-tokenised and its first and last tokens are
    dropped before the remaining words are added to the vocabulary
    (presumably those are the ``<s>`` / ``</s>`` markers, which are
    registered explicitly below).

    Args:
        file_path: path of the corpus file to read.

    Returns:
        ``(f_vocab, e_vocab, f_w2i, e_w2i)`` where the first two items are
        sets of words and the last two map each word to an ``np.int32`` id.
        ``<s>`` is always id 0 and ``</s>`` id 1; other words get ids from 2
        upwards in set-iteration order.

    Raises:
        ValueError: if a line does not split into exactly two sides.
    """
    f_vocab, e_vocab = set(), set()
    # Context manager: the original iterated over a bare open() and never
    # closed the file handle.
    with open(file_path) as corpus:
        for line in corpus:
            f, e = [l.strip().split()[1:-1] for l in line.split('|||')]
            f_vocab.update(f)
            e_vocab.update(e)

    f_w2i = {w: np.int32(i+2) for i, w in enumerate(f_vocab)}
    e_w2i = {w: np.int32(i+2) for i, w in enumerate(e_vocab)}

    # Reserve the two lowest ids for the sentence boundary markers.
    f_w2i['<s>'], f_w2i['</s>'] = np.int32(0), np.int32(1)
    e_w2i['<s>'], e_w2i['</s>'] = np.int32(0), np.int32(1)
    return set(f_w2i.keys()), set(e_w2i.keys()), f_w2i, e_w2i
    
def encode(sentence, vocab, w2i):
    """Convert a sequence of words into a list of ids.

    A word that is a member of ``vocab`` is looked up in ``w2i``; any other
    word is replaced by ``w2i['UNK']`` (``KeyError`` if that entry is absent).
    """
    return [w2i[word] if word in vocab else w2i['UNK'] for word in sentence]
    
def decode(encoded_sentence, w2i):
    """Convert a sequence of ids back into words.

    The word-to-id mapping ``w2i`` is inverted and every id of
    ``encoded_sentence`` is looked up in the inverse mapping
    (``KeyError`` for an id that has no word).
    """
    i2w = dict((idx, word) for word, idx in w2i.items())
    return [i2w[idx] for idx in encoded_sentence]
    
def load_data(file_path, f_vocab, e_vocab, f_w2i, e_w2i):
    """Read a parallel corpus and encode both sides as id sequences.

    Every line is split on ``|||`` into a source and a target side; each side
    is whitespace-tokenised (all tokens are kept) and passed to ``encode``
    with the matching vocabulary and word-to-id mapping.

    Args:
        file_path: path of the corpus file to read.
        f_vocab, e_vocab: source / target vocabularies.
        f_w2i, e_w2i: source / target word-to-id mappings.

    Returns:
        ``(x, y)``: two parallel lists holding the encoded source and target
        sentences, in file order.

    Raises:
        ValueError: if a line does not split into exactly two sides.
    """
    x, y = [], []
    # Context manager: the original iterated over a bare open() and never
    # closed the file handle.
    with open(file_path) as corpus:
        for line in corpus:
            f, e = [l.strip().split() for l in line.split('|||')]
            x.append(encode(f, f_vocab, f_w2i))
            y.append(encode(e, e_vocab, e_w2i))
    return x, y

# f_vocab / e_vocab are declared global here; homework() reads them as globals.
global f_vocab
global e_vocab

# Build the vocabularies and encode the whole corpus, then hold out 20% as the
# test set. NOTE(review): dataset_path is not defined in the visible code —
# presumably supplied by the grading environment.
f_vocab, e_vocab, f_w2i, e_w2i = build_vocab(dataset_path)
train_X, train_y = load_data(dataset_path, f_vocab, e_vocab, f_w2i, e_w2i)
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2, random_state=??) # random_state is kept secret
```

次のセルのhomework関数を完成させて提出してください
- **上記のコード以外で必要なもの**は全て書いてください

In [1]:
def homework(train_X, train_y):
    """Train an LSTM encoder-decoder translation model and return a scorer.

    The given data is split 80/20 into a training and a validation part.
    The model is trained one sentence pair at a time with Adam for
    ``max_epoch`` epochs; running training cost, epoch duration and
    validation cost are printed along the way.

    Relies on names provided by the surrounding setup code: ``np``,
    ``theano``, ``T``, ``rng``, ``OrderedDict``, ``shuffle``,
    ``train_test_split`` and the globals ``f_vocab`` / ``e_vocab``.

    Args:
        train_X: list of source sentences, each a sequence of word ids.
        train_y: list of target sentences, each a sequence of word ids.

    Returns:
        A compiled Theano function ``test(x, t)`` returning the mean
        categorical cross-entropy of target sentence ``t`` given source
        sentence ``x``.
    """
    import time

    train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

    def sharedX(X, name=None, dtype="float32"):
        """Wrap ``X`` in a Theano shared variable of the given dtype."""
        return theano.shared(np.array(X, dtype=dtype), name=name)

    # 1. word embedding
    class Projection:
        """Embedding table: indexes rows of ``V`` by word id."""

        def __init__(self, in_dim, out_dim, scale):
            self.V = sharedX(rng.randn(in_dim, out_dim) * scale, name='V')
            self.params = [self.V]

        def f_prop(self, x):
            return self.V[x]

    # 2. LSTM
    class LSTM:
        """LSTM layer whose gates also see the cell state (peephole terms).

        The peephole weights ``W_ci`` / ``W_cf`` / ``W_co`` are full
        ``(out_dim, out_dim)`` matrices applied with ``T.dot``.
        ``h_0`` / ``c_0`` default to zero vectors when not supplied.
        """

        def __init__(self, in_dim, out_dim, scale, h_0=None, c_0=None):
            # NOTE: the creation order below fixes the order of draws from
            # ``rng`` and therefore the initial weights; do not reorder.

            #- Input gate
            self.W_xi = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xi')
            self.W_hi = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hi')
            self.W_ci = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_ci')
            self.b_i  = sharedX(rng.randn(out_dim)*scale, name='b_i')

            #- Forget gate
            self.W_xf = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xf')
            self.W_hf = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hf')
            self.W_cf = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_cf')
            self.b_f  = sharedX(rng.randn(out_dim)*scale, name='b_f')

            #- Cell state
            self.W_xc = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xc')
            self.W_hc = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hc')
            self.b_c  = sharedX(rng.randn(out_dim)*scale, name='b_c')

            #- Output gate
            self.W_xo = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xo')
            self.W_ho = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_ho')
            self.W_co = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_co')
            self.b_o  = sharedX(rng.randn(out_dim)*scale, name='b_o')

            #- Initial state
            if h_0 is None:
                self.h_0 = sharedX(np.zeros(out_dim), name='h_0')
            else:
                self.h_0 = h_0
            if c_0 is None:
                self.c_0 = sharedX(np.zeros(out_dim), name='c_0')
            else:
                self.c_0 = c_0

            self.output_info = [self.h_0, self.c_0]
            # The initial states are deliberately not part of the trained
            # parameters.
            self.params = [self.W_xf, self.W_hf, self.W_cf, self.b_f
                           , self.W_xi, self.W_hi, self.W_ci, self.b_i
                           , self.W_xc, self.W_hc, self.b_c
                           , self.W_xo, self.W_ho, self.W_co, self.b_o]

        def f_prop(self, x):
            """Scan over the sequence ``x`` and return all hidden states."""
            def fn(x, h_tm1, c_tm1):
                # Input gate
                i_t = T.nnet.sigmoid(T.dot(x, self.W_xi) + T.dot(h_tm1, self.W_hi) + T.dot(c_tm1, self.W_ci)
                                     + self.b_i)

                # Forget gate
                f_t = T.nnet.sigmoid(T.dot(x, self.W_xf) + T.dot(h_tm1, self.W_hf) + T.dot(c_tm1, self.W_cf)
                                     + self.b_f)

                # Cell state
                c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)

                # Output gate (looks at the *new* cell state c_t)
                o_t = T.nnet.sigmoid(T.dot(x, self.W_xo) + T.dot(h_tm1, self.W_ho) + T.dot(c_t, self.W_co)
                                     + self.b_o)

                # Hidden state
                h_t = o_t * T.tanh(c_t)

                return h_t, c_t

            [h, c], _ = theano.scan(fn=fn,
                                    sequences=[x],
                                    outputs_info=self.output_info)

            return h

    # 3. FC-layer
    class Linear:
        """Affine layer: ``x . W_out + b_out``."""

        def __init__(self, in_dim, out_dim, scale):
            self.W_out = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_out')
            self.b_out = sharedX(rng.randn(out_dim,)*scale, name='b_out')
            self.params = [self.W_out, self.b_out]

        def f_prop(self, x):
            return T.dot(x, self.W_out) + self.b_out

    # 4. activation function
    class Activation:
        """Parameter-free layer applying ``function`` to its input."""

        def __init__(self, function):
            self.function = function
            self.params = []

        def f_prop(self, x):
            self.z = self.function(x)
            return self.z

    # 5. optimization
    def sgd(cost, params, eps=np.float32(0.1)):
        """Plain SGD updates (alternative optimiser; Adam is used below)."""
        g_params = T.grad(cost, params)
        updates = OrderedDict()
        for param, g_param in zip(params, g_params):
            updates[param] = param - eps*g_param
        return updates

    def Adam(params, g_params, lr=0.001, b1=0.1, b2=0.001, e=1e-8):
        """Build Adam updates for ``params`` given their gradients.

        ``b1`` / ``b2`` weight the *new* gradient in the moving averages
        (``m_t = b1*g + (1-b1)*m``), i.e. they are one minus the decay rates.
        """
        updates = []
        i = theano.shared(np.float32(0.))
        i_t = i + 1.
        # Bias-correction factors for the first and second moment estimates.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, g_params):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates

    # 6. model definition
    x = T.ivector('x')
    t = T.ivector('t')

    # Target: the decoder reads t[:-1] and is trained to predict t[1:].
    t_in = t[:-1]
    t_out = t[1:]

    emb_dim = 500
    hid_dim = 100

    def f_props(layers, x):
        """Feed ``x`` through ``layers`` in order and return the last output."""
        layer_out = x
        for layer in layers:
            layer_out = layer.f_prop(layer_out)
        return layer_out

    encoder = [
        Projection(len(f_vocab), emb_dim, scale=0.01),
        LSTM(emb_dim, hid_dim, 0.01),
    ]

    h_enc = f_props(encoder, x)[-1] # Take the last state of encoder

    # Only the decoder's initial hidden state comes from the encoder; its
    # initial cell state stays at the zero default.
    decoder = [
        Projection(len(e_vocab), emb_dim, scale=0.01),
        LSTM(emb_dim, hid_dim, 0.01, h_0=h_enc),
        Linear(hid_dim, len(e_vocab), 0.01),
        Activation(T.nnet.softmax)
    ]

    # 7. compile theano function
    def join(layers):
        """Concatenate the ``params`` lists of all ``layers``."""
        params = []
        for layer in layers:
            params += layer.params
        return params

    y = f_props(decoder, t_in)
    cost = T.mean(T.nnet.categorical_crossentropy(y, t_out))

    params = join(encoder + decoder)
    gparams = T.grad(cost, params)
    updates = Adam(params, gparams, lr=0.001, b1=0.1, b2=0.001, e=1e-8)

    train = theano.function(inputs=[x, t], outputs=cost, updates=updates)
    valid = theano.function(inputs=[x, t], outputs=cost)

    # 8. training loops and validation
    max_epoch = 10
    # Training is per sentence pair; this only controls how often the running
    # average of the training cost is printed.
    report_interval = 1000
    for epoch in xrange(max_epoch):
        # Wall-clock time: time.clock() reports CPU time on Unix and no
        # longer exists in Python 3.8+.
        start = time.time()
        train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
        running_cost = 0
        for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
            running_cost += train(instance_x, instance_y)
            # Report after exactly `report_interval` samples so the average
            # is not skewed (the old `i % n == 0 and i != 0` test summed
            # n + 1 samples in the first window).
            if (i + 1) % report_interval == 0:
                print("EPOCH:: %i, Iteration %i, Training Cost: %.3f"
                      % (epoch + 1, i + 1, running_cost / report_interval))
                running_cost = 0
        print("used time: %.3f" % (time.time() - start))

        valid_cost = 0
        for instance_x, instance_y in zip(valid_X, valid_y):
            valid_cost += valid(instance_x, instance_y)
        print("EPOCH:: %i, validation cost: %.3f" % (epoch + 1, valid_cost / len(valid_X)))

    #- Submit the following lines as they are (do not modify them)
    test = theano.function(inputs=[x, t], outputs=cost)
    return test

In [2]:
from __future__ import division
from collections import OrderedDict, Counter
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

# NumPy random generator with a fixed seed (42); homework() draws all initial
# weights from it.
rng = np.random.RandomState(42)
# Theano MRG random stream with a fixed seed (42); not referenced by the
# visible code.
trng = RandomStreams(42)

def build_vocab(file_path):
    """Collect source/target vocabularies from a parallel corpus file.

    Every line is split on ``|||`` into a source side and a target side.
    Each side is whitespace-tokenised and its first and last tokens are
    dropped before the remaining words are added to the vocabulary
    (presumably those are the ``<s>`` / ``</s>`` markers, which are
    registered explicitly below).

    Args:
        file_path: path of the corpus file to read.

    Returns:
        ``(f_vocab, e_vocab, f_w2i, e_w2i)`` where the first two items are
        sets of words and the last two map each word to an ``np.int32`` id.
        ``<s>`` is always id 0 and ``</s>`` id 1; other words get ids from 2
        upwards in set-iteration order.

    Raises:
        ValueError: if a line does not split into exactly two sides.
    """
    f_vocab, e_vocab = set(), set()
    # Read inside a context manager so the file is closed once the
    # vocabulary has been collected (the bare open() leaked the handle).
    with open(file_path) as corpus:
        for line in corpus:
            f, e = [l.strip().split()[1:-1] for l in line.split('|||')]
            f_vocab.update(f)
            e_vocab.update(e)

    f_w2i = {w: np.int32(i+2) for i, w in enumerate(f_vocab)}
    e_w2i = {w: np.int32(i+2) for i, w in enumerate(e_vocab)}

    # Reserve the two lowest ids for the sentence boundary markers.
    f_w2i['<s>'], f_w2i['</s>'] = np.int32(0), np.int32(1)
    e_w2i['<s>'], e_w2i['</s>'] = np.int32(0), np.int32(1)
    return set(f_w2i.keys()), set(e_w2i.keys()), f_w2i, e_w2i
    
def encode(sentence, vocab, w2i):
    """Translate the words of ``sentence`` into their ids.

    Words contained in ``vocab`` keep their own entry in ``w2i``; all other
    words are mapped through the ``'UNK'`` entry (``KeyError`` if ``w2i`` has
    no such entry).
    """
    ids = []
    for token in sentence:
        key = token if token in vocab else 'UNK'
        ids.append(w2i[key])
    return ids
    
def decode(encoded_sentence, w2i):
    """Translate a sequence of ids back into words.

    Inverts ``w2i`` into an id-to-word table and looks up every id of
    ``encoded_sentence`` in it (``KeyError`` for an unknown id).
    """
    index_to_word = {}
    for word, index in w2i.items():
        index_to_word[index] = word
    return [index_to_word[index] for index in encoded_sentence]
    
def load_data(file_path, f_vocab, e_vocab, f_w2i, e_w2i):
    """Read a parallel corpus and encode both sides as id sequences.

    Every line is split on ``|||`` into a source and a target side; each side
    is whitespace-tokenised (all tokens are kept) and passed to ``encode``
    with the matching vocabulary and word-to-id mapping.

    Args:
        file_path: path of the corpus file to read.
        f_vocab, e_vocab: source / target vocabularies.
        f_w2i, e_w2i: source / target word-to-id mappings.

    Returns:
        ``(x, y)``: two parallel lists holding the encoded source and target
        sentences, in file order.

    Raises:
        ValueError: if a line does not split into exactly two sides.
    """
    x, y = [], []
    # Read inside a context manager so the file is closed after loading
    # (the bare open() leaked the handle).
    with open(file_path) as corpus:
        for line in corpus:
            f, e = [l.strip().split() for l in line.split('|||')]
            x.append(encode(f, f_vocab, f_w2i))
            y.append(encode(e, e_vocab, e_w2i))
    return x, y

def load_dataset(dataset_path):
    """Load the corpus at ``dataset_path`` and split it into train/test parts.

    Builds the vocabularies (stored in the module globals ``f_vocab`` and
    ``e_vocab``), encodes every sentence pair and holds out 20% of the pairs
    as a test set using a fixed ``random_state`` of 42.

    Returns:
        ``(train_X, test_X, train_y, test_y)``.
    """
    global f_vocab, e_vocab

    f_vocab, e_vocab, src_w2i, tgt_w2i = build_vocab(dataset_path)
    all_X, all_y = load_data(dataset_path, f_vocab, e_vocab, src_w2i, tgt_w2i)

    splits = train_test_split(all_X, all_y, test_size=0.2, random_state=42)
    train_part_X, test_part_X, train_part_y, test_part_y = splits
    return train_part_X, test_part_X, train_part_y, test_part_y
    
def check_homework():
    """Train via ``homework`` and return the mean test cross-entropy.

    Loads ``./train.zh-en``, passes the training part to ``homework`` and
    averages the value of the returned scoring function over all held-out
    sentence pairs.
    """
    train_X, test_X, train_y, test_y = load_dataset('./train.zh-en')
    scorer = homework(train_X, train_y)

    losses = []
    for src, tgt in zip(test_X, test_y):
        losses.append(scorer(src, tgt))

    return np.array(losses).mean()

# Run the evaluation only when a `homework` function has been defined.
# Single-argument print(...) produces the same output as the print statement.
if 'homework' in globals():
    result = check_homework()
    print(result)
    print("No Error Occured!")

Using gpu device 1: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 4007)


EPOCH:: 1, Iteration 1000, Training Cost: 5.186
EPOCH:: 1, Iteration 2000, Training Cost: 4.457
EPOCH:: 1, Iteration 3000, Training Cost: 4.234
EPOCH:: 1, Iteration 4000, Training Cost: 4.029
EPOCH:: 1, Iteration 5000, Training Cost: 3.922
EPOCH:: 1, Iteration 6000, Training Cost: 3.847
EPOCH:: 1, Iteration 7000, Training Cost: 3.714
EPOCH:: 1, Iteration 8000, Training Cost: 3.678
EPOCH:: 1, Iteration 9000, Training Cost: 3.622
EPOCH:: 1, Iteration 10000, Training Cost: 3.566
EPOCH:: 1, Iteration 11000, Training Cost: 3.544
EPOCH:: 1, Iteration 12000, Training Cost: 3.576
EPOCH:: 1, Iteration 13000, Training Cost: 3.502
EPOCH:: 1, Iteration 14000, Training Cost: 3.456
EPOCH:: 1, Iteration 15000, Training Cost: 3.441
EPOCH:: 1, Iteration 16000, Training Cost: 3.373
EPOCH:: 1, Iteration 17000, Training Cost: 3.385
EPOCH:: 1, Iteration 18000, Training Cost: 3.339
EPOCH:: 1, Iteration 19000, Training Cost: 3.341
EPOCH:: 1, Iteration 20000, Training Cost: 3.314
EPOCH:: 1, Iteration 21000, T