# 第11回講義 演習

In [None]:
from __future__ import division
from collections import OrderedDict, Counter
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(42)
trng = RandomStreams(42)

In [None]:
# NOTE(review): this shell command looks incomplete -- no archive name or file
# list follows the flags. `czvf` creates an archive; if the intent is to unpack
# the exercise data, `xzvf <archive>` is presumably what was meant -- confirm.
!tar czvf 

## 課題1. Word2vec

In [None]:
class Corpus:
    """Re-iterable stream of tokenised sentences read from a text file.

    Every line of the file is split on the ``|||`` delimiter and only the
    first field is kept, tokenised on whitespace.  Because the file is
    reopened on each call to ``__iter__``, the object can be iterated any
    number of times.
    """

    def __init__(self, file_path):
        """Remember the path of the corpus file; nothing is read yet."""
        self.file_path = file_path

    def __iter__(self):
        """Yield one sentence per line as a list of tokens.

        Return format : ['i', 'have', 'a', 'pen']
        """
        # The context manager closes the handle when iteration finishes or
        # when the generator is discarded, instead of leaking it.
        with open(self.file_path) as corpus_file:
            for line in corpus_file:
                # Only the first '|||'-separated field is used, so the
                # remaining fields are not tokenised.
                yield line.split('|||')[0].split()

In [None]:
# Write an iterator that returns sentences (here: the Corpus class over 'train.unk')
sentences = Corpus('train.unk')

In [None]:
# sg -> skip-gram switch, hs -> hierarchical softmax switch (not explained in the lecture),
# negative -> negative sample size
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4, sg=1, hs=0, negative=100)

In [None]:
# Similarity score between the learned vectors of two words
model.similarity('woman', 'man')

In [None]:
# Alternative query kept for reference (combines positive and negative words):
# model.most_similar(positive=['bank', 'company'], negative=['money'])
# Words most similar to 'Morgan'
model.most_similar(positive=['Morgan'])


## 課題2. Recurrent Neural Network (RNN) Encoder-Decoderモデルで中英翻訳

### 1. データセットの読み込みと単語のID化

train.zh-enの中身 (中国語の文 ||| 英語の文)
```
<s> 我 能 赶上 去 UNK 饭店 的 巴士 吗 ? </s> ||| <s> can i catch a bus that goes to the hilton hotel ? </s>
<s> 有 去 市里 的 火车 吗 ? </s> ||| <s> is there a train that goes to the city ? </s>
<s> 在 UNK 下面 。 </s> ||| <s> it 's just down the hall . </s>
...
```

In [None]:
def build_vocab(file_path):
    """Build source/target vocabularies and word-to-ID maps from a parallel corpus.

    Each non-blank line must have the form ``source ||| target``.  The first
    and last token of each side (the sentence markers in the corpus) are
    dropped before the words are collected.

    Args:
        file_path: path of the parallel corpus file.

    Returns:
        A tuple ``(f_vocab, e_vocab, f_w2i, e_w2i)`` where the vocabularies
        are sets of words (including ``'<s>'`` and ``'</s>'``) and the
        ``*_w2i`` dicts map each word to an ``np.int32`` ID.  ``'<s>'`` is
        always 0, ``'</s>'`` is always 1, and the remaining words get
        consecutive IDs starting at 2 in sorted order.

    Raises:
        ValueError: if a non-blank line does not contain exactly one ``|||``.
    """
    f_vocab, e_vocab = set(), set()
    # Close the file deterministically instead of relying on garbage collection.
    with open(file_path) as corpus_file:
        for line in corpus_file:
            if not line.strip():
                # A blank (e.g. trailing) line carries no sentence pair.
                continue
            f, e = [l.strip().split()[1:-1] for l in line.split('|||')]
            f_vocab.update(f)
            e_vocab.update(e)

    # The markers get the reserved IDs 0 and 1 below.  Removing them here keeps
    # the ID range contiguous even if a marker occurs inside a sentence.
    markers = ('<s>', '</s>')
    f_vocab.difference_update(markers)
    e_vocab.difference_update(markers)

    # Sort before numbering so the IDs do not depend on set iteration order
    # and are therefore reproducible across runs.
    f_w2i = {w: np.int32(i + 2) for i, w in enumerate(sorted(f_vocab))}
    e_w2i = {w: np.int32(i + 2) for i, w in enumerate(sorted(e_vocab))}

    f_w2i['<s>'], f_w2i['</s>'] = np.int32(0), np.int32(1)
    e_w2i['<s>'], e_w2i['</s>'] = np.int32(0), np.int32(1)
    return set(f_w2i), set(e_w2i), f_w2i, e_w2i
    
def encode(sentence, vocab, w2i):
    """Convert a list of tokens into a list of IDs.

    Tokens found in ``vocab`` are looked up in ``w2i``; every other token is
    replaced by the ID stored under ``'UNK'`` in ``w2i``.
    """
    return [w2i[token] if token in vocab else w2i['UNK'] for token in sentence]
    
def decode(encoded_sentence, w2i):
    """Convert a list of IDs back into words by inverting ``w2i``."""
    id_to_word = dict((idx, word) for word, idx in w2i.items())
    return [id_to_word[idx] for idx in encoded_sentence]
    
def load_data(file_path, f_vocab, e_vocab, f_w2i, e_w2i):
    """Read a parallel corpus and encode both sides as lists of word IDs.

    Each non-blank line must have the form ``source ||| target``.  All
    whitespace-separated tokens of each side are kept and encoded with
    ``encode``.

    Args:
        file_path: path of the parallel corpus file.
        f_vocab, e_vocab: source / target vocabularies.
        f_w2i, e_w2i: source / target word-to-ID maps.

    Returns:
        ``(x, y)``: two parallel lists; ``x[k]`` is the encoded source
        sentence and ``y[k]`` the encoded target sentence of the k-th pair.

    Raises:
        ValueError: if a non-blank line does not contain exactly one ``|||``.
    """
    x, y = [], []
    # Close the file deterministically instead of relying on garbage collection.
    with open(file_path) as corpus_file:
        for line in corpus_file:
            if not line.strip():
                # A blank (e.g. trailing) line carries no sentence pair.
                continue
            f, e = [l.strip().split() for l in line.split('|||')]
            x.append(encode(f, f_vocab, f_w2i))
            y.append(encode(e, e_vocab, e_w2i))
    return x, y

# Build the vocabularies from the corpus, then encode the same corpus as ID sequences.
f_vocab, e_vocab, f_w2i, e_w2i = build_vocab('./train.zh-en')
train_X, train_y = load_data('./train.zh-en', f_vocab, e_vocab, f_w2i, e_w2i)
# Hold out 20% of the sentence pairs as test data (fixed seed for a repeatable split).
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

### 2. 単語のembedding

In [None]:
def sharedX(X, name=None, dtype="float32"):
    """Wrap ``X`` in a Theano shared variable after converting it to ``dtype``."""
    value = np.array(X, dtype=dtype)
    return theano.shared(value, name=name)

class Projection:
    """Embedding layer: a randomly initialised lookup table indexed by ``x``."""

    def __init__(self, in_dim, out_dim, scale):
        """Create the ``(in_dim, out_dim)`` table ``V`` from scaled Gaussian noise."""
        initial_table = rng.randn(in_dim, out_dim) * scale
        self.V = sharedX(initial_table, name='V')
        self.params = [self.V]

    def f_prop(self, x):
        """Return the rows of ``V`` selected by the indices in ``x``."""
        return self.V[x]

### 3. Long short-term memory (LSTM)

LSTMの構造はスライド参照

- 入力ゲート: $\hspace{20mm}i_t = \sigma \left( W_{xi} x_t + W_{hi} h_{t-1} + W_{ci} c_{t-1} + b_i \right)$
- 忘却ゲート: $\hspace{20mm}f_t = \sigma \left( W_{xf} x_t + W_{hf} h_{t-1} + W_{cf} c_{t-1} + b_f \right)$  
- セル:　　　 $\hspace{20mm}c_t = f_t c_{t-1} + i_t \tanh \left( W_{xc} x_t + W_{hc} h_{t-1} + b_c \right)$  
- 出力ゲート: $\hspace{20mm}o_t = \sigma \left( W_{xo} x_t + W_{ho} h_{t-1} + W_{co} c_{t} + b_o \right)$  
- 隠れ層: 　　$\hspace{20mm}h_t = o_t\tanh \left( c_t \right)$

In [None]:
class LSTM:
    """LSTM layer with peephole connections, iterated over time with theano.scan.

    Implements, for each time step t:

        i_t = sigmoid(x_t W_xi + h_{t-1} W_hi + c_{t-1} W_ci + b_i)
        f_t = sigmoid(x_t W_xf + h_{t-1} W_hf + c_{t-1} W_cf + b_f)
        c_t = f_t * c_{t-1} + i_t * tanh(x_t W_xc + h_{t-1} W_hc + b_c)
        o_t = sigmoid(x_t W_xo + h_{t-1} W_ho + c_t W_co + b_o)
        h_t = o_t * tanh(c_t)

    NOTE(review): the peephole weights (W_ci, W_cf, W_co) are implemented as
    full (out_dim, out_dim) matrices, reading the lecture formulas literally;
    confirm this against the slides (a diagonal form is also common).
    """

    def __init__(self, in_dim, out_dim, scale, h_0=None, c_0=None):
        """Create the parameters.

        Args:
            in_dim: size of each input vector x_t.
            out_dim: size of the hidden state and of the cell state.
            scale: multiplier applied to the Gaussian initial values.
            h_0: optional initial hidden state; a zero vector if None.
            c_0: optional initial cell state; a zero vector if None.
        """
        #- Input gate
        self.W_xi = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xi')
        self.W_hi = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hi')
        self.W_ci = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_ci')
        self.b_i  = sharedX(rng.randn(out_dim)*scale, name='b_i')

        #- Forget gate
        self.W_xf = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xf')
        self.W_hf = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hf')
        self.W_cf = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_cf')
        self.b_f  = sharedX(rng.randn(out_dim)*scale, name='b_f')

        #- Cell state
        self.W_xc = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xc')
        self.W_hc = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hc')
        self.b_c  = sharedX(rng.randn(out_dim)*scale, name='b_c')

        #- Output gate
        self.W_xo = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xo')
        self.W_ho = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_ho')
        self.W_co = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_co')
        self.b_o  = sharedX(rng.randn(out_dim)*scale, name='b_o')

        #- Initial state
        # The default initial states are zero vectors; they are not listed in
        # self.params, so they are not updated during training.
        if h_0 is None:
            self.h_0 = sharedX(np.zeros(out_dim), name='h_0')
        else:
            self.h_0 = h_0
        if c_0 is None:
            self.c_0 = sharedX(np.zeros(out_dim), name='c_0')
        else:
            self.c_0 = c_0

        self.output_info = [self.h_0, self.c_0]
        self.params = [self.W_xf, self.W_hf, self.W_cf, self.b_f
                       , self.W_xi, self.W_hi, self.W_ci, self.b_i
                       , self.W_xc, self.W_hc, self.b_c
                       , self.W_xo, self.W_ho, self.W_co, self.b_o]

    def f_prop(self, x):
        """Run the LSTM over the leading (time) axis of ``x``.

        Assumes x is (sequence_length, in_dim) -- TODO confirm against the
        layer placed in front of this one.  Returns the hidden states of all
        time steps, one row per step.
        """
        def fn(x, h_tm1, c_tm1):
            # Input gate
            i_t = T.nnet.sigmoid(T.dot(x, self.W_xi) + T.dot(h_tm1, self.W_hi)
                                 + T.dot(c_tm1, self.W_ci) + self.b_i)

            # Forget gate
            f_t = T.nnet.sigmoid(T.dot(x, self.W_xf) + T.dot(h_tm1, self.W_hf)
                                 + T.dot(c_tm1, self.W_cf) + self.b_f)

            # Cell state
            c_t = f_t*c_tm1 + i_t*T.tanh(T.dot(x, self.W_xc)
                                         + T.dot(h_tm1, self.W_hc) + self.b_c)

            # Output gate (its peephole looks at the new cell state c_t)
            o_t = T.nnet.sigmoid(T.dot(x, self.W_xo) + T.dot(h_tm1, self.W_ho)
                                 + T.dot(c_t, self.W_co) + self.b_o)

            # Hidden state
            h_t = o_t*T.tanh(c_t)

            return h_t, c_t

        # scan feeds one step of x at a time and threads (h, c) through the steps.
        [h,c], _ = theano.scan(fn=fn, sequences=[x], outputs_info=self.output_info)

        return h

### 4. 線形層

In [None]:
class Linear:
    """Affine layer computing ``x . W_out + b_out``."""

    def __init__(self, in_dim, out_dim, scale):
        """Draw the weight matrix, then the bias vector, from scaled Gaussian noise."""
        initial_weights = rng.randn(in_dim, out_dim) * scale
        initial_bias = rng.randn(out_dim) * scale
        self.W_out = sharedX(initial_weights, name='W_out')
        self.b_out = sharedX(initial_bias, name='b_out')
        self.params = [self.W_out, self.b_out]

    def f_prop(self, x):
        """Return the affine transform of ``x``."""
        return T.dot(x, self.W_out) + self.b_out

### 5. 活性化層

In [None]:
class Activation:
    """Parameter-free layer that applies a given function to its input."""

    def __init__(self, function):
        """Store the function to apply; the layer has no trainable parameters."""
        self.params = []
        self.function = function

    def f_prop(self, x):
        """Apply the function to ``x``, keep the result in ``self.z`` and return it."""
        activated = self.function(x)
        self.z = activated
        return activated

### 6. 更新則

In [None]:
def sgd(cost, params, eps=np.float32(0.1)):
    """Build plain gradient-descent updates.

    Returns an OrderedDict mapping every parameter to
    ``param - eps * d(cost)/d(param)``, in the order of ``params``.
    """
    gradients = T.grad(cost, params)
    return OrderedDict(
        (param, param - eps*gradient)
        for param, gradient in zip(params, gradients)
    )

### 7. ネットワークの定義

In [None]:
# Symbolic int32 vectors: x receives the encoded source sentence and
# t the encoded target sentence when the compiled functions are called.
x = T.ivector('x')
t = T.ivector('t')

# Target
# t_in is t without its last element (fed to the decoder);
# t_out is t without its first element (compared with the decoder output).
t_in = t[:-1]
t_out = t[1:]

# Layer sizes -- presumably meant for the encoder/decoder layers defined below.
hid_dim = 100
out_dim = 100

def f_props(layers, x):
    """Feed ``x`` through ``layers`` in order and return the last output.

    Each layer's ``f_prop`` receives the previous layer's output; with an
    empty ``layers`` list, ``x`` itself is returned.
    """
    current = x
    for layer in layers:
        current = layer.f_prop(current)
    return current

encoder = [
    # Decide the layer configuration (exercise placeholder)
]

# NOTE(review): while `encoder` is empty, f_props returns x unchanged, so this
# picks the last element of x rather than an encoder state.
h_enc = f_props(encoder, x)[-1] # Take the last state of encoder

decoder = [
    # Decide the layer configuration (exercise placeholder)
]

### 8. train関数とvalid関数とtest関数

In [None]:
def join(layers):
    """Collect the ``params`` of every layer into one new flat list, in layer order."""
    return [param for layer in layers for param in layer.params]

# Decoder output for the input side of the target sentence.
y = f_props(decoder, t_in)
# Mean cross-entropy between the decoder output and the shifted target IDs.
# NOTE(review): this presumably expects each row of y to be a probability
# distribution (e.g. a softmax output) -- confirm once the decoder is defined.
cost = T.mean(T.nnet.categorical_crossentropy(y, t_out))

# Every encoder and decoder parameter is trained with SGD.
params = join(encoder + decoder)
updates = sgd(cost, params)

# train applies the updates; valid and test only evaluate.
# test additionally returns the argmax along axis 1 of y.
train = theano.function(inputs=[x, t], outputs=cost, updates=updates)
valid = theano.function(inputs=[x, t], outputs=cost)
test  = theano.function(inputs=[x, t], outputs=[cost, T.argmax(y, axis=1)])

### 9. 学習

In [None]:
# Train one sentence pair at a time and report the cost every 100 iterations.
epochs = 1
for epoch in range(epochs):
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        train_cost = train(instance_x, instance_y)
        if i % 100 == 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("EPOCH:: %i, Iteration %i, Training Cost: %.3f" % (epoch + 1, i, train_cost))
        if (i + 1) % 5000 == 0:
            # Stop the epoch after 5000 sentence pairs.
            break

### 10. テスト

idからwordへの辞書を作成

In [None]:
# Inverse maps (ID -> word) for the source and target vocabularies.
f_i2w = dict((idx, word) for word, idx in f_w2i.items())
e_i2w = dict((idx, word) for word, idx in e_w2i.items())

テスト

In [None]:
# Translate one held-out sentence pair and show the result.
num = 45
instance_x, instance_y = test_X[num], test_y[num]
test_cost, pred_y = test(instance_x, instance_y)
print("Test Cost: %.3f" % test_cost)
# The source sentence is instance_x (IDs from f_w2i), so it is the one decoded
# with f_i2w; instance_y holds target-side IDs and does not match f_i2w.
print("元の文: %s" % ' '.join([f_i2w[com] for com in instance_x]))
# The predicted IDs belong to the target vocabulary.
print("翻訳文: %s" % ' '.join([e_i2w[com] for com in pred_y]))