# 第10回講義 演習

In [3]:
from __future__ import division
from collections import OrderedDict
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.cross_validation import train_test_split
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(42)
trng = RandomStreams(42)

Using gpu device 0: GRID K520 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 4007)


## 課題1. [Scan](http://deeplearning.net/software/theano/library/scan.html)

Theanoではループのためにfor文ではなく, Scan というものを使います.  
少しややこしいので, 簡単な例を

In [2]:
#- Suppose you have a sequence [1, 2, 3, 4, 5] let's define indentity function with scan
x = T.fvector('x')

def step(x):
    return x

h, _ = theano.scan(fn=step,
                         sequences=x,
                         outputs_info=None)

func = theano.function(inputs=[x], outputs=h)

In [3]:
func(np.array([1, 2, 3, 4, 5], dtype='float32'))

array([ 1.,  2.,  3.,  4.,  5.], dtype=float32)

In [4]:
#- Next we define accumulation function
x = T.fvector('x')

def step(x, h_tm1):
    return x + h_tm1

h, _ = theano.scan(fn=step,
                         sequences=x,
                         outputs_info=0.0)

func = theano.function(inputs=[x], outputs=h)

In [5]:
func(np.array([1, 2, 3, 4, 5], dtype='float32'))

array([  1.,   3.,   6.,  10.,  15.], dtype=float32)

In [6]:
#- Let's do the same thing with matrix, accumulation over column
x = T.fmatrix('x')

def step(x, h_tm1):
    return x + h_tm1

h, _ = theano.scan(fn=step,
                         sequences=x,
                         outputs_info=np.zeros(5, dtype='float32'))

func = theano.function(inputs=[x], outputs=h)

In [7]:
func(np.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]], dtype='float32'))

array([[  1.,   2.,   3.,   4.,   5.],
       [  2.,   4.,   6.,   8.,  10.],
       [  3.,   6.,   9.,  12.,  15.]], dtype=float32)

In [8]:
#- Advanced :: take previous inputs
x = T.fmatrix('x')

def step(x, h_tm1, h_tm2):
    return x + h_tm1 + h_tm2

h, _ = theano.scan(fn=step,
                         sequences=[dict(input=x, taps=[0, -1, -2])],
                         outputs_info=None)

func = theano.function(inputs=[x], outputs=h)

In [9]:
func(np.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]], dtype='float32'))

array([[  3.,   6.,   9.,  12.,  15.],
       [  3.,   6.,   9.,  12.,  15.],
       [  3.,   6.,   9.,  12.,  15.]], dtype=float32)

## 課題2. Recurrent Neural Network (RNN) で Part-of-speech (POS) tagging

文が与えられた時, その品詞を予測するRNNを学習します.

word2indexは単語をIDに変換する辞書, tag2indexは品詞をIDに変換する辞書です.  
train_data, valid_dataには文と品詞タグのペアが入っています.  
文の長さと品詞タグの長さは必ず同じです.

encode_datasetを使うと単語と品詞をIDに変換することができます.

### 1. データセットの読み込みと単語・品詞のID化

In [4]:
def load_data(file_path):
    dataset = []
    vocab, tag = set(), set()
    for line in open(file_path):
        instance = [l.strip().split() for l in line.split('|||')]
        vocab.update(instance[0])
        tag.update(instance[1])
        dataset.append(instance)
    return dataset, vocab, tag

def encode_dataset(dataset, word2index, tag2index):
    X, y = [], []
    vocab = set(word2index.keys())
    for sentence, tags in dataset:
        X.append([word2index[word] if word in vocab else word2index['<unk>'] for word in sentence])
        y.append([tag2index[tag] for tag in tags])
    return X, y

train_data, train_vocab, train_tags = load_data('train.unk')
special_words = set(['<unk>'])

word2index = dict(map(lambda x: (x[1], x[0]), enumerate(train_vocab | special_words)))
tag2index  = dict(map(lambda x: (x[1], x[0]), enumerate(train_tags)))

train_X, train_y = encode_dataset(train_data, word2index, tag2index)
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

教師ラベルのone-hot化

In [5]:
classes = np.arange(len(tag2index))
train_y = [label_binarize(instance_y, classes).astype('int32') for instance_y in train_y]

train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

今回の入力は単語のID列（ベクトルx）と品詞のID列（ベクトルy）です.  
Projectionレイヤーを使って, 単語をベクトルに変換します.  
その後, RNNに入力し, その出力値をSoftmax関数を使って確率分布に変換します.  
予測は画像の時とおなじく, 最大の確率を持つクラスを予測とします.

### 1. 単語のEmbedding

In [6]:
def sharedX(X, name='', dtype="float32"):
    return theano.shared(np.array(X, dtype=dtype), name=name)

class Projection:
    def __init__(self, in_dim, out_dim, scale):
        self.V = sharedX(rng.randn(in_dim, out_dim) * scale, name='V')
        self.params = [self.V]

    def f_prop(self, x):
        x_emb = self.V[x]
        return x_emb

### 2. RNN

In [7]:
class RNN:
    def __init__(self, in_dim, hid_dim):
        self.hid_dim = hid_dim

        #- 重みの次元を決める
        self.W_in  = sharedX(rng.randn(in_dim, hid_dim) * np.sqrt(2./in_dim), name='W_in')
        self.W_rec = sharedX(rng.randn(hid_dim, hid_dim) * np.sqrt(2./hid_dim), name='W_rec')
        # consider how to initialize b
        self.b_rec = sharedX(np.zeros(hid_dim), name='b_rec')
        self.h_0   = sharedX(np.zeros(hid_dim), name='h_0')

        self.output_info = [self.h_0]
        self.params = [self.W_in, self.W_rec, self.b_rec]

    def f_prop(self, x):
        def step(x, h_tm1):
            h = T.dot(x, self.W_in) + T.dot(h_tm1, self.W_rec) + self.b_rec
            return h

        #- Scanの方法を考える
        h, _ = theano.scan(fn=step,
                         sequences=[x],
                         outputs_info=self.output_info)
        return h

### 3. 線形層

In [8]:
class Linear:
    def __init__(self, in_dim, out_dim):
        self.W_out = sharedX(rng.randn(in_dim, out_dim) * np.sqrt(2./in_dim), name='W_out')
        self.b_out = sharedX(np.zeros(out_dim), name='b_out')
        self.params = [self.W_out, self.b_out]

    def f_prop(self, x):
        z = T.dot(x, self.W_out) + self.b_out
        return z

### 4. 活性化層

In [9]:
class Activation:
    def __init__(self, function):
        self.function = function
        self.params = []

    def f_prop(self, x):
        self.z = self.function(x)
        return self.z

### 5. 更新則

In [10]:
def sgd(cost, params, eps=np.float32(0.001)):
    gparams = T.grad(cost, params)
    updates = OrderedDict()
    for param, gparam in zip(params, gparams):
        #- Advanced Gradient Clipを実装する（必須ではない）
        gnorm = gparam.norm(L=2)
        gparam = 5. / gnorm * gparam
        updates[param] = param - eps*gparam
    return updates

def Adam(params, g_params, lr=0.001, b1=0.1, b2=0.001, e=1e-4):
    updates = []
    i = theano.shared(np.float32(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, g_params):
        #- Advanced Gradient Clipを実装する（必須ではない）
        gnorm = g.norm(L=2)
        g = 5. / gnorm * g
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates

### 6. ネットワークの定義

In [16]:
vocab_size = len(word2index)
hid_dim    = 500
out_dim    = len(tag2index)

x = T.ivector('x')
t = T.imatrix('t')

layers = [
    Projection(vocab_size, 500, scale=0.1),
    RNN(500, hid_dim),
    Activation(T.nnet.sigmoid),
    Linear(hid_dim, out_dim),
    Activation(T.nnet.softmax)
    ]

### 7. train関数とvalid関数とtest関数

In [17]:
params = []
layer_out = x
for i, layer in enumerate(layers):
    params += layer.params
    if i == 0:
        layer_out = layer.f_prop(x)
    else:
        layer_out = layer.f_prop(layer_out)

y = layers[-1].z
cost = T.mean(T.nnet.categorical_crossentropy(y, t))
gparams = T.grad(cost, params)

## Define update graph
updates = Adam(params, gparams, lr=9e-5, e=1e-2) 

## Compile Function
train = theano.function(inputs=[x,t], outputs=cost, updates=updates)
valid = theano.function(inputs=[x,t], outputs=[cost, T.argmax(y, axis=1)])
test  = theano.function(inputs=[x], outputs=T.argmax(y, axis=1))

### 8. 学習

In [None]:
epochs = 5
import time
for epoch in xrange(epochs):
    init_time = time.clock()
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    start = time.clock()
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        cost = train(instance_x, instance_y)
        if i % 1000 == 0:
            print "EPOCH:: %i, Iteration %i, cost: %.3f" % (epoch + 1, i, cost)
            print "used time: %.3f" % (time.clock() - start)
            start = time.clock()
    
    true_y, pred_y, valid_cost = [], [], []
    for instance_x, instance_y in zip(valid_X, valid_y):
        cost, pred = valid(instance_x, instance_y)
        true_y += list(np.argmax(instance_y, axis=1))
        pred_y += list(pred)
        valid_cost += cost
    print 'EPOCH:: %i, Validation cost: %.3f, Validation F1: %.3f' % (epoch + 1, sum(valid_cost), f1_score(true_y, pred_y, average='macro'))
    print 'Time for training one epoch: %.3f' % (time.clock() - init_time)

EPOCH:: 1, Iteration 0, cost: 4.155
used time: 0.070


### 9. テスト

In [27]:
true_y, pred_y = [], []
for instance_x, instance_y in zip(test_X, test_y):
    pred_y += list(test(instance_x))
    true_y += instance_y

f1_score(true_y, pred_y, average='macro')

0.83440532691741265