# 第10回講義 宿題

## 課題. RNNを用いてPOS taggingを実装せよ

### 注意

- homework関数を完成させて提出してください
    - 訓練データはtrain_X, train_y, テストデータはtest_Xで与えられます
    - train_Xとtrain_yをtrain_X, train_yとvalid_X, valid_yに分けるなどしてモデルを学習させてください
    - test_Xに対して予想ラベルpred_yを作り, homework関数の戻り値としてください
    - pred_yは1次元のlistとしてください
- pred_yのtest_yに対する精度(F値)で評価します
- 全体の実行時間がiLect上で60分を超えないようにしてください
- homework関数の外には何も書かないでください

次のような内容のコードが事前に実行されます

```python
from __future__ import division
from collections import OrderedDict
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.cross_validation import train_test_split
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(42)
trng = RandomStreams(42)

def load_data(file_path):
    dataset = []
    vocab, tag = set(), set()
    for line in open(file_path):
        instance = [l.strip().split() for l in line.split('|||')]
        vocab.update(instance[0])
        tag.update(instance[1])
        dataset.append(instance)
    return dataset, vocab, tag

def encode_dataset(dataset, word2index, tag2index):
    X, y = [], []
    vocab = set(word2index.keys())
    for sentence, tags in dataset:
        X.append([word2index[word] if word in vocab else word2index['<unk>'] for word in sentence])
        y.append([tag2index[tag] for tag in tags])
    return X, y

train_data, train_vocab, train_tags = load_data('train.unk')
special_words = set(['<unk>'])

global word2index
global tag2index

word2index = dict(map(lambda x: (x[1], x[0]), enumerate(train_vocab | special_words)))
tag2index  = dict(map(lambda x: (x[1], x[0]), enumerate(train_tags)))

train_X, train_y = encode_dataset(train_data, word2index, tag2index)
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2, random_state=??) # random_stateはひみつです

```

次のセルのhomework関数を完成させて提出してください
- **上記のコード以外で必要なもの**は全て書いてください

In [2]:
from __future__ import division
from collections import OrderedDict
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.cross_validation import train_test_split
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(42)
trng = RandomStreams(42)

def load_data(file_path):
    dataset = []
    vocab, tag = set(), set()
    for line in open(file_path):
        instance = [l.strip().split() for l in line.split('|||')]
        vocab.update(instance[0])
        tag.update(instance[1])
        dataset.append(instance)
    return dataset, vocab, tag

def encode_dataset(dataset, word2index, tag2index):
    X, y = [], []
    vocab = set(word2index.keys())
    for sentence, tags in dataset:
        X.append([word2index[word] if word in vocab else word2index['<unk>'] for word in sentence])
        y.append([tag2index[tag] for tag in tags])
    return X, y

def load_dataset():
    train_data, train_vocab, train_tags = load_data('train.unk')
    special_words = set(['<unk>'])
    
    global word2index
    global tag2index

    word2index = dict(map(lambda x: (x[1], x[0]), enumerate(train_vocab | special_words)))
    tag2index  = dict(map(lambda x: (x[1], x[0]), enumerate(train_tags)))

    train_X, train_y = encode_dataset(train_data, word2index, tag2index)
    train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

    return (train_X, test_X, train_y, test_y)

def check_homework():
    train_X, test_X, train_y, test_y = load_dataset()
    pred_y = homework(train_X, test_X, train_y)
    true_y = []
    for instance_y in test_y:
        true_y += instance_y
    return f1_score(true_y, pred_y, average='macro')

if 'homework' in globals():
    result = check_homework()
    print result
    print "No Error Occured!"

Using gpu device 0: GRID K520 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 4007)


In [1]:
def homework(train_X, test_X, train_y):
    # transform to one-hot vector
    classes = np.arange(len(tag2index))
    train_y = [label_binarize(instance_y, classes).astype('int32') for instance_y in train_y]
    # create validation set
    train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)
    
    # helper function to create weight/bias
    def sharedX(X, name='', dtype="float32"):
        return theano.shared(np.array(X, dtype=dtype), name=name)

    
    class Projection:
        """Word Embedding Layer"""
        def __init__(self, in_dim, out_dim, scale):
            limit = np.sqrt(2./in_dim)
            self.V = sharedX(rng.uniform(low=-limit, high=limit, size=(in_dim, out_dim), name='V')
            self.params = [self.V]

        def f_prop(self, x):
            x_emb = self.V[x]
            return x_emb
        
        
    class RNN:
        """Recurrent Layer"""
        def __init__(self, in_dim, hid_dim, scale):
            self.hid_dim = hid_dim

            # modified initialization
            limit_W_in = np.sqrt(2./in_dim)
            self.W_in  = sharedX(rng.uniform(low=-limit_W_in, high=limit_W_in, size=(in_dim, hid_dim)), name='W_in')
            limit_W_rec = np.sqrt(2./hid_dim)
            self.W_rec = sharedX(rng.uniform(low=-limit_W_rec, high=limit_W_rec, size=(hid_dim, hid_dim)), name='W_rec')
            # consider how to initialize b
            self.b_rec = sharedX(rng.randn(hid_dim) * scale, name='b_rec')
            self.h_0   = sharedX(np.zeros(hid_dim), name='h_0')

            self.output_info = [self.h_0]
            self.params = [self.W_in, self.W_rec, self.b_rec]

        def f_prop(self, x):
            def step(x, h_tm1):
                h = T.dot(x, self.W_in) + T.dot(h_tm1, self.W_rec) + self.b_rec
                return h

            # loop in theano
            h, _ = theano.scan(fn=step,
                             sequences=[x],
                             outputs_info=self.output_info)
            return h
        
        
    class Linear:
        """Fully-connected Layer"""
        def __init__(self, in_dim, out_dim, scale):
            limit = np.sqrt(2./in_dim)
            self.W_out = sharedX(rng.uniform(low=-limit, high=limit, size=(in_dim, out_dim), name='W_out')
            self.b_out = sharedX(rng.randn(out_dim), name='b_out')
            self.params = [self.W_out, self.b_out]

        def f_prop(self, x):
            z = T.dot(x, self.W_out) + self.b_out
            return z
        
        
    class Activation:
        """Apply activation function to previous layers' outputs"""
        def __init__(self, function):
            self.function = function
            self.params = []

        def f_prop(self, x):
            self.z = self.function(x)
            return self.z
        
        
    def Adam(params, g_params, lr=0.001, b1=0.1, b2=0.001, e=1e-8):
        """Adam optimizer with gradient clipping"""
        updates = []
        i = theano.shared(np.float32(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, g_params):
            # consider where to clip
            #gnorm = g.norm(L=2)
            #rescale = T.maximum(5, gnorm)
            #g = g * 5 / rescale
            g = T.clip(g, -1, 1)
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
    
    
    # define the structure of networks
    vocab_size = len(word2index)
    hid_dim    = 400
    out_dim    = len(tag2index)

    x = T.ivector('x')
    t = T.imatrix('t')

    layers = [
        Projection(vocab_size, 500, scale=0.01),
        RNN(500, hid_dim, 0.01),
        Activation(T.tanh),
        Linear(hid_dim, out_dim, 0.01),
        Activation(T.nnet.softmax)
        ]
    
    
    # forward pass
    params = []
    layer_out = x
    for i, layer in enumerate(layers):
        params += layer.params
        if i == 0:
            layer_out = layer.f_prop(x)
        else:
            layer_out = layer.f_prop(layer_out)

    y = layers[-1].z
    cost = T.mean(T.nnet.categorical_crossentropy(y, t))
    # backward pass
    gparams = T.grad(cost, params)

    # Define update graph
    updates = Adam(params, gparams, lr=1e-3, e=1e-3) 

    # Compile Function
    train = theano.function(inputs=[x,t], outputs=cost, updates=updates)
    valid = theano.function(inputs=[x,t], outputs=[cost, T.argmax(y, axis=1)])
    test  = theano.function(inputs=[x], outputs=T.argmax(y, axis=1))
    
    
    # training
    max_epoch = 9
    import time
    for epoch in xrange(max_epoch):
        train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
        start = time.clock()
        for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
            cost = train(instance_x, instance_y)
            if i % 1000 == 0:
                print "EPOCH:: %i, Iteration %i, cost: %.3f" % (epoch + 1, i, cost)
                print "used time: %.3f" % (time.clock() - start)
                start = time.clock()

        true_y, pred_y, valid_cost = [], [], 0
        for instance_x, instance_y in zip(valid_X, valid_y):
            cost, pred = valid(instance_x, instance_y)
            true_y += list(np.argmax(instance_y, axis=1))
            pred_y += list(pred)
            valid_cost += cost
        print 'EPOCH:: %i, Validation cost: %.3f, Validation F1: %.3f' % (epoch + 1, valid_cost/len(valid_X), 
                                                                          f1_score(true_y, pred_y, average='macro'))
    
    # testing
    pred_y = []
    for instance_x in test_X:
        pred_y += list(test(instance_x))

        
    return pred_y

SyntaxError: invalid syntax (<ipython-input-1-853ca6258f22>, line 18)

In [None]:
from __future__ import division
from collections import OrderedDict
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.cross_validation import train_test_split
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(42)
trng = RandomStreams(42)

def load_data(file_path):
    dataset = []
    vocab, tag = set(), set()
    for line in open(file_path):
        instance = [l.strip().split() for l in line.split('|||')]
        vocab.update(instance[0])
        tag.update(instance[1])
        dataset.append(instance)
    return dataset, vocab, tag

def encode_dataset(dataset, word2index, tag2index):
    X, y = [], []
    vocab = set(word2index.keys())
    for sentence, tags in dataset:
        X.append([word2index[word] if word in vocab else word2index['<unk>'] for word in sentence])
        y.append([tag2index[tag] for tag in tags])
    return X, y

def load_dataset():
    train_data, train_vocab, train_tags = load_data('train.unk')
    special_words = set(['<unk>'])
    
    global word2index
    global tag2index

    word2index = dict(map(lambda x: (x[1], x[0]), enumerate(train_vocab | special_words)))
    tag2index  = dict(map(lambda x: (x[1], x[0]), enumerate(train_tags)))

    train_X, train_y = encode_dataset(train_data, word2index, tag2index)
    train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

    return (train_X, test_X, train_y, test_y)

def check_homework():
    train_X, test_X, train_y, test_y = load_dataset()
    pred_y = homework(train_X, test_X, train_y)
    true_y = []
    for instance_y in test_y:
        true_y += instance_y
    return f1_score(true_y, pred_y, average='macro')

if 'homework' in globals():
    result = check_homework()

    print "No Error Occured!"