# 第4回講義 宿題

## 課題. MNISTデータセットを多層パーセプトロン(MLP)で学習せよ

### 注意
- homework関数を完成させて提出してください
    - 訓練データはtrain_X, train_y, テストデータはtest_Xで与えられます
    - train_Xとtrain_yをtrain_X, train_yとvalid_X, valid_yに分けるなどしてモデルを学習させてください
    - test_Xに対して予想ラベルpred_yを作り, homework関数の戻り値としてください
- pred_yのtest_yに対する精度(F値)で評価します
- 全体の実行時間がiLect上で60分を超えないようにしてください
- homework関数の外には何も書かないでください

- MLPの実装にTheanoなどのライブラリを使わないでください

### ヒント
- 出力yはone-of-k表現
- 最終層の活性化関数はソフトマックス関数, 誤差関数は多クラス交差エントロピー
- 最終層のデルタは教科書参照

次のコードが**事前**に実行されます


```python
from __future__ import division
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split

import numpy as np

mnist = fetch_mldata('MNIST original')
mnist_X, mnist_y = shuffle(mnist.data, mnist.target.astype('int32'))

mnist_X = mnist_X / 255.0

train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y, test_size=0.2, random_state=??) # random_stateはこちらで与えます
```

次のセルのhomework関数を完成させて提出してください
- パッケージのインポートなど、必要な物はすべて書いてください

In [1]:
def homework(train_X, test_X, train_y, hidden_dims=(400, 400),
             weight_scale=1e-1, reg=1e-3, batch_size=100, lr=1e-1,
             num_epochs=50):
    """
    Train a fully connected ReLU network with minibatch SGD and label test_X.

    Cross validation has been done outside this function to choose the
    default learning rate, initialization scale, number of epochs, etc.

    Args:
        train_X: 2-D array of training inputs, one row per example.
        test_X: 2-D array of inputs to label, same number of columns as
            train_X.
        train_y: 1-D integer array with the class index (0..9) of every row
            of train_X.
        hidden_dims: Sizes of the hidden layers.
        weight_scale: Standard deviation of the Gaussian weight initialization.
        reg: L2 regularization strength applied to the weight matrices.
        batch_size: Number of examples drawn (with replacement) per SGD step.
        lr: SGD learning rate.
        num_epochs: Number of epochs; each epoch performs
            len(train_X) // batch_size SGD steps.

    Returns:
        1-D array holding the predicted class index for every row of test_X.
    """
    # Imported here so the function is self-contained, as the assignment asks.
    import numpy as np

    # basic layers, forward and backward pass
    def affine_forward(x, w, b):
        """Return x.dot(w) + b and the inputs needed by affine_backward."""
        out = np.dot(x, w) + b
        cache = (x, w, b)
        return out, cache

    def affine_backward(dout, cache):
        """Return the gradients (dx, dw, db) of an affine layer."""
        x, w, b = cache
        dx = np.dot(dout, w.T)
        dw = np.dot(x.T, dout)
        db = dout.sum(0)
        return dx, dw, db

    def relu_forward(x):
        """Return max(0, x) elementwise and x itself as the cache."""
        out = np.maximum(0, x)
        cache = x
        return out, cache

    def relu_backward(dout, cache):
        """Return dout with the entries where the ReLU input was negative zeroed."""
        x = cache
        # Work on a copy so the caller's upstream gradient is left untouched.
        dx = dout.copy()
        dx[x < 0] = 0
        return dx

    def softmax_loss(x, y):
        """Return the mean cross-entropy loss and its gradient w.r.t. x.

        x holds one row of class scores per example, y the integer label of
        each row.
        """
        # Subtract the row maximum before exponentiating to avoid overflow.
        probs = np.exp(x - np.max(x, axis=1, keepdims=True))
        probs /= np.sum(probs, axis=1, keepdims=True)
        N = x.shape[0]
        loss = -np.sum(np.log(probs[np.arange(N), y])) / N
        dx = probs.copy()
        dx[np.arange(N), y] -= 1
        dx /= N
        return loss, dx

    # NOTE: the dropout helpers are kept for completeness but are not called
    # by FullyConnectedNet.loss below.
    def dropout_forward(x, dropout_param):
        """Inverted dropout: in 'train' mode drop with probability p and rescale."""
        p, mode = dropout_param['p'], dropout_param['mode']
        mask = None
        out = None
        if mode == 'train':
            mask = np.random.binomial(1, 1-p, size=x.shape)
            out = x * mask / (1-p)
        elif mode == 'test':
            out = x
        cache = (dropout_param, mask)
        out = out.astype(x.dtype, copy=False)
        return out, cache

    def dropout_backward(dout, cache):
        """Backward pass matching dropout_forward."""
        dropout_param, mask = cache
        mode = dropout_param['mode']
        dx = None
        if mode == 'train':
            dx = dout * mask / (1-dropout_param['p'])
        elif mode == 'test':
            dx = dout
        return dx

    # combination of affine and relu
    def affine_relu_forward(x, w, b):
        """Affine layer followed by a ReLU."""
        a, fc_cache = affine_forward(x, w, b)
        out, relu_cache = relu_forward(a)
        cache = (fc_cache, relu_cache)
        return out, cache

    def affine_relu_backward(dout, cache):
        """Backward pass of affine_relu_forward."""
        fc_cache, relu_cache = cache
        da = relu_backward(dout, relu_cache)
        dx, dw, db = affine_backward(da, fc_cache)
        return dx, dw, db

    class FullyConnectedNet(object):
        """Stack of affine+ReLU layers ending in an affine layer with softmax loss.

        Parameters are stored in self.params under the keys 'W<i>' / 'b<i>'.
        The dropout and use_batchnorm options only maintain their mode
        bookkeeping; no dropout or batch-normalization layer is applied in
        the forward pass.
        """

        def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                     dropout=0, use_batchnorm=False, reg=0.0,
                     weight_scale=1e-2, dtype=np.float32, seed=None):
            self.use_batchnorm = use_batchnorm
            self.use_dropout = dropout > 0
            self.reg = reg
            self.num_layers = 1 + len(hidden_dims)
            self.dtype = dtype
            self.params = {}

            # Gaussian weights, zero biases; list() accepts tuples as well.
            dims = [input_dim] + list(hidden_dims) + [num_classes]
            for i in range(self.num_layers):
                weights = weight_scale * np.random.randn(dims[i], dims[i+1])
                # Cast every parameter to the requested datatype.
                self.params['W' + str(i)] = weights.astype(dtype)
                self.params['b' + str(i)] = np.zeros(dims[i+1]).astype(dtype)

            self.dropout_param = {}
            if self.use_dropout:
                self.dropout_param = {'mode': 'train', 'p': dropout}
                if seed is not None:
                    self.dropout_param['seed'] = seed

            self.bn_params = []
            if self.use_batchnorm:
                self.bn_params = [{'mode': 'train'}
                                  for _ in range(self.num_layers - 1)]

        def loss(self, X, y=None):
            """
            Compute scores, or loss and gradients, for the fully-connected net.

            With y=None (test mode) return the class scores for X. Otherwise
            return (loss, grads), where grads maps every key of self.params
            to the gradient of the L2-regularized softmax loss.
            """
            X = X.astype(self.dtype)
            mode = 'test' if y is None else 'train'

            # Keep the train/test mode of the optional layers in sync.
            if self.use_dropout:
                self.dropout_param['mode'] = mode
            if self.use_batchnorm:
                for bn_param in self.bn_params:
                    bn_param['mode'] = mode

            # Forward pass: ReLU after every layer except the last one.
            last = self.num_layers - 1
            caches = []
            out = X
            for i in range(self.num_layers):
                w = self.params['W' + str(i)]
                b = self.params['b' + str(i)]
                if i < last:
                    out, cache = affine_relu_forward(out, w, b)
                else:
                    out, cache = affine_forward(out, w, b)
                caches.append(cache)
            scores = out

            # If test mode return early
            if mode == 'test':
                return scores

            # Backward pass, from the output layer down to the first layer.
            grads = {}
            loss, d_out = softmax_loss(scores, y)
            for i in reversed(range(self.num_layers)):
                W_name = 'W' + str(i)
                b_name = 'b' + str(i)
                w = self.params[W_name]
                loss += 0.5 * self.reg * np.sum(w ** 2)
                if i == last:
                    d_out, dw, db = affine_backward(d_out, caches[i])
                else:
                    d_out, dw, db = affine_relu_backward(d_out, caches[i])
                # Only the weights are regularized, not the biases.
                grads[W_name] = dw + self.reg * w
                grads[b_name] = db

            return loss, grads

    class Solver(object):
        """Plain minibatch SGD trainer for a model exposing params and loss()."""

        def __init__(self, model, train_X, train_y, test_X, batch_size=100,
                     lr=1e-3, num_epochs=50):
            self.model = model
            self.train_X = train_X
            self.train_y = train_y
            self.test_X = test_X
            self.batch_size = batch_size
            self.loss_history = []
            self.lr = lr
            self.num_epochs = num_epochs

        def batch_update(self):
            """Draw one random minibatch and apply a single SGD step."""
            num_train = self.train_X.shape[0]
            batch_mask = np.random.choice(num_train, self.batch_size)
            X_batch = self.train_X[batch_mask]
            y_batch = self.train_y[batch_mask]

            # Compute loss and gradient
            loss, grads = self.model.loss(X_batch, y_batch)
            self.loss_history.append(loss)

            # Perform a parameter update; iterate over a snapshot because the
            # dictionary entries are reassigned inside the loop.
            for name, value in list(self.model.params.items()):
                self.model.params[name] = value - self.lr * grads[name]

        def check_accuracy(self, X, y, num_samples=None, batch_size=100):
            """Return the model's accuracy on (X, y).

            When num_samples is given and smaller than len(X), the accuracy
            is measured on that many randomly drawn examples.
            """
            N = X.shape[0]
            if num_samples is not None and N > num_samples:
                mask = np.random.choice(N, num_samples)
                N = num_samples
                X = X[mask]
                y = y[mask]
            # Compute predictions in batches; the last batch may be shorter.
            y_pred = []
            for start in range(0, N, batch_size):
                scores = self.model.loss(X[start:start + batch_size])
                y_pred.append(np.argmax(scores, axis=1))
            y_pred = np.hstack(y_pred)
            return np.mean(y_pred == y)

        def train(self):
            """Run num_epochs epochs of len(train_X) // batch_size SGD steps."""
            # Floor division keeps the step count an integer even when true
            # division is enabled via `from __future__ import division`.
            steps_per_epoch = self.train_X.shape[0] // self.batch_size
            for _ in range(self.num_epochs):
                for _ in range(steps_per_epoch):
                    self.batch_update()

        def predict(self):
            """Return the arg-max class of the model scores for test_X."""
            scores = self.model.loss(self.test_X)
            return np.argmax(scores, axis=1)

    # The input width is read from the data (784 for flattened MNIST images).
    model = FullyConnectedNet(list(hidden_dims), input_dim=train_X.shape[1],
                              weight_scale=weight_scale, reg=reg)
    solver = Solver(model, train_X, train_y, test_X, batch_size=batch_size,
                    lr=lr, num_epochs=num_epochs)
    solver.train()
    pred_y = solver.predict()

    return pred_y

In [35]:
from __future__ import division
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split

import numpy as np

def load_mnist():
    """Fetch MNIST, shuffle it, divide the pixels by 255.0 and split it.

    Returns:
        Tuple (train_X, test_X, train_y, test_y) produced by
        train_test_split with test_size=0.2 and random_state=42.
    """
    dataset = fetch_mldata('MNIST original')
    labels = dataset.target.astype('int32')
    images, labels = shuffle(dataset.data, labels)

    # Rescale the raw pixel values by 1/255.
    images = images / 255.0

    split = train_test_split(images, labels, test_size=0.2, random_state=42)
    train_X, test_X, train_y, test_y = split
    return (train_X, test_X, train_y, test_y)

def check_homework():
    """Run homework on the MNIST split and return its macro-averaged F1 score."""
    splits = load_mnist()
    train_X, test_X, train_y, test_y = splits
    predictions = homework(train_X, test_X, train_y)
    score = f1_score(test_y, predictions, average='macro')
    return score

# Run the check only when a homework function has been defined in this session.
if 'homework' in globals():
    result = check_homework()

    # A parenthesized single-argument print works on both Python 2 and 3.
    print("No Error Occured!")

epoch:  0
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)




batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
batch x (100, 784)
batch_y (100, 10)
b

KeyboardInterrupt: 

In [4]:
from __future__ import division
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split

import numpy as np

def load_mnist():
    """Fetch MNIST, shuffle it, divide the pixels by 255.0 and split it.

    Returns:
        Tuple (train_X, test_X, train_y, test_y) produced by
        train_test_split with test_size=0.2 and random_state=42.
    """
    dataset = fetch_mldata('MNIST original')
    labels = dataset.target.astype('int32')
    images, labels = shuffle(dataset.data, labels)

    # Rescale the raw pixel values by 1/255.
    images = images / 255.0

    split = train_test_split(images, labels, test_size=0.2, random_state=42)
    train_X, test_X, train_y, test_y = split
    return (train_X, test_X, train_y, test_y)

def check_homework():
    """Run homework on the MNIST split and return its macro-averaged F1 score."""
    splits = load_mnist()
    train_X, test_X, train_y, test_y = splits
    predictions = homework(train_X, test_X, train_y)
    score = f1_score(test_y, predictions, average='macro')
    return score

# Run the check only when a homework function has been defined in this session.
if 'homework' in globals():
    result = check_homework()
    # Parenthesized single-argument prints work on both Python 2 and 3.
    print(result)
    print("No Error Occured!")

0.968010821344
No Error Occured!
