### Scan

Theano ではループを書くのに for 文ではなく、Scan という仕組みを使います。  
少しややこしいので、まずは簡単な例を見てみましょう。

In [40]:
##Suppose you have a sequence [1, 2, 3, 4, 5] let's define identity function with scan
x = T.fvector("x")

def step(x):
    return x*x

h, _ = theano.scan(
                       fn=step,
                       sequences=x, 
                       outputs_info=None
                    )
### fn は毎回使う関数
### sequences　は毎回使うxのこと
### output_info は初期値


f = theano.function([x], h)

print f(numpy.array([1, 2, 3, 4, 5]).astype("float32"))

[  1.   4.   9.  16.  25.]


In [41]:
##Next we define accumulation function
x = T.fvector("x")

def step(x, h_tm1):
    return x + h_tm1

h, _ = theano.scan(
                       fn=step,
                       sequences=x, 
                       outputs_info=2.0, #Initial value for h
                       #go_backwards=True #you might use it for bi-directional RNNs
                    )

f = theano.function([x], h)

print f(numpy.array([1, 2, 3, 4, 5]).astype("float32"))

[  3.   5.   8.  12.  17.]


In [13]:
## Let's do the same thing with matrix, accumulation over column
x = T.fmatrix("x")

def step(x, h_tm1):
    return x + h_tm1

h, _ = theano.scan(
                       fn=step,
                       sequences=x, 
                       outputs_info=numpy.array([0., 0., 0., 0., 0.]) #Initial value for h, it's better to use T.alloc().
                    )

f = theano.function([x], h)

print f(numpy.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]).astype("float32"))

[[  1.   2.   3.   4.   5.]
 [  2.   4.   6.   8.  10.]
 [  3.   6.   9.  12.  15.]]


In [14]:
## Advanced :: take previous inputs
x = T.fmatrix("x")

def step(x, h_tm1, h_tm2):
    return x + h_tm1 + h_tm2

h, _ = theano.scan(
                       fn=step,
                       sequences=[ dict(input= x, taps = [0, -1, -2])],
                       outputs_info=None #Initial value for h
                    )

f = theano.function([x], h)

print f(numpy.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5],[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]).astype("float32"))

[[  3.   6.   9.  12.  15.]
 [  3.   6.   9.  12.  15.]
 [  3.   6.   9.  12.  15.]]


### [宿題] POS Tagging

文が与えられた時、その品詞を予測する RNN を学習します。

word2index は単語をIDに変換する辞書、tag2index は品詞をIDに変換する辞書です。  
train_data, dev_data には文と品詞タグのペアが入っています。  
文の長さと品詞タグの長さは必ず同じです。

encode_dataset を使うと単語と品詞をIDに変換することができます。

In [1]:
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from sklearn.utils import shuffle
from sklearn.metrics import f1_score

def load_data(file_path):
    """Read a corpus file with one `words ||| tags` instance per line.

    Each line is split on '|||' and every field is whitespace-tokenised.

    Args:
        file_path: path of the corpus file.

    Returns:
        A tuple `(dataset, vocab, tag)`: `dataset` is a list of instances
        (each a list of token lists, words first and tags second), `vocab`
        is the set of all words and `tag` the set of all tags seen.

    Raises:
        IndexError: if a non-blank line contains no '|||' separator.
    """
    dataset = []
    vocab, tag = set(), set()
    # The context manager closes the file; the original left the handle open.
    with open(file_path) as corpus:
        for line in corpus:
            # Blank lines (e.g. a trailing newline at EOF) carry no instance.
            if not line.strip():
                continue
            instance = [field.strip().split() for field in line.split('|||')]
            vocab.update(instance[0])
            tag.update(instance[1])
            dataset.append(instance)
    return dataset, vocab, tag

def encode_dataset(dataset, word2index, tag2index):
    """Convert (sentence, tags) pairs into lists of integer ids.

    Words missing from `word2index` are mapped to the id of '<unk>'.

    Args:
        dataset: iterable of `(sentence, tags)` pairs of token lists.
        word2index: dict mapping word -> id.
        tag2index: dict mapping tag -> id.

    Returns:
        A tuple `(X, y)`: `X[i]` holds the word ids of sentence i and
        `y[i]` the tag ids of sentence i.

    Raises:
        KeyError: if a tag is missing from `tag2index`, or if an unknown
            word is met and `word2index` has no '<unk>' entry.
    """
    X, y = [], []
    for sentence, tags in dataset:
        # Test membership on the dict itself; the original first copied every
        # key into a new set on each call, which is linear in the vocabulary.
        X.append([word2index[word] if word in word2index else word2index['<unk>']
                  for word in sentence])
        y.append([tag2index[tag] for tag in tags])
    return X, y

# Load the training corpus together with its word and tag inventories.
train_data, train_vocab, train_tags = load_data('train.unk')
special_words = set(['<unk>'])

# Give every word (plus the special tokens) and every tag an integer id,
# numbered in the iteration order of the respective set.
word2index = {word: idx for idx, word in enumerate(train_vocab | special_words)}
tag2index  = {tag: idx for idx, tag in enumerate(train_tags)}

Using gpu device 0: GRID K520


In [2]:
# Size of train_data before it is re-sliced below.
train_size = len(train_data)
###print train_data[0]
###print word2index
###print tag2index

# Keep the first 2000 instances for training and the next 100 as the dev set.
train_data, dev_data = train_data[:2000], train_data[2000:2100]

In [4]:
print train_data[0][0]
print train_data[0][1]

print encode_dataset(train_data, word2index, tag2index)[0][0]
print encode_dataset(train_data, word2index, tag2index)[1][0]
print len(encode_dataset(train_data, word2index, tag2index)[0])
print len(encode_dataset(train_data, word2index, tag2index)[1])
print len(train_data)

['In', 'an', 'Oct.', '19', 'review', 'of', '``', 'The', 'Misanthrope', "''", 'at', 'Chicago', "'s", 'Goodman', 'Theatre', '``', 'Revitalized', 'Classics', 'Take', 'the', 'Stage', 'in', 'Windy', 'City', ',', "''", 'Leisure', '&', 'Arts', ',', 'the', 'role', 'of', 'Celimene', ',', 'played', 'by', 'Kim', 'Cattrall', ',', 'was', 'mistakenly', 'attributed', 'to', 'Christina', 'Haag', '.']
['IN', 'DT', 'NNP', 'CD', 'NN', 'IN', '``', 'DT', 'NN', "''", 'IN', 'NNP', 'POS', 'NNP', 'NNP', '``', 'VBN', 'NNS', 'VBP', 'DT', 'NN', 'IN', 'NNP', 'NNP', ',', "''", 'NN', 'CC', 'NNS', ',', 'DT', 'NN', 'IN', 'NNP', ',', 'VBN', 'IN', 'NNP', 'NNP', ',', 'VBD', 'RB', 'VBN', 'TO', 'NNP', 'NNP', '.']
[36601, 10330, 1934, 24227, 36489, 17415, 9885, 13071, 18383, 19160, 10335, 257, 19170, 25434, 16956, 9885, 23272, 30009, 3488, 12011, 26700, 14331, 23198, 37214, 33153, 19160, 37187, 18143, 37598, 33153, 12011, 584, 17415, 24050, 33153, 28607, 10880, 24817, 7739, 33153, 21511, 34465, 16049, 6994, 28274, 34462, 385

In [5]:
for word, tag in zip(train_data[0][0], train_data[0][1]):
    print word, tag
    
test_X  , test_y   = encode_dataset(dev_data,   word2index, tag2index)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In IN
an DT
Oct. NNP
19 CD
review NN
of IN
`` ``
The DT
Misanthrope NN
'' ''
at IN
Chicago NNP
's POS
Goodman NNP
Theatre NNP
`` ``
Revitalized VBN
Classics NNS
Take VBP
the DT
Stage NN
in IN
Windy NNP
City NNP
, ,
'' ''
Leisure NN
& CC
Arts NNS
, ,
the DT
role NN
of IN
Celimene NNP
, ,
played VBN
by IN
Kim NNP
Cattrall NNP
, ,
was VBD
mistakenly RB
attributed VBN
to TO
Christina NNP
Haag NNP
. .


次のセルを完成させて提出してください　　

今回の入力は単語のID列（ベクトル x）と品詞のID列 (ベクトル y)です。  
Projection レイヤーを使って、単語をベクトルに変換します。  
その後、RNN に入力し、その出力値をSoftmax関数を使って確率分布に変換します。  
予測は画像の時とおなじく、最大の確率を持つクラスを予測とします。

In [5]:
# Number of instances currently held in train_data.
train_size = len(train_data)
# Encode the training words and tags as integer ids.
train_X, train_y = encode_dataset(train_data, word2index, tag2index)
###test_X  , test_y   = encode_dataset(dev_data,   word2index, tag2index)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Fixed seeds: rng draws the initial layer weights below.
# trng is not referenced by the code visible in this cell.
rng = numpy.random.RandomState(42)
trng = RandomStreams(42)


def sharedX(X, dtype="float32"):
    """Cast `X` to a numpy array of `dtype` and wrap it in a Theano shared variable."""
    values = numpy.asarray(X, dtype=dtype)
    return theano.shared(values)


class Activation:
    """Parameter-free layer that applies a fixed function to its input."""

    def __init__(self, func):
        # No trainable weights: the empty list lets this layer be collected
        # like any other layer that exposes `params`.
        self.params = []
        self.func = func

    def fprop(self, x):
        """Return `func` applied to `x`."""
        activation = self.func
        return activation(x)


class Projection:
    """Lookup table that maps integer ids to rows of a weight matrix."""

    def __init__(self, in_dim, out_dim, scale=0.5):
        # (in_dim, out_dim) matrix drawn from a scaled standard normal.
        initial_weights = rng.randn(in_dim, out_dim) * scale
        self.W = sharedX(initial_weights)
        self.params = [self.W]

    def fprop(self, x):
        """Return the rows of `W` selected by the index (or indices) `x`."""
        return self.W[x]
    
    
class Linear:
    """Affine transform `x . W + b` followed by the function `func`."""

    def __init__(self, in_dim, out_dim, func, scale=0.5):
        # Draw order (W first, then b) fixes which random numbers each gets.
        initial_W = rng.randn(in_dim, out_dim) * scale
        self.W = sharedX(initial_W)
        initial_b = rng.randn(out_dim,) * scale
        self.b = sharedX(initial_b)
        self.h = None
        self.params = [self.W, self.b]
        self.func = func

    def fprop(self, x):
        """Return `func(x . W + b)` and keep the result in `self.h`."""
        pre_activation = T.dot(x, self.W) + self.b
        self.h = self.func(pre_activation)
        return self.h



class RNN:
    """Simple recurrent layer: h_t = func(h_{t-1} . Wx + u_t . Wh + bh).

    Note on naming: `Wx` is the (hid_dim, hid_dim) matrix applied to the
    previous hidden state and `Wh` the (in_dim, hid_dim) matrix applied to
    the current input.
    """

    def __init__(self, in_dim, hid_dim, func, scale=0.05):
        self.scale = scale
        self.hid_dim = hid_dim
        self.func = func

        # Weight shapes. The draw order Wx, Wh, bh, h0 is kept as is because
        # it determines which random numbers each parameter receives.
        self.Wx = sharedX(rng.randn(hid_dim, hid_dim) * scale)
        self.Wh = sharedX(rng.randn(in_dim, hid_dim) * scale)
        self.bh = sharedX(rng.randn(hid_dim) * scale)
        ###self.Wy = sharedX(rng.randn(hid_dim,out_dim ) * scale)
        ###self.by = sharedX(rng.randn(out_dim ) * scale)

        # Initial hidden state: a small random vector. It is not listed in
        # `params`, so no update is built for it.
        self.h0 = sharedX(rng.randn(hid_dim) * scale)
        self.output_info = [self.h0]
        self.params = [self.Wh, self.bh, self.Wx]

    def fprop(self, x):
        """Scan over `x` and return the stacked hidden states."""
        def step(u_t, h_tm1):
            # u_t: current input, h_tm1: previous hidden state.
            recurrent_part = T.dot(h_tm1, self.Wx)
            input_part = T.dot(u_t, self.Wh)
            ###self.output_info.append(h)
            return self.func(recurrent_part + input_part + self.bh)

        # Iterate over x, feeding each hidden state back in, starting from h0.
        hidden_states, _ = theano.scan(fn=step, sequences=x, outputs_info=self.h0)
        return hidden_states
    

def sgd(cost, params, lr):
    """Build plain SGD updates: each parameter moves by `-lr * gradient`.

    Returns an OrderedDict mapping every parameter to its update expression.
    """
    gradients = T.grad(cost, params)
    # Advanced (optional exercise): gradient clipping would go here.
    # It is not implemented.
    return OrderedDict(
        (param, param - lr * gradient)
        for param, gradient in zip(params, gradients)
    )

def prop(layers, x):
    """Feed `x` through `layers` in order and return the final output.

    Each layer must expose `fprop(input)`. With an empty `layers` the input
    is returned unchanged; the original raised UnboundLocalError in that
    case because `layer_out` was never assigned.
    """
    layer_out = x
    for layer in layers:
        layer_out = layer.fprop(layer_out)
    return layer_out


def get_params(layers):
    """Return one flat list with the `params` of every layer, in layer order."""
    return [param for layer in layers for param in layer.params]


### Build the model and compile the Theano functions
vocab_size = len(word2index)
print vocab_size
hid_dim    = 150
out_dim    = len(tag2index)
in_dim = 400
# Symbolic inputs: x holds the word ids of one sentence, t the target tag ids.
x, t = T.lvector("x"), T.lvector("t")        
# Embedding lookup -> tanh RNN -> softmax output layer.
layers =[Projection(vocab_size,in_dim),RNN(in_dim,hid_dim,T.tanh),Linear(hid_dim,out_dim,T.nnet.softmax)]


# Reversed copy of every training sequence (words and tags alike).
train_X_pesdo =[i[::-1] for i in train_X]
train_y_pesdo =[i[::-1] for i in train_y]

                                     
###layer =[Projection(in_dim,vocab_size),]
# prob is indexed below as [position, tag].
prob = prop(layers, x) 

cost = - T.mean((T.log(prob))[T.arange(x.shape[0]), t])# Loss: mean negative log-probability of the target tag at each position
pred =  T.argmax(prob, axis=1)
## The prediction is the tag with the highest probability at each position.

## Collect Parameters
params = get_params(layers) 

## Define update graph
# One update set per learning rate, so training can switch between them.
updates = sgd(cost, params, lr=numpy.float32(0.01)) 
updates2 = sgd(cost, params, lr=numpy.float32(0.001))
updates3 = sgd(cost, params, lr=numpy.float32(0.0001))
## Compile Function
train = theano.function([x,t], cost, updates=updates)
train2 = theano.function([x,t], cost, updates=updates2)
train3 = theano.function([x,t], cost, updates=updates3)
# valid and test apply no updates; they only evaluate the graph.
valid = theano.function([x,t], [cost, pred])
test  = theano.function([x],pred)

# NOTE(review): `epochs` is assigned but the loop below iterates over
# range(1), so exactly one epoch runs and only the first branch is taken.
epochs = 10
## Train

# Append the reversed copies to the training data.
train_X = train_X + train_X_pesdo
train_y = train_y + train_y_pesdo
for epoch in range(1):
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        # Step schedule: train (lr 0.01) for epochs 0-3, train2 (lr 0.001)
        # for epochs 4-7, train3 (lr 0.0001) afterwards.
        # `cost` is rebound here to the value returned by the compiled function.
        if(epoch<=3):
            cost = train(instance_x, instance_y)
        elif(4<=epoch<=7):
            cost = train2(instance_x, instance_y)
        else:
            cost = train3(instance_x, instance_y)
        ###print cost
        if i % 1000 == 0:
            print "EPOCH:: %i, Iteration %i, cost: %.3f"%(epoch+1, i, cost)
    # The triple-quoted string below is disabled evaluation code. It is an
    # expression statement with no effect, evaluated once per epoch.
    '''
    dev_true, pred_y = [], []
    dev_cost = []
    for i, (instance_x, instance_y) in enumerate(zip(test_X, test_y)):
        cost, pred = valid(instance_x, instance_y)
        pred_y += list(pred) # 予測結果はベクトル
        dev_true += instance_y
        dev_cost.append(cost)
    ###print dev_cost
    print classification_report(dev_true,pred_y)            
    dev_true, pred_y = [], []
    dev_cost = []
    pred_y =[]
    for i, instance_x in enumerate(test_X):
        pred = test(instance_x)
        pred_y = pred_y + list(pred) # 予測結果はベクトル

    a=[]
    for i in test_y:z
        a+=i
    print classification_report(a,pred_y)  
    '''


38055
EPOCH:: 1, Iteration 0, cost: 7.530
EPOCH:: 1, Iteration 1000, cost: 1.100
EPOCH:: 1, Iteration 2000, cost: 1.228
EPOCH:: 1, Iteration 3000, cost: 1.439


In [6]:
# Number of instances currently held in train_data.
train_size = len(train_data)
# Encode the training words and tags as integer ids.
train_X, train_y = encode_dataset(train_data, word2index, tag2index)
###test_X  , test_y   = encode_dataset(dev_data,   word2index, tag2index)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Fixed seeds: rng draws the initial layer weights below.
# trng is not referenced by the code visible in this cell.
rng = numpy.random.RandomState(42)
trng = RandomStreams(42)


def sharedX(X, dtype="float32"):
    """Cast `X` to a numpy array of `dtype` and wrap it in a Theano shared variable."""
    values = numpy.asarray(X, dtype=dtype)
    return theano.shared(values)


class Activation:
    """Parameter-free layer that applies a fixed function to its input."""

    def __init__(self, func):
        # No trainable weights: the empty list lets this layer be collected
        # like any other layer that exposes `params`.
        self.params = []
        self.func = func

    def fprop(self, x):
        """Return `func` applied to `x`."""
        activation = self.func
        return activation(x)


class Projection:
    """Lookup table that maps integer ids to rows of a weight matrix."""

    def __init__(self, in_dim, out_dim, scale=0.5):
        # (in_dim, out_dim) matrix drawn from a scaled standard normal.
        initial_weights = rng.randn(in_dim, out_dim) * scale
        self.W = sharedX(initial_weights)
        self.params = [self.W]

    def fprop(self, x):
        """Return the rows of `W` selected by the index (or indices) `x`."""
        return self.W[x]
    
    
class Linear:
    """Affine transform `x . W + b` followed by the function `func`."""

    def __init__(self, in_dim, out_dim, func, scale=0.5):
        # Draw order (W first, then b) fixes which random numbers each gets.
        initial_W = rng.randn(in_dim, out_dim) * scale
        self.W = sharedX(initial_W)
        initial_b = rng.randn(out_dim,) * scale
        self.b = sharedX(initial_b)
        self.h = None
        self.params = [self.W, self.b]
        self.func = func

    def fprop(self, x):
        """Return `func(x . W + b)` and keep the result in `self.h`."""
        pre_activation = T.dot(x, self.W) + self.b
        self.h = self.func(pre_activation)
        return self.h



class RNN:
    """Simple recurrent layer: h_t = func(h_{t-1} . Wx + u_t . Wh + bh).

    Note on naming: `Wx` is the (hid_dim, hid_dim) matrix applied to the
    previous hidden state and `Wh` the (in_dim, hid_dim) matrix applied to
    the current input.
    """

    def __init__(self, in_dim, hid_dim, func, scale=0.05):
        self.scale = scale
        self.hid_dim = hid_dim
        self.func = func

        # Weight shapes. The draw order Wx, Wh, bh, h0 is kept as is because
        # it determines which random numbers each parameter receives.
        self.Wx = sharedX(rng.randn(hid_dim, hid_dim) * scale)
        self.Wh = sharedX(rng.randn(in_dim, hid_dim) * scale)
        self.bh = sharedX(rng.randn(hid_dim) * scale)
        ###self.Wy = sharedX(rng.randn(hid_dim,out_dim ) * scale)
        ###self.by = sharedX(rng.randn(out_dim ) * scale)

        # Initial hidden state: a small random vector. It is not listed in
        # `params`, so no update is built for it.
        self.h0 = sharedX(rng.randn(hid_dim) * scale)
        self.output_info = [self.h0]
        self.params = [self.Wh, self.bh, self.Wx]

    def fprop(self, x):
        """Scan over `x` and return the stacked hidden states."""
        def step(u_t, h_tm1):
            # u_t: current input, h_tm1: previous hidden state.
            recurrent_part = T.dot(h_tm1, self.Wx)
            input_part = T.dot(u_t, self.Wh)
            ###self.output_info.append(h)
            return self.func(recurrent_part + input_part + self.bh)

        # Iterate over x, feeding each hidden state back in, starting from h0.
        hidden_states, _ = theano.scan(fn=step, sequences=x, outputs_info=self.h0)
        return hidden_states
    

def sgd(cost, params, lr):
    """Build plain SGD updates: each parameter moves by `-lr * gradient`.

    Returns an OrderedDict mapping every parameter to its update expression.
    """
    gradients = T.grad(cost, params)
    # Advanced (optional exercise): gradient clipping would go here.
    # It is not implemented.
    return OrderedDict(
        (param, param - lr * gradient)
        for param, gradient in zip(params, gradients)
    )

def prop(layers, x):
    """Feed `x` through `layers` in order and return the final output.

    Each layer must expose `fprop(input)`. With an empty `layers` the input
    is returned unchanged; the original raised UnboundLocalError in that
    case because `layer_out` was never assigned.
    """
    layer_out = x
    for layer in layers:
        layer_out = layer.fprop(layer_out)
    return layer_out


def get_params(layers):
    """Return one flat list with the `params` of every layer, in layer order."""
    return [param for layer in layers for param in layer.params]


### Build the model and compile the Theano functions
vocab_size = len(word2index)
print vocab_size
hid_dim    = 150
out_dim    = len(tag2index)
in_dim = 400
# Symbolic inputs: x holds the word ids of one sentence, t the target tag ids.
x, t = T.lvector("x"), T.lvector("t")        
# Embedding lookup -> tanh RNN -> softmax output layer.
layers =[Projection(vocab_size,in_dim),RNN(in_dim,hid_dim,T.tanh),Linear(hid_dim,out_dim,T.nnet.softmax)]


# Reversed copy of every training sequence (words and tags alike).
train_X_pesdo =[i[::-1] for i in train_X]
train_y_pesdo =[i[::-1] for i in train_y]

                                     
###layer =[Projection(in_dim,vocab_size),]
# prob is indexed below as [position, tag].
prob = prop(layers, x) 

cost = - T.mean((T.log(prob))[T.arange(x.shape[0]), t])# Loss: mean negative log-probability of the target tag at each position
pred =  T.argmax(prob, axis=1)
## The prediction is the tag with the highest probability at each position.

## Collect Parameters
params = get_params(layers) 

## Define update graph
# One update set per learning rate, so training can switch between them.
updates = sgd(cost, params, lr=numpy.float32(0.01)) 
updates2 = sgd(cost, params, lr=numpy.float32(0.001))
updates3 = sgd(cost, params, lr=numpy.float32(0.0001))
## Compile Function
train = theano.function([x,t], cost, updates=updates)
train2 = theano.function([x,t], cost, updates=updates2)
train3 = theano.function([x,t], cost, updates=updates3)
# valid and test apply no updates; they only evaluate the graph.
valid = theano.function([x,t], [cost, pred])
test  = theano.function([x],pred)

# NOTE(review): with epochs = 3 the loop variable only takes the values 0-2,
# so the `epoch<=2` branch always runs and train2/train3 are never called.
epochs = 3
## Train

# Append the reversed copies to the training data.
train_X = train_X + train_X_pesdo
train_y = train_y + train_y_pesdo
for epoch in range(epochs):
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        # Step schedule: train (lr 0.01) for epochs 0-2, train2 (lr 0.001)
        # for epochs 3-4, train3 (lr 0.0001) afterwards.
        # `cost` is rebound here to the value returned by the compiled function.
        if(epoch<=2):
            cost = train(instance_x, instance_y)
        elif(3<=epoch<=4):
            cost = train2(instance_x, instance_y)
        else:
            cost = train3(instance_x, instance_y)
        ###print cost
        if i % 1000 == 0:
            print "EPOCH:: %i, Iteration %i, cost: %.3f"%(epoch+1, i, cost)
    # The triple-quoted string below is disabled evaluation code. It is an
    # expression statement with no effect, evaluated once per epoch.
    '''
    dev_true, pred_y = [], []
    dev_cost = []
    for i, (instance_x, instance_y) in enumerate(zip(test_X, test_y)):
        cost, pred = valid(instance_x, instance_y)
        pred_y += list(pred) # 予測結果はベクトル
        dev_true += instance_y
        dev_cost.append(cost)
    ###print dev_cost
    print classification_report(dev_true,pred_y)            
    dev_true, pred_y = [], []
    dev_cost = []
    pred_y =[]
    for i, instance_x in enumerate(test_X):
        pred = test(instance_x)
        pred_y = pred_y + list(pred) # 予測結果はベクトル
    print classification_report(dev_true,pred_y)
    '''

38055
EPOCH:: 1, Iteration 0, cost: 8.075
EPOCH:: 1, Iteration 1000, cost: 1.384
EPOCH:: 1, Iteration 2000, cost: 1.106
EPOCH:: 1, Iteration 3000, cost: 0.846
EPOCH:: 2, Iteration 0, cost: 0.474
EPOCH:: 2, Iteration 1000, cost: 1.464
EPOCH:: 2, Iteration 2000, cost: 0.302
EPOCH:: 2, Iteration 3000, cost: 0.088
EPOCH:: 3, Iteration 0, cost: 0.035
EPOCH:: 3, Iteration 1000, cost: 0.437
EPOCH:: 3, Iteration 2000, cost: 1.019
EPOCH:: 3, Iteration 3000, cost: 0.227


In [7]:
#test_data,  test_vocab, test_tags = load_data('test.unk')
###test_X , test_y  = encode_dataset(test_data,  word2index, tag2index)

test_true, test_pred = [], []
for i, (instance_x, instance_y) in enumerate(zip(test_X, test_y)):
    test_pred += list(test(instance_x))
    test_true += instance_y
        
print f1_score(test_true, test_pred, average='macro')

0.768532088877


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
a=[]
for i in test_y:
    a+=i
print classification_report(a,pred_y)

             precision    recall  f1-score   support

          0       0.96      1.00      0.98        25
          1       0.48      0.38      0.42        37
          2       0.87      0.75      0.80       102
          3       0.75      0.89      0.81        63
          4       1.00      1.00      1.00       118
          5       1.00      1.00      1.00        17
          6       0.93      0.50      0.65        26
          7       0.62      0.62      0.62        45
          8       0.66      0.61      0.63       195
          9       1.00      0.67      0.80         6
         10       0.79      0.70      0.74        43
         11       0.98      0.99      0.99       264
         13       0.80      1.00      0.89         4
         14       1.00      1.00      1.00        14
         15       0.70      0.84      0.76       377
         17       1.00      1.00      1.00        23
         18       1.00      1.00      1.00        99
         19       1.00      1.00      1.00   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [5]:
# Number of instances currently held in train_data.
train_size = len(train_data)
# Encode the training words and tags as integer ids.
train_X, train_y = encode_dataset(train_data, word2index, tag2index)
###test_X  , test_y   = encode_dataset(dev_data,   word2index, tag2index)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Fixed seeds: rng draws the initial layer weights below.
# trng is not referenced by the code visible in this cell.
rng = numpy.random.RandomState(42)
trng = RandomStreams(42)


def sharedX(X, dtype="float32"):
    """Cast `X` to a numpy array of `dtype` and wrap it in a Theano shared variable."""
    values = numpy.asarray(X, dtype=dtype)
    return theano.shared(values)


class Activation:
    """Parameter-free layer that applies a fixed function to its input."""

    def __init__(self, func):
        # No trainable weights: the empty list lets this layer be collected
        # like any other layer that exposes `params`.
        self.params = []
        self.func = func

    def fprop(self, x):
        """Return `func` applied to `x`."""
        activation = self.func
        return activation(x)


class Projection:
    """Lookup table that maps integer ids to rows of a weight matrix."""

    def __init__(self, in_dim, out_dim, scale=0.5):
        # (in_dim, out_dim) matrix drawn from a scaled standard normal.
        initial_weights = rng.randn(in_dim, out_dim) * scale
        self.W = sharedX(initial_weights)
        self.params = [self.W]

    def fprop(self, x):
        """Return the rows of `W` selected by the index (or indices) `x`."""
        return self.W[x]
    
    
class Linear:
    """Affine transform `x . W + b` followed by the function `func`."""

    def __init__(self, in_dim, out_dim, func, scale=0.5):
        # Draw order (W first, then b) fixes which random numbers each gets.
        initial_W = rng.randn(in_dim, out_dim) * scale
        self.W = sharedX(initial_W)
        initial_b = rng.randn(out_dim,) * scale
        self.b = sharedX(initial_b)
        self.h = None
        self.params = [self.W, self.b]
        self.func = func

    def fprop(self, x):
        """Return `func(x . W + b)` and keep the result in `self.h`."""
        pre_activation = T.dot(x, self.W) + self.b
        self.h = self.func(pre_activation)
        return self.h



class RNN:
    """Simple recurrent layer: h_t = func(h_{t-1} . Wx + u_t . Wh + bh).

    Note on naming: `Wx` is the (hid_dim, hid_dim) matrix applied to the
    previous hidden state and `Wh` the (in_dim, hid_dim) matrix applied to
    the current input.
    """

    def __init__(self, in_dim, hid_dim, func, scale=0.05):
        self.scale = scale
        self.hid_dim = hid_dim
        self.func = func

        # Weight shapes. The draw order Wx, Wh, bh, h0 is kept as is because
        # it determines which random numbers each parameter receives.
        self.Wx = sharedX(rng.randn(hid_dim, hid_dim) * scale)
        self.Wh = sharedX(rng.randn(in_dim, hid_dim) * scale)
        self.bh = sharedX(rng.randn(hid_dim) * scale)
        ###self.Wy = sharedX(rng.randn(hid_dim,out_dim ) * scale)
        ###self.by = sharedX(rng.randn(out_dim ) * scale)

        # Initial hidden state: a small random vector. It is not listed in
        # `params`, so no update is built for it.
        self.h0 = sharedX(rng.randn(hid_dim) * scale)
        self.output_info = [self.h0]
        self.params = [self.Wh, self.bh, self.Wx]

    def fprop(self, x):
        """Scan over `x` and return the stacked hidden states."""
        def step(u_t, h_tm1):
            # u_t: current input, h_tm1: previous hidden state.
            recurrent_part = T.dot(h_tm1, self.Wx)
            input_part = T.dot(u_t, self.Wh)
            ###self.output_info.append(h)
            return self.func(recurrent_part + input_part + self.bh)

        # Iterate over x, feeding each hidden state back in, starting from h0.
        hidden_states, _ = theano.scan(fn=step, sequences=x, outputs_info=self.h0)
        return hidden_states
    

def sgd(cost, params, lr):
    """Build plain SGD updates: each parameter moves by `-lr * gradient`.

    Returns an OrderedDict mapping every parameter to its update expression.
    """
    gradients = T.grad(cost, params)
    # Advanced (optional exercise): gradient clipping would go here.
    # It is not implemented.
    return OrderedDict(
        (param, param - lr * gradient)
        for param, gradient in zip(params, gradients)
    )

def prop(layers, x):
    """Feed `x` through `layers` in order and return the final output.

    Each layer must expose `fprop(input)`. With an empty `layers` the input
    is returned unchanged; the original raised UnboundLocalError in that
    case because `layer_out` was never assigned.
    """
    layer_out = x
    for layer in layers:
        layer_out = layer.fprop(layer_out)
    return layer_out


def get_params(layers):
    """Return one flat list with the `params` of every layer, in layer order."""
    return [param for layer in layers for param in layer.params]


### Build the model and compile the Theano functions
vocab_size = len(word2index)
print vocab_size
hid_dim    = 150
out_dim    = len(tag2index)
in_dim = 400
# Symbolic inputs: x holds the word ids of one sentence, t the target tag ids.
x, t = T.lvector("x"), T.lvector("t")        
# Embedding lookup -> tanh RNN -> softmax output layer.
layers =[Projection(vocab_size,in_dim),RNN(in_dim,hid_dim,T.tanh),Linear(hid_dim,out_dim,T.nnet.softmax)]


# Reversed copy of every training sequence (words and tags alike).
train_X_pesdo =[i[::-1] for i in train_X]
train_y_pesdo =[i[::-1] for i in train_y]

                                     
###layer =[Projection(in_dim,vocab_size),]
# prob is indexed below as [position, tag].
prob = prop(layers, x) 

cost = - T.mean((T.log(prob))[T.arange(x.shape[0]), t])# Loss: mean negative log-probability of the target tag at each position
pred =  T.argmax(prob, axis=1)
## The prediction is the tag with the highest probability at each position.

## Collect Parameters
params = get_params(layers) 

## Define update graph
# One update set per learning rate, so training can switch between them.
updates = sgd(cost, params, lr=numpy.float32(0.01)) 
updates2 = sgd(cost, params, lr=numpy.float32(0.001))
updates3 = sgd(cost, params, lr=numpy.float32(0.0001))
## Compile Function
train = theano.function([x,t], cost, updates=updates)
train2 = theano.function([x,t], cost, updates=updates2)
train3 = theano.function([x,t], cost, updates=updates3)
# valid and test apply no updates; they only evaluate the graph.
valid = theano.function([x,t], [cost, pred])
test  = theano.function([x],pred)

epochs = 10
## Train

# Append the reversed copies to the training data.
train_X = train_X + train_X_pesdo
train_y = train_y + train_y_pesdo
for epoch in range(epochs):
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        # Step schedule: train (lr 0.01) for epochs 0-3, train2 (lr 0.001)
        # for epochs 4-7, train3 (lr 0.0001) for the remaining epochs.
        # `cost` is rebound here to the value returned by the compiled function.
        if(epoch<=3):
            cost = train(instance_x, instance_y)
        elif(4<=epoch<=7):
            cost = train2(instance_x, instance_y)
        else:
            cost = train3(instance_x, instance_y)
        ###print cost
        if i % 1000 == 0:
            print "EPOCH:: %i, Iteration %i, cost: %.3f"%(epoch+1, i, cost)
    # The triple-quoted string below is disabled evaluation code. It is an
    # expression statement with no effect, evaluated once per epoch.
    '''
    dev_true, pred_y = [], []
    dev_cost = []
    for i, (instance_x, instance_y) in enumerate(zip(test_X, test_y)):
        cost, pred = valid(instance_x, instance_y)
        pred_y += list(pred) # 予測結果はベクトル
        dev_true += instance_y
        dev_cost.append(cost)
    ###print dev_cost
    print classification_report(dev_true,pred_y)            
    dev_true, pred_y = [], []
    dev_cost = []
    '''
    # Re-predict every sentence in test_X at the end of each epoch. pred_y is
    # rebuilt from scratch, so after the loop it holds the last epoch's output.
    # `pred` is rebound here from the symbolic expression to a returned value.
    pred_y =[]
    for i, instance_x in enumerate(test_X):
        pred = test(instance_x)
        pred_y = pred_y + list(pred) # each prediction is a vector of tag ids


38055
EPOCH:: 1, Iteration 0, cost: 7.374
EPOCH:: 1, Iteration 1000, cost: 1.366
EPOCH:: 1, Iteration 2000, cost: 0.971
EPOCH:: 1, Iteration 3000, cost: 1.260
EPOCH:: 2, Iteration 0, cost: 0.492
EPOCH:: 2, Iteration 1000, cost: 0.967
EPOCH:: 2, Iteration 2000, cost: 0.871
EPOCH:: 2, Iteration 3000, cost: 1.205
EPOCH:: 3, Iteration 0, cost: 1.014
EPOCH:: 3, Iteration 1000, cost: 0.521
EPOCH:: 3, Iteration 2000, cost: 0.932
EPOCH:: 3, Iteration 3000, cost: 0.470
EPOCH:: 4, Iteration 0, cost: 0.320
EPOCH:: 4, Iteration 1000, cost: 0.490
EPOCH:: 4, Iteration 2000, cost: 0.829
EPOCH:: 4, Iteration 3000, cost: 0.304
EPOCH:: 5, Iteration 0, cost: 0.209
EPOCH:: 5, Iteration 1000, cost: 0.176
EPOCH:: 5, Iteration 2000, cost: 0.172
EPOCH:: 5, Iteration 3000, cost: 0.673
EPOCH:: 6, Iteration 0, cost: 0.635
EPOCH:: 6, Iteration 1000, cost: 0.356
EPOCH:: 6, Iteration 2000, cost: 0.066
EPOCH:: 6, Iteration 3000, cost: 0.542
EPOCH:: 7, Iteration 0, cost: 0.148
EPOCH:: 7, Iteration 1000, cost: 0.294
E