# small parser

上位千個のデータセットでparserを訓練し、基本的なロジックが成立していることを示す。

In [1]:
import datetime
import os
import pickle
import re
import sys
import time

import chainer
import chainer.functions as F
import chainer.links as L
import gensim
import numpy as np
from chainer import Variable
from chainer import cuda
from chainer import optimizers
from chainer import serializers

Using TensorFlow backend.


## モデル定義

In [2]:
class Parser(chainer.Chain):
    """
    Parser network that classifies the next action from three inputs:
    the action history, the buffer front and the composed stack.

    Each input is fed through its own LSTM (LA, LB, LS); the three LSTM
    outputs are concatenated and passed through two linear layers (W, G)
    to produce ``output_dim`` scores.

    The LSTMs are stateful: call ``reset_state()`` between independent
    sequences.
    """

    def __init__(self):

        def initConfig():
            """
            Build the size configuration.

            Fixed layer sizes are hard-coded; vocabulary, action and POS sizes
            are taken from the pickled maps under ``../model/``.
            """
            conf = {"outputDim":3,"midOne":100,"midTwo":50}
            # NOTE(security): pickle.load can execute arbitrary code. Only load
            # files produced by this project's own preprocessing.
            with open("../model/word2id.pkl","rb") as f:
                conf["rawInputDim"] = len(pickle.load(f))
            with open("../model/simple_act_map.pkl","rb") as f:
                conf["simpleActionMap"] = len(pickle.load(f))
            with open("../model/word2POS.pkl","rb") as f:
                word2POS = pickle.load(f)
                # Number of distinct POS tags; the POS embedding is half that size.
                conf["LenPOS"] = len(set(word2POS.values()))
                conf["POSex"] = conf["LenPOS"] // 2
            return conf

        confs = initConfig()
        self.raw_input_dim = confs["rawInputDim"]
        self.output_dim = confs["outputDim"]
        self.action_len = confs["simpleActionMap"]
        self.POS_len, self.POS_ex = confs["LenPOS"], confs["POSex"]
        self.midOne, self.midTwo = confs["midOne"], confs["midTwo"]
        # Buffer row = word embedding + POS embedding.
        self.bufDim = self.midOne + self.POS_ex
        # Stack edge = two midOne-sized vectors + action embedding.
        self.stkDim = self.midOne * 2 + self.action_len

        super(Parser, self).__init__(
            embedWordId = L.EmbedID(self.raw_input_dim, self.midOne),
            embedHistoryId = L.EmbedID(self.action_len, self.action_len),
            embedActionId = L.EmbedID(self.action_len, self.action_len),
            embedPOSId = L.EmbedID(self.POS_len, self.POS_ex),
            U = L.Linear(self.stkDim, self.midOne),  # stkInput => lstm
            V = L.Linear(self.bufDim, self.midOne),  # bufInput => lstm
            LS = L.LSTM(self.midOne, self.midTwo),  # for the subtree
            LA = L.LSTM(self.action_len, self.action_len),  # for the action history
            LB = L.LSTM(self.midOne, self.midTwo),  # for the buffer
            W = L.Linear(self.midTwo*2 + self.action_len, self.midTwo), # [St;At;Bt] => classifier
            G = L.Linear(self.midTwo, self.output_dim)  # output
        )

    @staticmethod
    def _asIdArray(value):
        """Wrap a single ID in the 1-element int32 array the EmbedID links take."""
        return np.asarray([value],dtype=np.int32)

    def _composeEdge(self, first, dependent, action):
        """Concatenate ``first`` with the dependent-word and action embeddings, then apply U + ReLU."""
        edge = F.concat((
            first,
            self.embedWordId(self._asIdArray(dependent)),
            self.embedActionId(self._asIdArray(action))
        ))
        return F.relu(self.U(edge))

    def minibatchTrains(self,trains):
        """
        Convert raw training samples into the three input matrices.

        param:
            trains: iterable of samples, each indexed positionally as
                sample[0] -> his: history action ID (int)
                sample[1] -> buf: [w, wlm, t]; only w (index 0, word ID) and
                             t (index 2, POS tag ID) are read here.
                             ``[-1,-1,-1]`` marks an empty buffer and yields
                             an all-zero row.
                sample[2] -> stk: sequence of [h, d, r] triples (head word ID,
                             dependent word ID, action ID), folded from the
                             last element to the first.
        return:
            (hiss, bufs, stks): one row per sample, with widths action_len,
            bufDim and midOne respectively. All three are the int 0 when
            ``trains`` is empty.
        """
        # NOTE(review): stacking row by row re-copies the accumulated matrix
        # on every sample; acceptable for small batches only.
        hiss,bufs,stks = 0,0,0
        for train in trains:
            his, buf, stk = train[0], train[1], train[2]

            # his
            his = self.embedHistoryId(self._asIdArray(his))
            hiss = F.vstack([hiss,his]) if type(hiss) != int else his

            # buf
            if buf == [-1,-1,-1]:
                buf = Variable(np.zeros((1,self.bufDim),dtype=np.float32))
            else:
                buf = F.concat(
                        (self.embedWordId(self._asIdArray(buf[0])),
                        self.embedPOSId(self._asIdArray(buf[2]))))
            bufs = F.vstack([bufs,buf]) if type(bufs) != int else buf

            # stk
            compose = 0
            for elem in stk[::-1]:
                # elem sometimes arrives as the bare scalar -1 instead of a
                # triple; observed around
                # ../auto/preprocessed/13/wsj_1353_21_0000000.pkl
                if type(compose) == int:
                    # First edge: both head and dependent come from embeddings.
                    try:
                        compose = self._composeEdge(
                            self.embedWordId(self._asIdArray(elem[0])),
                            elem[1], elem[2])
                    except Exception:
                        # Was a bare ``except:``, which also trapped
                        # KeyboardInterrupt/SystemExit during long runs.
                        sys.stderr.write("---stk loading error---")
                        sys.stderr.write("--- stk := [[-1,-1,0]]")
                        # Fallback edge [-1,-1,0]. Presumably ID -1 selects the
                        # last embedding row via negative indexing - TODO confirm.
                        compose = self._composeEdge(
                            self.embedWordId(self._asIdArray(-1)),
                            -1, 0)
                else:
                    # Later edges: the running composition replaces the head.
                    compose = self._composeEdge(compose, elem[1], elem[2])

            stks = F.vstack([stks, compose]) if type(stks) != int else compose

        return hiss,bufs,stks

    def reset_state(self):
        """Reset the hidden state of all three LSTMs."""
        self.LS.reset_state()
        self.LA.reset_state()
        self.LB.reset_state()

    def _scores(self, his, buf, stk):
        """Shared forward pass of ``__call__`` and ``pred``; returns the output-layer activations."""
        # apply V
        buf = F.relu(self.V(buf))

        # apply LSTMs
        at = F.relu(self.LA(his))
        st = F.relu(self.LS(stk))
        bt = F.relu(self.LB(buf))

        # final stage
        h1 = F.concat((st, at, bt))
        h2 = F.relu(self.W(h1))
        # NOTE(review): ReLU is applied to the final scores before softmax /
        # argmax; kept as-is so existing trained models behave the same.
        return F.relu(self.G(h2))

    def __call__(self, his, buf, stk, label):
        """
        Compute the training loss.

        params:
            his: history matrix from ``minibatchTrains``
            buf: buffer matrix from ``minibatchTrains``
            stk: stack matrix from ``minibatchTrains``
            label: y0,y1,y2 ... (class IDs, one per row)
        return: softmax cross-entropy between the scores and ``label``
        """
        return F.softmax_cross_entropy(self._scores(his, buf, stk),label)

    def pred(self, his, buf, stk):
        """Return the predicted class index for every row (argmax over axis 1)."""
        return F.argmax(self._scores(his, buf, stk), axis=1)

## ヘルパー関数

In [11]:
def composeMatrix(loader,model,test=False):
    """
    Pull one sentence from ``loader`` and turn it into network inputs.

    ``loader.gen()`` is used for training data and
    ``loader.genTestSentence()`` when ``test`` is set. Every item of the
    sentence is read as ``(features, label)``.

    Returns ``[hisMat, bufMat, stkMat, labelVec]``, or 0 once the loader
    raises IndexError (i.e. it has no more sentences).

    TODO (original author's note): get rid of the loader!! Do everything
    with batch / mini-batch learning, keep all data in pandas, and split
    it into train/test from there.
    """
    try:
        sentence = loader.gen() if test == False else loader.genTestSentence()
    except IndexError:
        print("---loader finished---")
        return 0

    features = [item[0] for item in sentence]
    labels = [item[1] for item in sentence]

    hisMat, bufMat, stkMat = model.minibatchTrains(features)
    labelVec = Variable(np.asarray(labels,dtype=np.int32))
    return [hisMat,bufMat,stkMat,labelVec]

In [9]:
# Load a single preprocessed training sample so its structure can be
# inspected in the next cell.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
path = "../auto/preprocessed/train/ud-train-9_0000035.pkl"
with open(path,"br") as f:
    test = pickle.load(f)

In [10]:
test

([1,
  [-1, -1, -1],
  [[139, 138, 0], [139, 137, 0], [139, 141, 1], [141, 140, 0], [141, 35, 0]]],
 1)

In [12]:
def backupModel(model,epoch,dirpath="../model/"):
    """
    Save ``model`` as an HDF5 snapshot named
    ``parserModel_ep<epoch>_<unix seconds>.mod`` inside ``dirpath``.

    param:
        model: the object handed to ``serializers.save_hdf5``
        epoch: epoch number embedded in the file name
        dirpath: target directory (previously ignored; the path was
                 hard-coded to "../model/")
    return: the path the snapshot was written to
    """
    now = datetime.datetime.now()
    # '%s' is a glibc-only strftime extension; timestamp() yields the same
    # epoch seconds on every platform.
    stamp = str(int(now.timestamp()))
    modelName = os.path.join(dirpath, "parserModel" +"_"+ "ep" + str(epoch) +"_"+ stamp + ".mod")
    serializers.save_hdf5(modelName, model)
    return modelName

In [13]:
def evaluate(model, loader):
    """
    Run ``model`` over every test sentence of ``loader`` and report accuracy.

    Sentences are fetched through ``composeMatrix(..., test=True)`` until it
    returns a falsy value. The LSTM state is reset after each sentence.
    Prints "correct / cnt" and returns ``(predictions, gold)`` as two int32
    arrays.
    """
    predList, goldList = [], []
    correct = 0
    cnt = 0
    while True:
        batch = composeMatrix(loader,model,test=True)
        if not batch:
            break

        hisMat, bufMat, stkMat, testVec = batch[0], batch[1], batch[2], batch[3]
        for pred, gold in zip(model.pred(hisMat,bufMat,stkMat), testVec):
            predList.append(pred)
            goldList.append(gold)
            if pred.data == gold.data:
                correct += 1
            cnt += 1
        model.reset_state()

    print("correct / cnt:", correct, "/", cnt)
    predArr = np.asarray(predList,dtype=np.int32)
    goldArr = np.asarray(goldList,dtype=np.int32)
    return predArr,goldArr


## main

In [9]:
# Build the parser network and attach a plain SGD optimizer to it.
model = Parser()
optimizer = optimizers.SGD()
optimizer.setup(model)
# Start from a clean slate: reset the three LSTM states and clear gradients.
model.reset_state()
model.cleargrads()