[简明条件随机场CRF介绍（附带纯Keras实现）](https://spaces.ac.cn/archives/5542/comment-page-1)

In [None]:
### 结构化学习


In [None]:
CRF层本质上是一个带训练参数的loss计算层，因此CRF层只用来训练模型，


而预测则需要另外建立模型。???

In [None]:
from keras.layers import Layer
import keras.backend as K

class CRF(Layer):
    """Pure-Keras implementation of a linear-chain CRF layer.

    The layer is essentially a loss-computation layer with trainable
    parameters (the label-transition matrix ``trans``): ``call`` returns its
    input unchanged, while ``loss`` and ``accuracy`` take ``(y_true, y_pred)``
    like a Keras loss / metric. It is therefore only useful for training;
    prediction needs a separately built model.

    NOTE(review): the original note ended with "???" -- no decoding
    (e.g. Viterbi) method is implemented in this class.
    """

    def __init__(self, ignore_last_label=False, **kwargs):
        """Create the layer.

        Args:
            ignore_last_label: if truthy, the last label channel is not a
                real label but a flag used as a mask.
            **kwargs: forwarded to ``Layer.__init__``.
        """
        # Stored as 0/1 so it can be subtracted from the label count in build().
        self.ignore_last_label = 1 if ignore_last_label else 0
        super(CRF, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the (num_labels, num_labels) transition matrix."""
        self.num_labels = input_shape[-1] - self.ignore_last_label
        self.trans = self.add_weight(name='crf_trans',
                                     shape=(self.num_labels, self.num_labels),
                                     initializer='glorot_uniform',
                                     trainable=True)
        super(CRF, self).build(input_shape)

    def log_norm_step(self, inputs, states):
        """One step of the recursive log-normaliser computation (K.rnn step).

        Key points: 1. the normaliser is computed recursively;
        2. logsumexp is used to avoid overflow.
        Trick: expand_dims aligns the tensors for broadcasting.

        Args:
            inputs: per-label scores of the current time step.
            states: list whose first element is the accumulated log-score
                vector of the previous step.

        Returns:
            ``(output, [new_state])`` where both hold the updated vector.
        """
        states = K.expand_dims(states[0], 2)  # (batch_size, output_dim, 1)
        trans = K.expand_dims(self.trans, 0)  # (1, output_dim, output_dim)
        output = K.logsumexp(states + trans, 1)  # (batch_size, output_dim)
        return output + inputs, [output + inputs]

    def path_score(self, inputs, labels):
        """Compute the unnormalised log-score of the target path.

        Key point: sum of the per-step label scores plus the transition
        scores between consecutive labels.
        Trick: multiplying the predictions by the one-hot targets picks out
        the scores that lie on the target path.
        """
        # Per-step label scores along the target path.
        point_score = K.sum(K.sum(inputs * labels, 2), 1, keepdims=True)
        # Two label tensors shifted by one step; their outer product selects
        # the target transition from the transition matrix at every step.
        labels1 = K.expand_dims(labels[:, :-1], 3)
        labels2 = K.expand_dims(labels[:, 1:], 2)
        labels = labels1 * labels2
        trans = K.expand_dims(K.expand_dims(self.trans, 0), 0)
        trans_score = K.sum(K.sum(trans * labels, [2, 3]), 1, keepdims=True)
        return point_score + trans_score  # sum of both parts

    def call(self, inputs):
        """Return ``inputs`` unchanged; the CRF only contributes a loss."""
        return inputs

    def loss(self, y_true, y_pred):
        """Negative log-likelihood of the target path.

        ``y_true`` must be one-hot encoded. When ``ignore_last_label`` is
        set, its last channel is read as the mask flag.
        """
        # 1 - flag channel gives the keep-mask; step 0 is skipped because it
        # seeds the recursion as the initial state.
        mask = 1 - y_true[:, 1:, -1] if self.ignore_last_label else None
        y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
        init_states = [y_pred[:, 0]]  # initial state
        # Log of the Z vector (one entry per final label).
        log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states, mask=mask)
        log_norm = K.logsumexp(log_norm, 1, keepdims=True)  # log Z
        path_score = self.path_score(y_pred, y_true)  # log numerator
        return log_norm - path_score  # i.e. -log(numerator / denominator)

    def accuracy(self, y_true, y_pred):
        """Per-step accuracy shown during training, excluding masked steps."""
        mask = 1 - y_true[:, :, -1] if self.ignore_last_label else None
        y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
        isequal = K.equal(K.argmax(y_true, 2), K.argmax(y_pred, 2))
        isequal = K.cast(isequal, 'float32')
        # Identity check: `== None` on a tensor is an overloaded comparison,
        # not a test for "no mask".
        if mask is None:
            return K.mean(isequal)
        return K.sum(isequal * mask) / K.sum(mask)

### CRF-Keras
- input: (num_samples, time_steps, num_features)
- output: (num_samples, time_steps, one-hot label)

$$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z}
\sum_{y_1, ..., y_n} \exp(-a_1' y_1 - a_n' y_n
    - \sum_{k=1}^{n}((f(x_k' W + b) y_k) + y_1' U y_2)), $$
    
where:

$Z$: normalization constant

$x_k, y_k$:  inputs and outputs

In [18]:
import numpy as np

# Two identical toy samples, each a sequence of 5 steps with 2 features,
# kept as plain nested Python lists.
xs = [
    [[first, second]
     for first, second in ((2, 3), (3, 4), (4, 5), (5, 6), (5, 7))]
    for _ in range(2)
]
# xs = xs[:, :, np.newaxis]
xs

[array([[2, 3],
        [3, 4],
        [4, 5],
        [5, 6],
        [5, 7]]), array([[2, 3],
        [3, 4],
        [4, 5],
        [5, 6],
        [5, 7]])]

In [13]:
# One-hot targets: every step of both samples carries label 0, i.e. [1, 0].
ys = np.tile(np.array([1, 0]), (2, 5, 1))
ys

array([[[1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0]],

       [[1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0]]])

In [17]:
from keras.preprocessing.sequence import pad_sequences
# Pad every sample to 6 time steps; in the result shown below the extra
# all-zero step sits at the front of each sequence.
x = pad_sequences(xs, maxlen=6)  # int32 array per the displayed output
x

array([[[0, 0],
        [2, 3],
        [3, 4],
        [4, 5],
        [5, 6],
        [5, 7]],

       [[0, 0],
        [2, 3],
        [3, 4],
        [4, 5],
        [5, 6],
        [5, 7]]], dtype=int32)

In [5]:
# NOTE(review): .shape needs an ndarray, but the cell that defines xs builds a
# plain Python list -- this output came from a different execution order.
xs.shape  # (num_samples, time_steps, num_features)

(2, 5, 2)

In [6]:
# A single label-id sequence of length 4.
y = [[1,1,2,3]]
# Pad it to length 6 using -1 as the fill value; the output below shows the
# fill values placed before the real labels.
y_chunk = pad_sequences(y, maxlen=6, value=-1)
y_chunk

array([[-1, -1,  1,  1,  2,  3]], dtype=int32)

In [7]:
from keras.models import Sequential
# NOTE: this import rebinds the name CRF, replacing the pure-Keras CRF class
# defined earlier in this notebook with the keras_contrib implementation.
from keras_contrib.layers import CRF
model = Sequential()
# Encoder layers left disabled: the CRF layer is the model's only layer here.
# model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
#     model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True))) # 
#     model.add(TimeDistributed(Dense(len(chunk_tags))))

# sparse_target indicates whether the provided labels are one-hot (False)
# or indices (True, with shape 1 at dim 3).
crf = CRF(2, sparse_target=False) 
model.add(crf)
#     model.summary()
# The loss and the accuracy metric are both taken from the CRF layer instance.
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
# model.compile('adam', loss=losses.crf_loss, metrics=[metrics.crf_accuracy])







In [8]:
# Train for a single epoch with a batch size of 2; keep the returned object in `history`.
history = model.fit(xs, ys,batch_size=2,epochs=1)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/1


In [9]:
# model.fit(xs,res)

# Inspect only the shape of the predictions (see the output below).
model.predict(xs).shape

(2, 5, 2)

In [10]:
# Evaluate on the training data; the two values shown below presumably are
# the compiled loss followed by the CRF accuracy metric.
model.evaluate(xs,ys)



[0.09824888408184052, 1.0]

In [23]:
# BIO tag set: 'O' plus a begin (B-) and inside (I-) tag for each of the
# PER, LOC and ORG entity types.
chunk_tags = ['O'] + [
    prefix + '-' + entity
    for entity in ('PER', 'LOC', 'ORG')
    for prefix in ('B', 'I')
]
len(chunk_tags)

7

In [11]:
from keras_contrib import losses,metrics
# The trailing `?` is IPython help syntax (not plain Python): it displays the
# signature, docstring and source file of crf_accuracy, as shown below.
metrics.crf_accuracy?

[0;31mSignature:[0m [0mmetrics[0m[0;34m.[0m[0mcrf_accuracy[0m[0;34m([0m[0my_true[0m[0;34m,[0m [0my_pred[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m Ge default accuracy based on CRF `test_mode`.
[0;31mFile:[0m      /usr/local/lib/python3.6/dist-packages/keras_contrib-2.0.8-py3.6.egg/keras_contrib/metrics/crf_accuracies.py
[0;31mType:[0m      function


### LSTM+CRF

In [31]:

from keras.models import Sequential
from keras_contrib.layers import CRF
from keras.layers import Embedding, Bidirectional, LSTM
lstm_crf_model = Sequential()
# Embedding layer left disabled, so the model consumes the feature vectors directly.
# model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
# Bidirectional LSTM with 100 // 2 = 50 units per direction; return_sequences=True
# keeps one output vector per time step for the CRF layer that follows.
lstm_crf_model.add(Bidirectional(LSTM(100 // 2, return_sequences=True))) # 
# lstm_crf_model.add(TimeDistributed(Dense(len(chunk_tags))))
# One CRF output per entry of chunk_tags; sparse_target=True means the labels
# are expected as indices rather than one-hot vectors.
crf = CRF(len(chunk_tags), sparse_target=True) 
lstm_crf_model.add(crf)
#     model.summary()
lstm_crf_model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

In [32]:
# No fit call for lstm_crf_model appears before this cell, so the tags shown
# below presumably come from the freshly initialised weights.
lstm_crf_model.predict(xs)

array([[[0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.]],

       [[0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.]]], dtype=float32)