In [1]:
import numpy
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import pickle
import platform

Using TensorFlow backend.


In [2]:
def _parse_data(fh): # 将原始数据按句划分


    split_text = '\n'

    string = fh.read().decode('utf-8')
    data = [[row.split() for row in sample.split(split_text)] for
            sample in
            string.strip().split(split_text + split_text)]
    fh.close()
    return data


In [3]:
train = _parse_data(open('./train_data.data', 'rb'))
test = _parse_data(open('./test_data.data', 'rb'))

In [4]:
#word_counts：每个字及其出现的次数
word_counts = Counter(row[0].lower() for sample in train for row in sample)

In [5]:
#vocab: 词汇表(疑问：为什么是出现两次以上？)
vocab = [w for w, f in iter(word_counts.items()) if f >= 2]

In [6]:
#chunk_tags：标签
chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]

In [7]:
#保存词汇表和标签
with open('./config.pkl', 'wb') as outp:
    pickle.dump((vocab, chunk_tags), outp)

In [8]:
#
def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None: # 如果最大句长没有定义，则找出最大句长
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))# 为词汇创建索引
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # 得到每句话的词汇索引，默认值为1
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data] # 得到每句话的标签索引

    x = pad_sequences(x, maxlen)  # left padding 序列预处理，不足maxlen的，在前面补0

    y_chunk = pad_sequences(y_chunk, maxlen, value=-1) # 序列预处理，不足maxlen的，在前面补-1

    if onehot:
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk] # one-hot表示
    else:
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_chunk

In [9]:
train = _process_data(train, vocab, chunk_tags)
test = _process_data(test, vocab, chunk_tags)

In [13]:
train_x.shape , train_y.shape

((50658, 100), (50658, 100, 1))

In [14]:
test_x.shape , test_y.shape

((4631, 100), (4631, 100, 1))

In [16]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
import process_data
import pickle

EMBED_DIM = 200
BiRNN_UNITS = 200

In [17]:
model = Sequential()
model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
crf = CRF(len(chunk_tags), sparse_target=True)
model.add(crf)
model.summary()
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         851600    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 200)         240800    
_________________________________________________________________
crf_1 (CRF)                  (None, None, 7)           1470      
Total params: 1,093,870
Trainable params: 1,093,870
Non-trainable params: 0
_________________________________________________________________




In [18]:
EPOCHS = 10
# train model
model.fit(train_x, train_y,batch_size=16,epochs=EPOCHS, validation_data=[test_x, test_y])
model.save('./crf_MSRA.h5')

Instructions for updating:
Use tf.cast instead.
Train on 50658 samples, validate on 4631 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
score = model.evaluate(test_x, test_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 7.877739022653835
Test accuracy: 0.980469312685351


In [36]:
maxlen=100 # 最大句长
predict_text = '''中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下，
连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'''

In [37]:
word2idx = dict((w, i) for i, w in enumerate(vocab))
x = [word2idx.get(w[0].lower(), 1) for w in predict_text]
length = len(x)
x = pad_sequences([x], maxlen)
model.load_weights('./crf1.h5')
raw = model.predict(x)[0][-length:]
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]

In [40]:
per, loc, org = '', '', ''

for s, t in zip(predict_text, result_tags):
    if t in ('B-PER', 'I-PER'):
        per += ' ' + s if (t == 'B-PER') else s
    if t in ('B-ORG', 'I-ORG'):
        org += ' ' + s if (t == 'B-ORG') else s
    if t in ('B-LOC', 'I-LOC'):
        loc += ' ' + s if (t == 'B-LOC') else s

print(' person:' + per, '\n',
      'location:' + loc, '\n', 
      'organzation:' + org, '\n',)

 person: 周恩来 陈毅 
 location: 埃塞俄比亚 非洲 阿尔巴尼亚 
 organzation: 中华人民共和国国务院 外交部 

