In [89]:
import pickle
import pdb
import codecs
import re
import sys
import math
import numpy as np
from keras.preprocessing.sequence import pad_sequences

In [79]:
# Restore the preprocessed corpus. The file holds ten objects pickled one after
# another, so they have to be read back in exactly this order.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('../renmindata.pkl', 'rb') as inp:
    (word2id, id2word, tag2id, id2tag,
     x_train, y_train,
     x_test, y_test,
     x_valid, y_valid) = (pickle.load(inp) for _ in range(10))

# Report the size of each split and of the vocabulary.
for label, collection in (("train len:", x_train),
                          ("valid len:", x_valid),
                          ("test len:", x_test),
                          ("word2id len", word2id)):
    print(label, len(collection))
print('Creating the data generator ...')

train len: 24271
valid len: 6068
test len: 7585
word2id len 3917
Creating the data generator ...


In [80]:
def _as_sparse_targets(labels):
    """Return `labels` with a length-1 axis at position 2, adding it only if missing.

    The CRF layer in this notebook is built with ``sparse_target=True`` and is
    fed label arrays carrying that extra axis. A plain ``np.expand_dims`` adds
    another axis on every call, so re-running the cell would keep growing the
    rank; an array that is already 3-D is therefore returned unchanged.
    """
    labels = np.asarray(labels)
    if labels.ndim == 3:
        # Already expanded by an earlier run of this cell.
        return labels
    return np.expand_dims(labels, 2)


y_train = _as_sparse_targets(y_train)
y_valid = _as_sparse_targets(y_valid)
y_test = _as_sparse_targets(y_test)

In [81]:
# Sanity check: display the shapes of the training inputs and training labels.
tuple(split.shape for split in (x_train, y_train))

((24271, 60), (24271, 60, 1))

In [82]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
import process_data
import pickle

# EMBED_DIM   - output size of the Embedding layer.
# BiRNN_UNITS - total width of the bidirectional LSTM; each direction is built
#               with BiRNN_UNITS // 2 units.
EMBED_DIM, BiRNN_UNITS = 200, 200

In [83]:
# Sequence tagger: Embedding -> bidirectional LSTM -> CRF output layer.
# The CRF instance is kept in its own variable because its loss and accuracy
# functions are needed again when compiling.
crf = CRF(len(tag2id), sparse_target=True)
model = Sequential([
    # Randomly initialised embeddings (no pretrained weights are supplied);
    # mask_zero=True makes id 0 a masked padding value.
    Embedding(len(word2id), EMBED_DIM, mask_zero=True),
    # Half of the units per direction, so both directions together give BiRNN_UNITS.
    Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)),
    crf,
])
model.summary()
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 200)         783400    
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 200)         240800    
_________________________________________________________________
crf_4 (CRF)                  (None, None, 11)          2354      
Total params: 1,026,554
Trainable params: 1,026,554
Non-trainable params: 0
_________________________________________________________________


In [84]:
EPOCHS = 10

# Train on the training split, monitoring the validation split after each epoch,
# then persist the trained model to disk.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=EPOCHS,
    validation_data=[x_valid, y_valid],
)
model.save('./crf_renmin.h5')

Train on 24271 samples, validate on 6068 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [85]:
# Score the held-out test split: index 0 is the loss and index 1 is the single
# metric registered at compile time (the CRF accuracy).
score = model.evaluate(x_test, y_test, verbose=0)
for label, position in (('Test loss:', 0), ('Test accuracy:', 1)):
    print(label, score[position])

Test loss: 14.920771443883133
Test accuracy: 0.9577522348084573


In [86]:
def process_data(data, maxlen=60):
    """Encode raw text into a fixed-length id matrix for prediction.

    NOTE(review): this definition shadows the ``process_data`` module that is
    imported earlier in the file.

    Args:
        data: Iterable of tokens; a plain string is consumed one character at a
            time. The lower-cased first character of each item is looked up in
            the global ``word2id`` mapping.
        maxlen: Fixed sequence length of the returned matrix.

    Returns:
        A pair ``(x, length)``. ``x`` is the ``(1, maxlen)`` output of
        ``pad_sequences``, zero-padded on the right. ``length`` is the number of
        leading positions in ``x`` that hold real tokens; it never exceeds
        ``maxlen``.
    """
    # Unknown tokens fall back to id 1 - presumably the vocabulary's
    # "unknown" entry; TODO confirm against the preprocessing script.
    ids = [word2id.get(w[0].lower(), 1) for w in data]
    # pad_sequences drops tokens from the FRONT by default when the input is
    # longer than maxlen. That would make row i of the prediction describe a
    # character other than data[i], so truncate at the back instead and report
    # the clamped length so callers slicing with it stay aligned with the input.
    x = pad_sequences([ids], maxlen=maxlen, padding='post', truncating='post')
    length = min(len(ids), maxlen)
    return x, length

In [93]:
predict_text = '''中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下，
连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'''
# Deliberately not named `str`: rebinding that name would hide the builtin for
# every cell executed afterwards in the same kernel.
encoded, length = process_data(predict_text)
# Load the file written by model.save() in the training cell, so the notebook
# runs from a fresh kernel without relying on a weights file produced elsewhere.
model.load_weights('./crf_renmin.h5')
# Keep only the rows that correspond to real input positions (drop the padding).
raw = model.predict(encoded)[0][:length]
result = [np.argmax(row) for row in raw]
result_tags = [id2tag[i] for i in result]

# Collect entity text per category. Tags look like '<B|M|E>_<category>'; a 'B'
# tag opens a new entity, which is separated from the previous one by a space.
entities = {'nt': '', 'nr': '', 'ns': ''}
for s, t in zip(predict_text, result_tags):
    position, _, category = t.partition('_')
    if category in entities and position in ('B', 'M', 'E'):
        entities[category] += (' ' + s) if position == 'B' else s

nt, nr, ns = entities['nt'], entities['nr'], entities['ns']

# Labels are kept exactly as they were (including the 'organzation' spelling)
# so the printed text still matches the output recorded in this notebook.
print(' organzation:' + nt, '\n',
      'person:' + nr, '\n',
      'location:' + ns, '\n',)

 organzation: 中华人民共和国国务院 
 person: 周恩来 陈毅 
 location: 埃塞俄比亚 非洲 阿尔巴尼亚 

