In [1]:
from collections import Counter
import pickle
from keras_preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential,load_model
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Dropout
from keras_contrib.layers.crf import CRF

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Build the vocabulary / tag lookup tables
def getWordAndTagId(filePath, maxVocab=4000):
    '''
    Build word<->id and tag<->id lookup tables from a token-per-line corpus.

    A line contributes only when it holds exactly two whitespace-separated
    fields ("word tag"); every other line (for example the blank line between
    sentences) is ignored.

    :param filePath: path of the UTF-8 encoded corpus file
    :param maxVocab: number of most frequent words to keep (default 4000)
    :return: (word2id, tag2id, id2word, id2tag). Word ids start at 1 in order
             of decreasing frequency, 0 is '<PAD>' and the id right after the
             last kept word is '<UNK>'. Tag ids start at 0, most frequent first.
    '''
    with open(filePath, encoding='utf-8') as file:
        # split() rather than split(' '): getSentencesAndTags tokenises lines
        # with split(), and both functions must accept exactly the same lines,
        # otherwise a sentence may contain a tag that is missing from tag2id.
        pairs = [fields for fields in (line.split() for line in file) if len(fields) == 2]

    wordCounts = Counter(word for word, _ in pairs)
    tagCounts = Counter(tag for _, tag in pairs)

    # sorted() is stable, so ties keep their order of first appearance.
    keptWords = sorted(wordCounts.items(), key=lambda item: -item[1])[:maxVocab]
    rankedTags = sorted(tagCounts.items(), key=lambda item: -item[1])

    word2id = {word: index for index, (word, _) in enumerate(keptWords, start=1)}
    tag2id = {tag: index for index, (tag, _) in enumerate(rankedTags)}
    word2id['<PAD>'] = 0
    word2id['<UNK>'] = len(keptWords) + 1

    # Invert after the special tokens were added so that id2word really is the
    # inverse of word2id (it previously lacked '<PAD>' and '<UNK>').
    id2word = {index: word for word, index in word2id.items()}
    id2tag = {index: tag for tag, index in tag2id.items()}
    return word2id, tag2id, id2word, id2tag
# Build the lookup tables from the training corpus; the resulting module-level
# names are used by the cells below.
word2id, tag2id, id2word, id2tag = getWordAndTagId('train.txt')

def saveWordAndTagId(word2id,tag2id):
    '''
    Pickle the two lookup tables to 'data/word2id' and 'data/tag2id'.

    :param word2id: word -> id mapping to persist
    :param tag2id: tag -> id mapping to persist
    :return: None
    '''
    import os  # local import: only needed to make sure the target directory exists

    os.makedirs('data', exist_ok=True)
    # Context managers close each file even when pickle.dump raises.
    with open('data/word2id', 'wb') as word2idFile:
        pickle.dump(word2id, word2idFile)
    with open('data/tag2id', 'wb') as tag2idFile:
        pickle.dump(tag2id, tag2idFile)
# The call below is commented out; enable it to persist the mappings so they can be reused next time.
#saveWordAndTagId(word2id, tag2id)


def getSentencesAndTags(filePath):
    '''
    Read sentences and their per-token tags from a token-per-line file.

    A line with exactly two whitespace-separated fields is a "token tag" pair
    of the current sentence; any other line (typically a blank line) ends the
    current sentence.

    :param filePath: path of the UTF-8 encoded corpus file
    :return: (sentences, tags) - two parallel lists; sentences[i] is the list
             of tokens of sentence i and tags[i] the matching list of tags
    '''
    sentences, tags = [], []
    sentence, tag = [], []

    def flush():
        # Store the sentence collected so far. Empty ones (produced by
        # consecutive separator lines) are skipped: they carry no training signal.
        if sentence:
            sentences.append(list(sentence))
            tags.append(list(tag))
            del sentence[:]
            del tag[:]

    with open(filePath, encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if len(fields) == 2:
                sentence.append(fields[0])
                tag.append(fields[1])
            else:
                flush()
    # The file may not end with a separator line; without this final flush the
    # last sentence would be silently dropped.
    flush()
    return sentences, tags
# Load the sentences and their tags from the training corpus
sentences, tags=getSentencesAndTags('train.txt')

In [4]:
def sentencesAndTags2id(sentences,tags,word2id, tag2id):
    '''
    Convert sentences and tag sequences to integer id sequences.

    :param sentences: list of sentences, each a sequence of tokens
    :param tags: list of tag sequences parallel to sentences
    :param word2id: token -> id mapping; tokens missing from it are encoded
                    with the '<UNK>' id
    :param tag2id: tag -> id mapping; an unknown tag raises KeyError
    :return: (sentencesIds, tagsIds) as lists of lists of ints
    '''
    # Use the id reserved for '<UNK>'. The old fallback, len(word2id), pointed
    # one past it, so unknown tokens never shared the '<UNK>' embedding row.
    # The len() fallback is kept only for mappings without an '<UNK>' entry.
    unkId = word2id.get('<UNK>', len(word2id))
    sentencesIds = [[word2id.get(char, unkId) for char in sentence] for sentence in sentences]
    tagsIds = [[tag2id[label] for label in tag] for tag in tags]
    return sentencesIds,tagsIds
# Convert the sentences and tags to integer id sequences
sentencesIds, tagsIds = sentencesAndTags2id(sentences, tags,word2id, tag2id)


In [5]:
# Pad sentences and tags with trailing zeros (padding='post') so that every row
# has the same length and the model input dimension is consistent.
# NOTE(review): the names are rebound from lists of ids to padded arrays here.
sentencesIds = pad_sequences(sentencesIds, padding='post')
tagsIds = pad_sequences(tagsIds, padding='post')
print(sentencesIds.shape)
print(tagsIds.shape)

(70, 91)
(70, 91)


In [6]:



def loadWordAndTagId():
    '''
    Load the lookup tables previously written by saveWordAndTagId.

    :return: (word2id, tag2id) read from 'data/word2id' and 'data/tag2id'
    '''
    # The files must be opened for reading ('rb'). The previous 'wb' mode
    # truncated the saved pickles and made pickle.load fail.
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this notebook wrote itself.
    with open('data/word2id', 'rb') as word2idFile:
        word2id = pickle.load(word2idFile)
    with open('data/tag2id', 'rb') as tag2idFile:
        tag2id = pickle.load(tag2idFile)
    return word2id, tag2id

def model(vocabSize,embeddingDim,inputLength,tagSize):
    '''
    Build, compile and summarise the Embedding -> BiLSTM -> Dense -> CRF tagger.

    :param vocabSize: vocabulary size; the embedding table gets vocabSize + 1 rows
    :param embeddingDim: width of each embedding vector
    :param inputLength: fixed (padded) sequence length of the input
    :param tagSize: number of distinct tags, used for the Dense and CRF layers
    :return: the compiled Sequential model (its summary is printed as a side effect)
    '''
    network = Sequential()

    # Frozen embedding table (trainable=False); index 0 is treated as padding
    # and masked out for the downstream layers (mask_zero=True).
    embedding = Embedding(
        vocabSize + 1,
        embeddingDim,
        input_length=inputLength,
        trainable=False,
        mask_zero=True,
    )
    network.add(embedding)

    # 50 LSTM units per direction, one output vector per time step.
    network.add(Bidirectional(LSTM(50, return_sequences=True)))

    # Per-time-step projection to one score per tag.
    network.add(TimeDistributed(Dense(tagSize)))

    # The CRF layer supplies both the loss and the accuracy metric.
    crf = CRF(tagSize, sparse_target=True)
    network.add(crf)

    network.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    network.summary()
    return network


def get_triple(y_labels,input_data):
    '''
    Print the subject-predicate->object triples described by a tagged sentence.

    :param y_labels: list of tag strings ('B-SUBJECT', 'I-SUBJECT',
                     'B-PREDICATE', 'I-PREDICATE', 'B-OBJECT', 'I-OBJECT',
                     anything else is ignored), one per token
    :param input_data: the tokens (e.g. the characters of the sentence),
                       aligned with y_labels; must support slicing
    :return: None, results are printed
    '''
    subjects, predicates, objects = '', '', ''
    # Concatenate the tokens of each role. Every 'B-' tag starts a new span,
    # which is separated from the previous one by a single space.
    for s, t in zip(input_data, y_labels):
        if t in ('B-SUBJECT', 'I-SUBJECT'):
            subjects += ' ' + s if (t == 'B-SUBJECT') else s
        if t in ('B-PREDICATE', 'I-PREDICATE'):
            predicates += ' ' + s if (t == 'B-PREDICATE') else s
        if t in ('B-OBJECT', 'I-OBJECT'):
            objects += ' ' + s if (t == 'B-OBJECT') else s
    y_counter = Counter(y_labels)
    # subject, predicate, object[, object ...]
    if y_counter['B-SUBJECT'] == 1 and y_counter['B-PREDICATE'] == 1 and y_counter['B-OBJECT'] >= 1:
        for obj in objects.strip().split(' '):
            print('抽取结果：',subjects+'-'+predicates+'->'+obj)
    # subject, predicate, object, predicate, object ...
    elif y_counter['B-SUBJECT'] == 1 and y_counter['B-PREDICATE'] > 1 and y_counter['B-OBJECT'] >= 1:
        predicateSpans = predicates.strip().split(' ')
        objectSpans = objects.strip().split(' ')
        # zip stops at the shorter list, so a predicate without a matching
        # object no longer raises IndexError.
        for predicate, obj in zip(predicateSpans, objectSpans):
            print('抽取结果：',subjects+'-'+predicate+'->'+obj)
    # several subjects: retry on the part that starts at the first 'I-OBJECT'
    elif y_counter['B-SUBJECT'] > 1:
        if 'I-OBJECT' not in y_labels:
            # Nothing to split on (previously a ValueError from .index()).
            return
        cut = y_labels.index('I-OBJECT')
        if cut == 0:
            # Slicing at 0 would recurse on the same input forever.
            return
        get_triple(y_labels[cut:],input_data[cut:])


def predict(model,input_data,length,word2id,id2tag):
    '''
    Tag one sentence with the model and print the extracted triples.

    :param model: trained tagger exposing predict()
    :param input_data: the raw sentence; it is processed character by character
    :param length: sequence length the model was built with; longer input is
                   cut to its first `length` characters
    :param word2id: character -> id mapping used during training
    :param id2tag: tag id -> tag string mapping
    :return: None, the triples are printed by get_triple
    '''
    # Characters beyond the model's input length cannot be tagged. Cutting here
    # keeps the labels aligned with the characters handed to get_triple.
    input_data = input_data[:length]
    if not input_data:
        return

    # Unknown characters share the reserved '<UNK>' id (len(word2id) is only a
    # fallback for mappings that have no such entry).
    unkId = word2id.get('<UNK>', len(word2id))
    ids = [word2id.get(char, unkId) for char in input_data]

    # Pad at the end, exactly like the training data (padding='post').
    padded = pad_sequences([ids], maxlen=length, padding='post', truncating='post')
    y = model.predict(padded)
    # The output is 3-D (batch, step, tag); flatten the batch of one. The tag
    # dimension is taken from the output instead of being hard-coded to 7.
    y = y.reshape([-1, y.shape[-1]])
    y = np.argmax(y,axis=1)
    # Drop the padded positions, which now sit at the end.
    y = y[:len(input_data)]
    y = [id2tag[i] for i in y]
    get_triple(y,input_data)
    # print(y)

In [7]:
# Build and compile a fresh network: vocabulary size len(word2id), 100-dimensional
# embeddings, the padded sentence length as input length, len(tag2id) tags.
# NOTE(review): this rebinds the name `model` from the factory function to the
# returned model object, so the cell cannot be re-run without first re-running
# the cell that defines `model`.
model=model(len(word2id),100,sentencesIds.shape[1],len(tag2id))




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 91, 100)           60000     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 91, 100)           60400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 91, 7)             707       
_________________________________________________________________
crf_1 (CRF)                  (None, 91, 7)             119       
Total params: 121,226
Trainable params: 61,226
Non-trainable params: 60,000
_________________________________________________________________


In [6]:
"""
下面这两行是做700个epoch训练模型并保存，如果只调用的话需要将其注释掉
"""
# The note above says: the two lines below train the model for 700 epochs and
# save it; comment them out when the notebook is only used for inference.
# The tag ids are reshaped to (number of sentences, sequence length, 1) before
# being passed as targets - presumably the layout the sparse-target CRF loss
# expects; TODO confirm.
history = model.fit(sentencesIds, tagsIds.reshape([len(tagsIds),-1,1]), epochs=700)
model.save('kg-mode.model')




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 91, 100)           60000     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 91, 100)           60400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 91, 7)             707       
_________________________________________________________________
crf_1 (CRF)                  (None, 91, 7)             119       
Total params: 121,226
Trainable params: 61,226
Non-trainable params: 60,000
_________________________________________________________________

Epoch 1/700


2022-09-27 14:27:54.893786: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-09-27 14:27:54.902943: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2399895000 Hz
2022-09-27 14:27:54.906743: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55e18fdcc1d0 executing computations on platform Host. Devices:
2022-09-27 14:27:54.906775: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>


Epoch 2/700
Epoch 3/700
Epoch 4/700
Epoch 5/700
Epoch 6/700
Epoch 7/700
Epoch 8/700
Epoch 9/700
Epoch 10/700
Epoch 11/700
Epoch 12/700
Epoch 13/700
Epoch 14/700
Epoch 15/700
Epoch 16/700
Epoch 17/700
Epoch 18/700
Epoch 19/700
Epoch 20/700
Epoch 21/700
Epoch 22/700
Epoch 23/700
Epoch 24/700
Epoch 25/700
Epoch 26/700
Epoch 27/700
Epoch 28/700
Epoch 29/700
Epoch 30/700
Epoch 31/700
Epoch 32/700
Epoch 33/700
Epoch 34/700
Epoch 35/700
Epoch 36/700
Epoch 37/700
Epoch 38/700
Epoch 39/700
Epoch 40/700
Epoch 41/700
Epoch 42/700
Epoch 43/700
Epoch 44/700
Epoch 45/700
Epoch 46/700
Epoch 47/700
Epoch 48/700
Epoch 49/700
Epoch 50/700
Epoch 51/700
Epoch 52/700
Epoch 53/700
Epoch 54/700
Epoch 55/700
Epoch 56/700
Epoch 57/700
Epoch 58/700
Epoch 59/700
Epoch 60/700
Epoch 61/700
Epoch 62/700
Epoch 63/700
Epoch 64/700
Epoch 65/700
Epoch 66/700
Epoch 67/700
Epoch 68/700
Epoch 69/700
Epoch 70/700
Epoch 71/700
Epoch 72/700
Epoch 73/700
Epoch 74/700
Epoch 75/700
Epoch 76/700
Epoch 77/700
Epoch 78/700
Epoch 7

In [8]:
# NOTE(review): this import sits in the middle of the notebook; it belongs in
# the import cell at the top.
from keras_contrib.layers.crf import CRF, crf_loss, crf_viterbi_accuracy
# Reload the saved model from 'kg-mode.model'. The CRF layer class, its loss and
# its accuracy metric are handed over through custom_objects.
model = load_model('kg-mode.model', custom_objects={"CRF": CRF, 'crf_loss': crf_loss,'crf_viterbi_accuracy': crf_viterbi_accuracy})




2022-09-28 05:27:23.515625: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-09-28 05:27:23.523713: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2399895000 Hz
2022-09-28 05:27:23.526880: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55db84dffaa0 executing computations on platform Host. Devices:
2022-09-28 05:27:23.526913: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>


In [None]:

# Interactive extraction loop: read a sentence, print its triples, repeat.
# An empty line ends the loop (it used to run forever).
while True:
    # `sentence` instead of `str`, so the builtin is not shadowed for the rest
    # of the notebook. Surrounding whitespace is not part of the sentence.
    sentence = input("原始句子>").strip()
    if not sentence:
        break
    predict(model,sentence,sentencesIds.shape[1],word2id,id2tag)

原始句子> 李白和王维同岁，都是孟浩然的好友
