In [1]:
import numpy as np
from sklearn.model_selection import ShuffleSplit
from data_utils import ENTITIES, Documents, Dataset, SentenceExtractor, make_predictions
from data_utils import Evaluator
from models import build_lstm_crf_model
from gensim.models import Word2Vec

Using TensorFlow backend.


In [2]:
# Show the ENTITIES list imported from data_utils; it is used below to build the label index.
ENTITIES

['GO_UBERON_EXT',
 'CHEBI_GO_SO_EXT',
 'SO',
 'CHEBI_GO_EXT',
 'PR_EXT',
 'CHEBI_EXT',
 'PATO_SO_EXT',
 'CHEBI_CHMO_EXT',
 'CL_EXT',
 'GO_PATO_UBERON_EXT',
 'MOP',
 'GO_PATO_EXT',
 'GO_RO_EXT',
 'OBI_SO_EXT',
 'GO_EXT',
 'CHEBI_PR_EXT',
 'CHEBI_SO_EXT',
 'CHEBI_MOP_EXT',
 'CL_UBERON_EXT',
 'PATO_UBERON_EXT',
 'NCBITaxon_EXT',
 'CHEBI_UBERON_EXT',
 'UBERON',
 'CHMO_UBERON_EXT',
 'GO_SO_EXT',
 'NCBITaxon',
 'SO_EXT',
 'GO_PR_EXT',
 'PR',
 'CL_GO_EXT',
 'UBERON_EXT',
 'GO_PATO_RO_EXT',
 'CL',
 'CHEBI_GO_PR_EXT',
 'CHEBI',
 'CHEMINF_GO_EXT',
 'NCBITaxon_UBERON_EXT',
 'GO_MOP_EXT',
 'GO']

### 瑞金医院数据集

In [3]:
# Root directory of the annotated corpus (presumably brat-format files — confirm).
data_dir = 'brat/'

# Entity name -> label id. Ids start at 1, so id 0 is not assigned to any entity.
ent2idx = {entity: idx for idx, entity in enumerate(ENTITIES, start=1)}
# Reverse lookup: label id -> entity name.
idx2ent = {idx: entity for entity, idx in ent2idx.items()}

In [4]:
# Build the document collection from data_dir (loading logic lives in data_utils).
docs = Documents(data_dir=data_dir)
# One shuffled split; an integer test_size holds out exactly 20 documents.
# The fixed random_state keeps the split repeatable across runs.
rs = ShuffleSplit(n_splits=1, test_size=20, random_state=2018)
train_doc_ids, test_doc_ids = next(rs.split(docs))
# NOTE(review): assumes Documents supports indexing with an array of positions — confirm in data_utils.
train_docs, test_docs = docs[train_doc_ids], docs[test_doc_ids]

In [5]:
# Sanity check: display the first training document (the saved output is the object's repr).
train_docs[0]

<data_utils.data_utils.Document at 0x1ef022ab710>

In [15]:
# Number of label classes: the largest entity id plus one. ent2idx ids start at 1,
# so id 0 is an extra class (presumably the "no entity" tag — confirm in data_utils).
num_cates = max(ent2idx.values()) + 1
sent_len = 64
# Requested vocabulary size; replaced at the end of this cell by the actual size.
vocab_size = 3000
emb_size = 100
sent_pad = 10
# NOTE(review): window_size / pad_size semantics are defined in data_utils; the model
# cell later uses sent_len + 2 * sent_pad as the sequence length.
sent_extrator = SentenceExtractor(window_size=sent_len, pad_size=sent_pad)
train_sents = sent_extrator(train_docs)
test_sents = sent_extrator(test_docs)
train_data = Dataset(train_sents, cate2idx=ent2idx)
train_data.build_vocab_dict(vocab_size=vocab_size)
# The test set reuses the vocabulary built from the training set.
test_data = Dataset(test_sents, word2idx=train_data.word2idx, cate2idx=ent2idx)
# From here on vocab_size is the real number of entries in the training vocabulary.
vocab_size = len(train_data.word2idx)

In [16]:
# One training "sentence" per document: the document text split into single characters.
# Note that this iterates over all of docs, not only the training split.
w2v_train_sents = [list(doc.text) for doc in docs]

w2v_model = Word2Vec(w2v_train_sents, size=emb_size)

# Embedding matrix indexed by vocabulary id; rows stay all-zero for vocabulary
# entries that the word2vec model has no vector for.
w2v_embeddings = np.zeros((vocab_size, emb_size))
for char, char_idx in train_data.word2idx.items():
    if char not in w2v_model.wv:
        continue
    w2v_embeddings[char_idx] = w2v_model.wv[char]

In [17]:
# Model input length: the sentence window plus sent_pad on each side
# (presumably the padding added by SentenceExtractor — confirm in data_utils).
seq_len = sent_len + 2 * sent_pad
# Feed the word2vec matrix in as a frozen embedding layer. 'emb_size' is tied to the
# shared emb_size setting so it cannot drift from the width of w2v_embeddings.
model = build_lstm_crf_model(num_cates, seq_len=seq_len, vocab_size=vocab_size,
                             model_opts={'emb_matrix': w2v_embeddings,
                                         'emb_size': emb_size,
                                         'emb_trainable': False})
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 84)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 84, 100)           13300     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 84, 512)           731136    
_________________________________________________________________
crf_1 (CRF)                  (None, 84, 13)            6864      
Total params: 751,300
Trainable params: 738,000
Non-trainable params: 13,300
_________________________________________________________________


In [18]:
# Pull the whole training set out of the Dataset by slicing, then report the shapes.
# NOTE(review): the saved output of this cell is `KeyError: 'GO_0007416'`. That key is
# not among the ENTITIES displayed earlier, so presumably the data contains labels that
# ent2idx does not cover — investigate before training.
train_X, train_y = train_data[:]
print('train_X.shape', train_X.shape)
print('train_y.shape', train_y.shape)

KeyError: 'GO_0007416'

### 训练模型

In [8]:
# Train for 10 epochs with mini-batches of 64 samples.
# NOTE(review): the saved output is `NameError: name 'model' is not defined`, and this
# cell's execution count (8) is lower than the model-building cell's (17). The cells
# were run out of order — restart the kernel and run top to bottom.
model.fit(train_X, train_y, batch_size=64, epochs=10)

NameError: name 'model' is not defined

### 测试结果

In [10]:
# Only the inputs are needed for inference; the second element of the slice
# (the counterpart of train_y) is discarded.
test_X, _ = test_data[:]
preds = model.predict(test_X, batch_size=64, verbose=True)
# NOTE(review): make_predictions is defined in data_utils; presumably it converts the
# per-position predictions back into document-level annotations keyed by doc id — confirm.
pred_docs = make_predictions(preds, test_data, sent_pad, docs, idx2ent)



In [11]:
# Score the predictions against the held-out documents. The result is unpacked
# in the order (f_score, precision, recall).
# NOTE(review): the training cell's saved output is a NameError, so the saved scores
# below presumably come from an earlier kernel session — re-run to confirm.
f_score, precision, recall = Evaluator.f1_score(test_docs, pred_docs)
print('f_score: ', f_score)
print('precision: ', precision)
print('recall: ', recall)

f_score:  0.7672501873169403
precision:  0.7627302275189599
recall:  0.7718240372755927


In [12]:
# Take the first document id present in pred_docs and display the test-set document for it.
sample_doc_id = list(pred_docs.keys())[0]
test_docs[sample_doc_id]

In [13]:
# Display the predicted document for the same id, for comparison with the test-set document.
pred_docs[sample_doc_id]

In [5]:
# Load the first section of a config file as two parallel lists.
def txt_strtonum_feed1(filename):
    """Parse the leading section of a config file into two parallel lists.

    The first line of the file is always skipped. Every following non-blank
    line is split into fields on the literal three-character separator
    "space, tab, space". Parsing stops at end of file or at the first line
    whose first field is ``[labels]``.

    Blank and whitespace-only lines are ignored instead of raising.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A tuple ``(datax, datay)`` of equal-length lists. ``datax`` holds the
        first field of each parsed line; ``datay`` holds the text between the
        first and second ``:`` of the second field.

    Raises:
        IndexError: If a non-blank line has no second field, or its second
            field contains no ``:``.
    """
    datax = []
    datay = []
    with open(filename, 'r', encoding='UTF-8') as f:
        # Skip the first line; the default keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank separator lines carry no entry.
                continue
            fields = line.split(' \t ')
            if fields[0] == "[labels]":
                # Start of the next section: everything after it is out of scope.
                break
            datax.append(fields[0])
            # Keep only the value part of "key:value".
            datay.append(fields[1].split(":")[1])
    return datax, datay

In [6]:
# Parse visual.conf into two parallel lists: the first field of each entry, and the
# value after ':' in its second field (colour codes, judging by the saved output).
data_setx1,data_sety1 = txt_strtonum_feed1("visual.conf")


In [7]:
# Preview the first 12 entries of data_setx1, each wrapped in double quotes and
# followed by a comma. Slicing copes with lists shorter than 12.
string = "".join('"' + name + '",' for name in data_setx1[:12])
print(string)

"NCBITaxon_7729","GO_0048665","MOP_0000789","PR_Q64028","UBERON_0006875","CHEBI_9574","NCBITaxon_7735","UBERON_0004208","UBERON_0004209","CHEBI_9561","GO_0002437","GO_0050975",


[]

In [9]:
# Preview the first 12 entries of data_sety1, each wrapped in double quotes and
# followed by a comma. Slicing copes with lists shorter than 12.
string = "".join('"' + value + '",' for value in data_sety1[:12])
print(string)

"#ffff00","#00ffff","#deb887","#00ffff","#5f9ea0","#32cd32","#ffff00","#5f9ea0","#5f9ea0","#32cd32","#00ffff","#00ffff",
