# NER Predict with BertForTokenClassification - [HAILab-PUCPR](https://github.com/HAILab-PUCPR)

In [1]:
import torch
from transformers import BertTokenizer,AutoTokenizer
import numpy as np
import json

In [27]:
def predictBERTNER(sentencas,MODEL_DIR):
    """Predict one NER tag per token for each sentence in ``sentencas``.

    The model, the tokenizer and the index-to-tag mapping are all loaded from
    ``MODEL_DIR`` on every call, and inference runs on the CPU one sentence
    at a time.

    Args:
        sentencas: iterable of sentences. Each one is handed unchanged to
            ``tokenizer.encode`` (the notebook passes lists of lower-cased
            word tokens).
        MODEL_DIR: directory containing the pickled ``torch_model`` file,
            ``idx2tag.json`` and the tokenizer files.

    Returns:
        A list with one list of tag strings per input sentence. The first and
        last predictions (the [CLS] and [SEP] positions) are dropped and the
        "X" tag is reported as "O".
    """
    # NOTE(review): torch.load unpickles a complete model object, which can run
    # arbitrary code -- only load model files that come from a trusted source.
    model = torch.load(MODEL_DIR + '/torch_model',map_location=torch.device('cpu'))
    # Switch to inference mode so dropout is disabled; without this the
    # predictions can vary between runs if the model was saved in training mode.
    model.eval()
    tokenizer = BertTokenizer.from_pretrained(MODEL_DIR, do_lower_case=True) # lower or not, this is important

    with open(MODEL_DIR + '/idx2tag.json', 'r', encoding='utf-8') as filehandle:
        idx2tag = json.load(filehandle)

    predictedModel = []

    for test_sentence in sentencas:
        tokenized_sentence = tokenizer.encode(test_sentence)
        input_ids = torch.tensor([tokenized_sentence])

        with torch.no_grad():
            output = model(input_ids)
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

        tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

        # A word split into several word pieces keeps the label of its first
        # piece; "##" continuation pieces contribute no label of their own.
        word_labels = [
            idx2tag[str(label_idx)]  # the JSON mapping is keyed by strings
            for token, label_idx in zip(tokens, label_indices[0])
            if not token.startswith("##")
        ]

        # "X" is not reported to the caller; it is folded into "O".
        FinalLabelSentence = ["O" if label == "X" else label for label in word_labels]

        predictedModel.append(FinalLabelSentence[1:-1]) # delete [SEP] and [CLS]

    return predictedModel

In [17]:
import nltk
from nltk import tokenize

# The model only accepts lower-case input, so both samples are lower-cased up front.
test_sentence1 = "Paciente com Sepse pulmonar em D8 tazocin (paciente não recebeu por 2 dias Atb).".lower()
test_sentence2 = "Acesso venoso central em subclavia D duplolumen recebendo solução salina e glicosada em BI.".lower()

# Split every sample into word-level tokens with NLTK's Portuguese tokenizer.
test_sentence_tokenized = [
    tokenize.word_tokenize(sample, language='portuguese')
    for sample in (test_sentence1, test_sentence2)
]
print(test_sentence_tokenized)

[['paciente', 'com', 'sepse', 'pulmonar', 'em', 'd8', 'tazocin', '(', 'paciente', 'não', 'recebeu', 'por', '2', 'dias', 'atb', ')', '.'], ['acesso', 'venoso', 'central', 'em', 'subclavia', 'd', 'duplolumen', 'recebendo', 'solução', 'salina', 'e', 'glicosada', 'em', 'bi', '.']]


Please download the [NER model](https://github.com/HAILab-PUCPR/BioBERTpt/tree/master/model) and unzip it in this notebook's directory, so that the `biobert-all-clinpt` folder sits next to the notebook.

In [28]:
# Folder, relative to the working directory, that holds the unzipped NER model files.
MODEL_DIR = "biobert-all-clinpt"
# Tag every pre-tokenized sample sentence; the bare name below displays the result.
tags = predictBERTNER(sentencas=test_sentence_tokenized, MODEL_DIR=MODEL_DIR)
tags

[['O',
  'O',
  'B-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'O',
  'O',
  'B-N',
  'B-THER',
  'O',
  'I-DT',
  'I-DT',
  'O',
  'O',
  'O'],
 ['B-AS',
  'B-AS',
  'O',
  'O',
  'B-AS',
  'I-R',
  'X',
  'X',
  'X',
  'X',
  'O',
  'B-AS',
  'O',
  'I-R',
  'O']]

In [23]:
# Print each token next to its predicted tag, with a blank line after every sentence.
for sentence_tokens, sentence_tags in zip(test_sentence_tokenized, tags):
    for token, tag in zip(sentence_tokens, sentence_tags):
        print(token, tag)
    print()

paciente O
com O
sepse B-C
pulmonar I-C
em I-C
d8 I-C
tazocin I-C
( O
paciente O
não B-N
recebeu B-THER
por O
2 I-DT
dias I-DT
atb O
) O
. O

acesso B-AS
venoso B-AS
central O
em O
subclavia B-AS
d I-R
duplolumen X
recebendo X
solução X
salina X
e O
glicosada B-AS
em O
bi I-R
. O



In [None]:
#Reference: 
#CH: Characterization; 
#T: Test; 
#EV: Evolution; 
#G: Genetics; 
#AS: Anatomical Site; 
#N: Negation; 
#OBS: Additional Observations; 
#C: Condition; 
#R: Results; 
#DT: DateTime; 
#THER: Therapeutics; 
#V: Value; 
#RA: Route of Administration; 
#O: Out