In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F

In [3]:
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

Using TensorFlow backend.
I0120 17:01:20.458991 104920 file_utils.py:35] PyTorch version 1.4.0+cpu available.


In [None]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'

This notebook work with env:

- Keras                2.3.1                 
- torch                1.1.0                 
- transformers         2.2.0      

# Introduction

In this notebook, will introduce how to do NER with BERT, including:

- Load and preprocess data
- Parser data
- Make training data
- Train model
- Evaluate result
- Predict result

Tips:

- Update to transformer==2.2.0
- When come across OOV,you will find that BERT word piece tokenize method can help a lot
- Case model will be litter better than uncase model for English

**Also this notebook come with a post [NER with BERT in Action](https://medium.com/@yingbiao/ner-with-bert-in-action-936ff275bc73)**<br>
**Feel free to check it, hope that it could help you.**

## Load data

**Load CSV data**

In [None]:
# data_path = "data/" 

In [4]:
data_file_address = r"C:\Piyush\NER\BERT\Affinity Waters Data\aw_dataframe.csv"

In [5]:
# Fillna method can make same sentence with same sentence name
df_data = pd.read_csv(data_file_address,sep=",",encoding="latin1").fillna(method='ffill')

In [6]:
df_data.columns

Index(['sentence', 'text', 'tag', 'pos'], dtype='object')

In [7]:
df_data1 = df_data.head(20)

**Have a look POS cat**

In [8]:
df_data.pos.unique()

array(['NN', 'CD', 'NNP', 'DT', 'JJ', 'VB', 'VBN', 'IN', 'PRP', 'PRP$',
       'RB', 'NNS', 'TO', 'VBG', 'VBP', 'CC', 'MD', 'WDT', 'VBZ', 'VBD',
       'WP', 'LS', 'JJS', 'EX', 'WRB', 'RBR', 'SYM', 'FW', 'WP$', 'JJR',
       'POS'], dtype=object)

**Have a look TAG cat**

In [9]:
df_data.tag.unique()

array(['others', 'new_address', 'new_men', 'new_tenant', 'move_in_date',
       'current_address', 'previous_tenant', 'previous_men',
       'move_out_date', 'meter_men', 'meter_reading_val',
       'forwarding_men', 'forwarding_address', 'current_men',
       'meter_reading_date', 'contact_men', 'contact_val', 'out_men',
       'prev_address'], dtype=object)

In [10]:
# Analyse summary of data
df_data['sentence'].nunique(), df_data.text.nunique(), df_data.pos.nunique(), df_data.tag.nunique()

(534, 8338, 31, 19)

In [11]:
# Analyse the Tag distribution
df_data.tag.value_counts()

others                35992
current_address        4099
forwarding_address     1681
new_tenant             1661
previous_tenant        1023
move_in_date            940
move_out_date           547
new_address             383
new_men                 313
meter_men               299
prev_address            173
out_men                 160
forwarding_men          149
meter_reading_val       139
previous_men            123
contact_val             116
meter_reading_date       71
current_men              70
contact_men              54
Name: tag, dtype: int64

### Explain tag
As show above, there are two parts for the tag name: "position"-"meaning"
- B: begin, word at the first  position
- I: middle, word not at the first position,especially for phase
- time: time, meaning time
- per: person, meaning people name
- geo: geography, meaning location name
- O: mean other, set as a default tag
<br>......

## Parser data

**Parser data into document structure**

In [12]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["text"].values.tolist(),
                                                           s["pos"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [13]:
# Get full document data struce
getter = SentenceGetter(df_data)

In [14]:
# Get sentence data
sentences = [[s[0] for s in sent] for sent in getter.sentences]
sentences[0]

['Dear',
 'Sir/Madam',
 'Re:',
 '6',
 'Field',
 'view',
 'court,',
 'Kingsbury,',
 'OX0',
 '0TE.',
 'The',
 'new',
 'tenant,',
 'Ms.',
 'Naximilian',
 'Clajut,',
 'have',
 'moved',
 'into',
 'the',
 'above',
 'property',
 'on',
 'the',
 '10"',
 'October',
 '2017.',
 'Herewith',
 'we',
 'have',
 'attached',
 'the',
 'Tenancy',
 'Agreement',
 'for',
 'your',
 'consideration.',
 'Kindly',
 'amend',
 'your',
 'records',
 'accordingly.',
 'For',
 'any',
 'further',
 'information',
 'please',
 'do',
 'not',
 'hesitate',
 'to',
 'contact',
 'us.',
 'Thank',
 'you.',
 'Regards,',
 'HUB',
 '1',
 '%-',
 '//j///////',
 '2%',
 'APT',
 'anie']

In [15]:
# Get pos data
poses = [[s[1] for s in sent] for sent in getter.sentences]
print(poses[0])

['NN', 'NN', 'NN', 'CD', 'NN', 'NN', 'NN', 'NNP', 'NN', 'CD', 'DT', 'JJ', 'NN', 'NNP', 'JJ', 'NN', 'VB', 'VBN', 'IN', 'DT', 'IN', 'NN', 'IN', 'DT', 'CD', 'NNP', 'CD', 'NN', 'PRP', 'VB', 'VBN', 'DT', 'NN', 'NN', 'IN', 'PRP$', 'NN', 'RB', 'NN', 'PRP$', 'NNS', 'NN', 'IN', 'DT', 'RB', 'NN', 'NN', 'VB', 'RB', 'NN', 'TO', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'CD', 'NN', 'NN', 'CD', 'NN', 'NN']


In [16]:
# Get tag labels data
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['others', 'others', 'others', 'new_address', 'new_address', 'new_address', 'new_address', 'new_address', 'new_address', 'new_address', 'others', 'new_men', 'others', 'new_tenant', 'new_tenant', 'new_tenant', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'move_in_date', 'move_in_date', 'move_in_date', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others', 'others']


**Make TAG name into index for training**

In [17]:
tags_vals = list(set(df_data["tag"].values))

In [18]:
# Add X  label for word piece support
# Add [CLS] and [SEP] as BERT need
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')

In [19]:
tags_vals = set(tags_vals)

In [20]:
tags_vals

{'X',
 '[CLS]',
 '[SEP]',
 'contact_men',
 'contact_val',
 'current_address',
 'current_men',
 'forwarding_address',
 'forwarding_men',
 'meter_men',
 'meter_reading_date',
 'meter_reading_val',
 'move_in_date',
 'move_out_date',
 'new_address',
 'new_men',
 'new_tenant',
 'others',
 'out_men',
 'prev_address',
 'previous_men',
 'previous_tenant'}

In [21]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
tag2idx={'out_men': 14,
 'previous_men': 16,
 'meter_men': 0,
 'new_tenant': 13,
 'new_men': 12,
 'move_out_date': 10,
 'current_men': 4,
 'contact_val': 2,
 'forwarding_address': 5,
 'forwarding_men':21,
 'meter_reading_val': 7,
 'prev_address': 15,
 'move_in_date': 8,
 'new_address': 11,
 'current_address': 3,
 'meter_reading_date': 6,
 'contact_men': 1,
 'previous_tenant': 20,
 'X':17,
 'others': 9,
 '[CLS]':18,
 '[SEP]':19}

In [22]:
tag2idx

{'out_men': 14,
 'previous_men': 16,
 'meter_men': 0,
 'new_tenant': 13,
 'new_men': 12,
 'move_out_date': 10,
 'current_men': 4,
 'contact_val': 2,
 'forwarding_address': 5,
 'forwarding_men': 21,
 'meter_reading_val': 7,
 'prev_address': 15,
 'move_in_date': 8,
 'new_address': 11,
 'current_address': 3,
 'meter_reading_date': 6,
 'contact_men': 1,
 'previous_tenant': 20,
 'X': 17,
 'others': 9,
 '[CLS]': 18,
 '[SEP]': 19}

In [23]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [24]:
tag2name

{14: 'out_men',
 16: 'previous_men',
 0: 'meter_men',
 13: 'new_tenant',
 12: 'new_men',
 10: 'move_out_date',
 4: 'current_men',
 2: 'contact_val',
 5: 'forwarding_address',
 21: 'forwarding_men',
 7: 'meter_reading_val',
 15: 'prev_address',
 8: 'move_in_date',
 11: 'new_address',
 3: 'current_address',
 6: 'meter_reading_date',
 1: 'contact_men',
 20: 'previous_tenant',
 17: 'X',
 9: 'others',
 18: '[CLS]',
 19: '[SEP]'}

## Make training data

Make raw data into trainable data for BERT, including:

- Set gpu environment
- Load tokenizer and tokenize
- Set 3 embedding, token embedding, mask word embedding, segmentation embedding
- Split data set into train and validate, then send them to dataloader

**Set up gpu environment**

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [26]:
n_gpu

0

### Load tokenizer

You can download the tokenizer file into local folder first :
- [vocab.txt](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt)

In [27]:
# Manual define vocabulary address, if you download the tokenzier file in local
# vocab.txt, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt
vocabulary = r"C:\Piyush\NER\BERT\cased_L-12_H-768_A-12\vocab.txt"

In [28]:
# Len of the sentence must be not bigger than the training model
# See model's 'max_position_embeddings' = 512
max_len  = 45

In [29]:
# load tokenizer, with manual file address or pretrained address
tokenizer=BertTokenizer(vocab_file=vocabulary,do_lower_case=False)

**Tokenizer text**

- In hunggieface for bert, when come across OOV, will word piece the word
- We need to adjust the labels base on the tokenize result, “##abc” need to set label "X" 
- Need to set "[CLS]" at front and "[SEP]" at the end, as what the paper do, [BERT indexer should add [CLS] and [SEP] tokens](https://github.com/allenai/allennlp/issues/2141)


In [30]:
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list,label in (zip(sentences,labels)):
    temp_lable = []
    temp_token = []
    
    # Add [CLS] at the front 
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')
    
    for word,lab in zip(word_list,label):
        token_list = tokenizer.tokenize(word)
        for m,token in enumerate(token_list):
            temp_token.append(token)
            if m==0:
                temp_lable.append(lab)    
                
            else:
                if token.startswith('##'):
                    temp_lable.append('X')
                else:
                    temp_lable.append(lab)
                
    # Add [SEP] at the end
    temp_lable.append('[SEP]')
    temp_token.append('[SEP]')
    
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_lable)
    
    if 5 > i_inc:
        print("No.%d,len:%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d,len:%d"%(i_inc,len(temp_lable)))
        print("lables:%s"%(" ".join(temp_lable)))
    i_inc +=1


No.0,len:112
texts:[CLS] Dear Sir / Mad ##am Re : 6 Field view court , Kings ##bury , O ##X ##0 0 ##TE . The new tenant , Ms . Na ##xi ##mi ##lian C ##la ##ju ##t , have moved into the above property on the 10 " October 2017 . Here ##with we have attached the Ten ##ancy Agreement for your consideration . Kind ##ly am ##end your records accordingly . For any further information please do not hesitate to contact us . Thank you . Reg ##ards , H ##U ##B 1 % - / / j / / / / / / / 2 % AP ##T an ##ie [SEP]
No.0,len:112
lables:[CLS] others others others others X others others new_address new_address new_address new_address new_address new_address X new_address new_address X X new_address X new_address others new_men others others new_tenant new_tenant new_tenant X X X new_tenant X X X new_tenant others others others others others others others others move_in_date move_in_date move_in_date move_in_date move_in_date others X others others others others others X others others others others others

### Set token embedding

Pad or trim the text and label to fit the need for max len

In [73]:
# Make text token into id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])

[  101 12956  2203   120 10779  2312 11336   131   127  3479  2458  2175
   117  6560  4109   117   152  3190  1568   121 12880   119  1109  1207
 19197   117  6980   119 11896  8745  3080 15647   140  1742  9380  1204
   117  1138  1427  1154  1103  1807  2400  1113  1103]


In [74]:
# Make label into id, pad with "O" meaning others
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len, value=tag2idx["others"], padding="post",
                     dtype="long", truncating="post")
print(tags[0])

[18  9  9  9  9 17  9  9 11 11 11 11 11 11 17 11 11 17 17 11 17 11  9 12
  9  9 13 13 13 17 17 17 13 17 17 17 13  9  9  9  9  9  9  9  9]


### Set mask word embedding

In [75]:
# For fine tune of predict, with token mask is 1,pad token is 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
attention_masks[0];

### Set segment embedding(Seem like for sequence tagging task, it's not necessary to make this embedding)

In [76]:
# Since only one sentence, all the segment set to 0
segment_ids = [[0] * len(input_id) for input_id in input_ids]
segment_ids[0];

## Split data into train and validate

70% for training, 30% for validation

**Split all data**

In [77]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(input_ids, tags,attention_masks,segment_ids, 
                                                            random_state=4, test_size=0.3)

In [78]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(373, 161, 373, 161)

**Set data into tensor**

Not recommend tensor.to(device) at this process, since it will run out of GPU memory

In [93]:
tr_inputs = torch.tensor(tr_inputs).to(torch.int64)
val_inputs = torch.tensor(val_inputs).to(torch.int64)
tr_tags = torch.tensor(tr_tags).to(torch.int64)
val_tags = torch.tensor(val_tags).to(torch.int64)
tr_masks = torch.tensor(tr_masks).to(torch.int64)
val_masks = torch.tensor(val_masks).to(torch.int64)
tr_segs = torch.tensor(tr_segs).to(torch.int64)
val_segs = torch.tensor(val_segs).to(torch.int64)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  


**Put data into data loader**

In [94]:
# Set batch num
batch_num = 32

In [95]:
# Only set token embedding, attention embedding, no segment embedding
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## Train model

- You can download the model require files into local folder first
- pytorch_model.bin: [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin)
- config.json: [config.json](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json)    

**Load BERT model**

In [96]:
# In this folder, contain model confg(json) and model weight(bin) files
# pytorch_model.bin, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin
# config.json, downlaod from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json
model_file_address = 'models/bert-base-cased/'

In [97]:
# Will load config and weight with from_pretrained()
model = BertForTokenClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))

I0120 17:10:51.770915 104920 configuration_utils.py:182] loading configuration file models/bert-base-cased/config.json
I0120 17:10:51.773586 104920 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 22,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

I0120 17:10:51.777578 104920 modeling_utils.py:403] loading weights file models/bert-base-cased/pytorch_model.bin
I0120 17:10:5

In [98]:
model;

In [99]:
# Set model to GPU,if you are using GPU machine
# model.cuda();

In [100]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [122]:
# Set epoch and grad max num
epochs = 10
max_grad_norm = 1.0

In [102]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

### Set fine tuning method

**Manual optimizer**

In [103]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
FULL_FINETUNING = True

In [104]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

### Fine-tuning model

In [105]:
# TRAIN loop
model.train();

In [123]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
        attention_mask=b_input_mask, labels=b_labels)
        loss, scores = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
        

***** Running training *****
  Num examples = 373
  Batch size = 32
  Num steps = 60


Epoch:   0%|                                                                                    | 0/10 [00:00<?, ?it/s]

Train loss: 0.17857461016286502


Epoch:  10%|███████▌                                                                   | 1/10 [01:44<15:36, 104.05s/it]

Train loss: 0.18082312562248923


Epoch:  20%|███████████████▏                                                            | 2/10 [03:00<12:45, 95.70s/it]

Train loss: 0.18336716091090982


Epoch:  30%|██████████████████████▊                                                     | 3/10 [04:21<10:38, 91.28s/it]

Train loss: 0.18060225248336792


Epoch:  40%|██████████████████████████████▍                                             | 4/10 [05:39<08:43, 87.33s/it]

Train loss: 0.18200187520547348


Epoch:  50%|██████████████████████████████████████                                      | 5/10 [06:54<06:57, 83.56s/it]

Train loss: 0.1870760037140413


Epoch:  60%|█████████████████████████████████████████████▌                              | 6/10 [08:07<05:22, 80.58s/it]

Train loss: 0.1801903315565803


Epoch:  70%|█████████████████████████████████████████████████████▏                      | 7/10 [09:21<03:55, 78.65s/it]

Train loss: 0.1768119355494326


Epoch:  80%|████████████████████████████████████████████████████████████▊               | 8/10 [10:36<02:34, 77.39s/it]

Train loss: 0.18117023868994278


Epoch:  90%|████████████████████████████████████████████████████████████████████▍       | 9/10 [11:47<01:15, 75.66s/it]

Train loss: 0.174408227882602


Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [12:58<00:00, 74.06s/it]


## Save model 

In [124]:
bert_out_address = 'models/bert_out_model/en09'

In [125]:
# Make dir if not exits
if not os.path.exists(bert_out_address):
        os.makedirs(bert_out_address)

In [126]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [127]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")

In [128]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

('models/bert_out_model/en09\\vocab.txt',)

## Load model

In [129]:
model = BertForTokenClassification.from_pretrained(bert_out_address,num_labels=len(tag2idx))

I0120 18:18:34.420526 104920 configuration_utils.py:182] loading configuration file models/bert_out_model/en09\config.json
I0120 18:18:34.434521 104920 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 22,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

I0120 18:18:34.440505 104920 modeling_utils.py:403] loading weights file models/bert_out_model/en09\pytorch_model.bin


In [130]:
# Set model to GPU
# model.cuda();

In [131]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## Eval model

In [132]:
# Evalue loop
model.eval();

In [133]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    
#     if step > 2:
#         break
    
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 
    
    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    
    
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    
    
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)

        

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)

# Save the report into file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("f1 socre: %f"%(f1_score(y_true, y_pred)))
    print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
    
    writer.write("f1 socre:\n")
    writer.write(str(f1_score(y_true, y_pred)))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy_score(y_true, y_pred)))
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =161
  Batch size = 32
f1 socre: 0.537743
Accuracy score: 0.895138
***** Eval results *****

                    precision    recall  f1-score   support

            others     0.6166    0.6053    0.6109       380
   current_address     0.7125    0.8201    0.7625       139
        new_tenant     0.1194    0.2581    0.1633        31
           out_men     0.0000    0.0000    0.0000        10
      previous_men     0.0000    0.0000    0.0000        10
      move_in_date     0.2105    0.3478    0.2623        23
           new_men     0.5000    0.3438    0.4074        32
         meter_men     0.0000    0.0000    0.0000         6
       new_address     0.0000    0.0000    0.0000         5
   previous_tenant     0.0909    0.1000    0.0952        30
     move_out_date     0.0000    0.0000    0.0000        10
      prev_address     0.0000    0.0000    0.0000         5
       current_men     0.0000    0.0000    0.0000         6
 meter_reading_val  