In [1]:
import pandas as pd
import math
import numpy as np
import torch.nn.functional as F

In [None]:
!pip install transformers

In [None]:
!pip install seqeval

In [4]:
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, AutoModelWithLMHead
from transformers import BertForTokenClassification, AdamW
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score

Using TensorFlow backend.


Load data

In [5]:
df_data = pd.read_csv('/content/drive/My Drive/data/ner_dataset.csv')

In [6]:
df_data.head(5)
df_data = df_data[:10000]

Combine words that in the same sentence *together*

In [7]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [8]:
getter = SentenceGetter(df_data)
sentences = [[s[0] for s in sent] for sent in getter.sentences]
labels = [[s[2] for s in sent] for sent in getter.sentences]

In [9]:
df_data.Tag.value_counts()

O        8483
B-gpe     303
B-geo     244
I-per     206
B-org     176
B-per     160
B-tim     149
I-org     140
I-geo      31
B-art      28
I-gpe      20
I-art      20
I-tim      13
B-eve      10
I-eve      10
B-nat       5
I-nat       2
Name: Tag, dtype: int64

In [10]:
print(sentences[0])
print(labels[0])

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


Tokenize words, create mask id, segment id

In [11]:
vocab = '/content/drive/My Drive/data/cased_L-12_H-768_A-12/vocab.txt'
tokenizer=BertTokenizer(vocab_file=vocab,do_lower_case=False)

In [12]:
txt_token = []
token_label = [] # sentences num * word in sentence
for sentence, label1 in zip(sentences, labels):
  word_list, label_list = ['[CLS]'], ['[CLS]']
  for word, label in zip(sentence, label1):
    token_list=tokenizer.tokenize(word)
    for i, j in enumerate(token_list):
      word_list.append(j)
      if i == 0:
        label_list.append(label)
      else:
        label_list.append('X') #explore
  word_list.append('[SEP]')
  label_list.append('[SEP]')
  txt_token.append(word_list)
  token_label.append(label_list)       

print(txt_token[0])
print(token_label[0])

['[CLS]', 'Thousands', 'of', 'demons', '##tra', '##tors', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.', '[SEP]']
['[CLS]', 'O', 'O', 'O', 'X', 'X', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', '[SEP]']


Set token embedding, pad or trim the sequence to the max length

In [13]:
max_len = np.median([len(x) for x in txt_token]) 
max_len= int(1.5*max_len)
print(max_len)

40


In [14]:
input_ids = []
for i in txt_token:
  temp = tokenizer.convert_tokens_to_ids(i)
  #print(temp)
  input_ids.append(temp)
input_ids = pad_sequences(input_ids, maxlen=max_len,dtype='long',padding='post', truncating='post')
print(input_ids[0])

[  101 26159  1104  8568  4487  5067  1138  9639  1194  1498  1106  5641
  1103  1594  1107  5008  1105  4555  1103 10602  1104  1418  2830  1121
  1115  1583   119   102     0     0     0     0     0     0     0     0
     0     0     0     0]


In [15]:
# create mask id
mask = [[0 if j==0 else 1 for j in i] for i in input_ids]
# create segment id
segment_id = np.zeros(input_ids.shape)

Convert tags to labels

In [16]:
# make convert dictionary
tag2id = {}
j=1
for i in df_data['Tag'].unique():
  tag2id[i]=j
  j+=1
tag2id['X']=17
tag2id['[CLS]']=18
tag2id['[SEP]']=19
# tag2id

In [17]:
output_ids=[]
for i in token_label:
  temp=[]
  for j in i:
    temp.append(tag2id[j])
  output_ids.append(temp)
output_ids = pad_sequences(output_ids, maxlen=max_len,dtype='long',padding='post', truncating='post')
print(output_ids[0])

[18  1  1  1 17 17  1  1  1  2  1  1  1  1  1  2  1  1  1  1  1  3  1  1
  1  1  1 19  0  0  0  0  0  0  0  0  0  0  0  0]


Convert labels back to tags

In [18]:
# make id to tag dictionary
id2tag = {tag2id[key]:key for key in tag2id.keys()}

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

Split data into training and validation 80% for train 20% for validation

In [20]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = \
train_test_split(input_ids, output_ids, mask,segment_id, \
                 random_state=4, test_size=0.2)


Set data into tensor

In [21]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

# put data into data loader

In [22]:
batch_size = 10
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size, drop_last=True)

Train model

In [23]:
model = BertForTokenClassification.from_pretrained('/content/drive/My Drive/data/bert_model', num_labels=20)

In [None]:
model.cuda()

In [25]:
epochs = 5
max_grad_norm = 1.0
num_train_optimization_steps = int( len(tr_inputs) / batch_size) * epochs
num_train_optimization_steps = 10

Set pre-trained Bert model for word embedding

In [26]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

In [27]:
model.train()
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_size))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs, desc='Epoch'):
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # forward pass
    outputs = model(b_input_ids, token_type_ids = None, \
                    attention_mask = b_input_mask, labels=b_labels)
    loss, scores = outputs[:2]

    if n_gpu>1:
      loss = loss.mean()

    # backward pass
    loss.backward()

    # track train loss
    tr_loss += loss.item()
    nb_tr_examples += batch_size
    nb_tr_steps += 1

    # gradient clipping
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
    
    # update parameters
    optimizer.step()
    optimizer.zero_grad()
  # print train loss per epoch
  print('Train loss: {}'.format(tr_loss/nb_tr_steps))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 365
  Batch size = 10
  Num steps = 10


Epoch:  20%|██        | 1/5 [00:02<00:10,  2.63s/it]

Train loss: 1.208073819677035


Epoch:  40%|████      | 2/5 [00:05<00:07,  2.62s/it]

Train loss: 0.34124322090711856


Epoch:  60%|██████    | 3/5 [00:07<00:05,  2.62s/it]

Train loss: 0.1312011725579699


Epoch:  80%|████████  | 4/5 [00:10<00:02,  2.61s/it]

Train loss: 0.06790051609277725


Epoch: 100%|██████████| 5/5 [00:13<00:00,  2.60s/it]

Train loss: 0.03716581019883355





Eval model

In [28]:
bert_out_address = '/content/saved model'
model_to_save = model
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

('/content/saved model/vocab.txt',)

In [29]:
model = BertForTokenClassification.from_pretrained(bert_out_address,num_labels=20)


In [None]:
model.cuda()
model.eval()


In [31]:
eval_loss, eval_accuracy = 0, 0
nb_eval_step, nb_eval_example = 0, 0
y_true, y_pred = [], []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_size))

for step, batch in enumerate(valid_dataloader):
  batch = tuple(t.to(device) for t in batch)
  input_ids, input_mask, label_ids = batch
  with torch.no_grad():
    outputs = model(input_ids, token_type_ids = None,
                    attention_mask = input_mask,)
    logits = outputs[0]
    logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    for i, mask in enumerate(input_mask):
      temp_1 = []
      temp_2 = []
      for j, m in enumerate(mask):
        # # print(label_ids[i,j])
        if m:
          if id2tag[label_ids[i][j]] != 'X' and id2tag[label_ids[i][j]] != '[SEP]':
            if id2tag[label_ids[i][j]] != '[CLS]':
              temp_1.append(id2tag[label_ids[i][j]])
              temp_2.append(id2tag[logits[i][j]])
          else:
              break
      y_true.append(temp_1)
      y_pred.append(temp_2)

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
report = classification_report(y_true, y_pred)
print(report)

***** Running evaluation *****
  Num examples =92
  Batch size = 10
f1 socre: 0.909871
Accuracy score: 0.986592
           precision    recall  f1-score   support

      per       0.93      0.88      0.90        16
      org       0.74      0.82      0.78        17
      geo       0.97      0.91      0.94        34
      gpe       0.91      0.97      0.94        33
      tim       1.00      1.00      1.00        15
      art       0.00      0.00      0.00         1

micro avg       0.91      0.91      0.91       116
macro avg       0.91      0.91      0.91       116

