In [None]:
from google.colab import drive
import os
drive.mount('/content/gdirve')
!pip3 install transformers

In [26]:
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader, random_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from IPython.display import clear_output
from collections import Counter
import matplotlib.pyplot as plt
import datetime
import random
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import re

In [3]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [27]:
def setup_seed(seed):
    random.seed(seed)                          
    np.random.seed(seed)                       
    torch.manual_seed(seed)                    
    torch.cuda.manual_seed(seed)               
    torch.cuda.manual_seed_all(seed)           
    torch.backends.cudnn.deterministic = True
    
setup_seed(777)

In [28]:
#------------------------------------------------------
#------------------------------------------------------
def read_data(data_path):
  base_path = '/content/gdirve/MyDrive/Colab_Notebooks/WebIntelligence/NER/'
  sentences, tags = [], []
  sent = ''
  tag = []
  with open(base_path+data_path, 'r', encoding='utf8') as f:
    for line in f:
      if line == '\n':
        if len(sent) > 1:
          sentences.append(sent)
          tags.append(tag)
        sent = ''
        tag = []
      else:
        line = line.replace('\n', '')
        line = line.split('\t')
        sent = sent+line[0]
        if line[1] == '':
          line[1] = 'O'
        tag.append(line[1])
  return sentences, tags

In [29]:
#------------------------------------------------------
#------------------------------------------------------
def tokenization(sentences, labels):
  tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
  max_len = 128
  iids, atts, tags = [], [], []

  for s in sentences:
    result = tokenizer.encode_plus(s, add_special_tokens=True, max_length=max_len, padding='max_length'
      ,return_attention_mask=True, return_token_type_ids=False, truncation=True)
    iids.append(result['input_ids'])
    atts.append(result['attention_mask'])

  for label in labels:
    if len(label) > max_len-2:
      tag = label[0:max_len-2]
    else:
      tag = label
    tag = ['CLS'] + tag + ['SEP']
    if len(tag) < max_len:
      tag = tag+['PAD']*(max_len-len(tag))
    tags.append(tag)

  return np.asarray(iids, dtype='int32'), np.asarray(atts, dtype='int32'), tags

In [30]:
#------------------------------------------------------
# define the dataset
#------------------------------------------------------
class MyDataset(Dataset):
  def __init__(self, iids, atts, labels):
    self.iids = iids
    self.atts = atts
    self.labels = labels
    self.data_len = len(labels)

  def __getitem__(self, index):
    iid = self.iids[index]
    att = self.atts[index]
    label = torch.as_tensor(self.labels[index], dtype=torch.long)

    return {'input_ids':iid, 'attention_mask':att, 'label':label}

  def __len__(self):
    return self.data_len

In [31]:
#------------------------------------------------------
# define the model
#------------------------------------------------------
class Bert_CRF(nn.Module):
  def __init__(self, tag_num):
    super(Bert_CRF, self).__init__()
    base = 'bert-base-chinese'
    self.tag_num = tag_num
    self.bert = BertModel.from_pretrained(base)
    self.dropout = nn.Dropout(0.1)
    self.hidden2tag = nn.Linear(self.bert.config.hidden_size, self.tag_num)
  
  def forward(self, input_ids, attention_mask):
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=True
    )
    outputs = outputs['last_hidden_state']
    outputs = self.hidden2tag(outputs) # shape (sent_len, tag_num)
    return outputs

In [32]:
#------------------------------------------------------
# training configs
#------------------------------------------------------
batch_size = 16
epochs = 8
learn_rate = 5e-5

In [33]:
tags = ['B-LOC','I-LOC','B-ORG','I-ORG','B-PER','I-PER','O','CLS','SEP','PAD']
tag2id = {t: i for i, t in enumerate(tags)}
#{'B-LOC':0,'I-LOC':1,'B-ORG':2,'I-ORG':3,'B-PERSON':4,'I-PERSON':5,'O':6,'CLS':7,'SEP':7,'PAD':8}

#------------------------------------------------------
# prepare train data
#------------------------------------------------------
texts, labels = read_data('dataset/train.txt')
input_ids, att_masks, labels = tokenization(texts, labels)
y = []
for l in range(len(labels)):
  tag = []
  
  for label in labels[l]:
    tag.append(tag2id.get(label))
    if None in tag:
      print(labels[l])
      print(texts[l])
      print('=======================')
  y.append(tag)
  
y = np.asarray(y, dtype='int32')
train_set = MyDataset(input_ids, att_masks, y)
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, num_workers=2)

In [34]:
#------------------------------------------------------
# prepare test data
#------------------------------------------------------
texts, labels = read_data('dataset/test.txt')
input_ids, att_masks, labels = tokenization(texts, labels)
y = []
for l in range(len(labels)):
  tag = []
  for label in labels[l]:
    tag.append(tag2id.get(label))
  y.append(tag)

y = np.asarray(y, dtype='int32')
test_set = MyDataset(input_ids, att_masks, y)
test_loader = DataLoader(dataset=test_set, batch_size=1)

In [12]:
#------------------------------------------------------
# define models and loss function
#------------------------------------------------------
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = Bert_CRF(len(tags)).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
#------------------------------------------------------
# define optimizer and scheduler
#------------------------------------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=learn_rate)
steps_per_epoch = len(train_loader)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(num_train_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_steps)

In [14]:
#------------------------------------------------------
# train process
#------------------------------------------------------
for e in range(epochs):
  print('======== Epoch {:} / {:} ========'.format(e+1, epochs))
  #print('')
  model.train()
  batch_num = 0
  t0 = time.time()
  tr_loss = 0

  for batch in train_loader:
    batch_num += 1
    #clear_output(wait=True)
    print('\rBatch {:>5,} of {:>5,}'.format(batch_num, len(train_loader)),end='')

    iid = batch['input_ids'].to(device)
    att = batch['attention_mask'].to(device)
    y = batch['label'].to(device)

    #print('y.shape:',y.shape)

    logits = model(iid, att)
    logits_ = logits.permute(0,2,1)
    #if batch_num == 1:
      #print('logits.shape:',logits.shape)
      #print('logits_.shape',logits_.shape)
    loss = loss_fn(logits_, y)
    tr_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

  avg_train_loss = tr_loss/len(train_loader)
  training_time = format_time(time.time()-t0)
  print('')
  print("Average training loss: {0:.3f}".format(avg_train_loss))
  print("Training epcoh took: {:}".format(training_time))

Batch 2,813 of 2,813
Average training loss: 0.121
Training epcoh took: 0:17:21
Batch 2,813 of 2,813
Average training loss: 0.014
Training epcoh took: 0:17:22
Batch 2,813 of 2,813
Average training loss: 0.008
Training epcoh took: 0:17:21
Batch 2,813 of 2,813
Average training loss: 0.006
Training epcoh took: 0:17:23
Batch 2,813 of 2,813
Average training loss: 0.004
Training epcoh took: 0:17:23
Batch 2,813 of 2,813
Average training loss: 0.003
Training epcoh took: 0:17:21
Batch 2,813 of 2,813
Average training loss: 0.002
Training epcoh took: 0:17:22
Batch 2,813 of 2,813
Average training loss: 0.001
Training epcoh took: 0:17:22


In [17]:
save_path = '/content/gdirve/MyDrive/Colab_Notebooks/WebIntelligence/NER/model.pt'
torch.save(model.state_dict(), save_path)

In [35]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = Bert_CRF(len(tags)).to(device)
model.load_state_dict(torch.load('/content/gdirve/MyDrive/Colab_Notebooks/WebIntelligence/NER/model.pt'))

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [42]:
print("")
print("Running Test...")
y_true = []
y_pred = []
model.eval()
for batch in test_loader:
  iid = batch['input_ids'].to(device)
  att = batch['attention_mask'].to(device)
  y = batch['label'].to(device)

  with torch.no_grad():
    logits = model(iid, att)
  logits = torch.argmax(logits, dim=2)
  logits = torch.flatten(logits.cpu().detach()).numpy()
  y = torch.flatten(y.cpu().detach()).numpy()
  for i in range(len(logits)):
    y_pred.append(logits[i])
    y_true.append(y[i])
#print(classification_report(y_true, y_pred,target_names=tags))


Running Test...


In [None]:
print(y_true)

In [46]:
print(classification_report(y_true, y_pred,target_names=tags))

              precision    recall  f1-score   support

       B-LOC       0.98      0.96      0.97      2743
       I-LOC       0.97      0.95      0.96      4172
       B-ORG       0.92      0.94      0.93      1252
       I-ORG       0.94      0.96      0.95      5141
       B-PER       0.98      0.97      0.97      1349
       I-PER       0.97      0.97      0.97      2544
           O       1.00      1.00      1.00    145502
         CLS       1.00      1.00      1.00      3442
         SEP       1.00      0.99      1.00      3442
         PAD       1.00      1.00      1.00    270989

    accuracy                           1.00    440576
   macro avg       0.97      0.97      0.97    440576
weighted avg       1.00      1.00      1.00    440576

