https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
data = pd.read_csv("ner_dataset.csv", encoding = "latin1").fillna(method = "ffill")
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [3]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,p,t) for w, p, t in zip (s['Word'].values.tolist(),
                                                          s["POS"].values.tolist(),
                                                          s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [4]:
getter = SentenceGetter(data)

In [5]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]

In [6]:
sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [7]:
labels = [[s[2] for s in sentence] for sentence in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [8]:
tag_values = list(set(data['Tag'].values))
tag_values.append("Pad")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [33]:
tag_values

['I-per',
 'B-tim',
 'B-eve',
 'I-eve',
 'B-nat',
 'B-org',
 'B-per',
 'I-gpe',
 'B-gpe',
 'I-geo',
 'O',
 'B-geo',
 'I-art',
 'I-tim',
 'I-org',
 'I-nat',
 'B-art',
 'Pad']

In [32]:
tag2idx

{'I-per': 0,
 'B-tim': 1,
 'B-eve': 2,
 'I-eve': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'I-gpe': 7,
 'B-gpe': 8,
 'I-geo': 9,
 'O': 10,
 'B-geo': 11,
 'I-art': 12,
 'I-tim': 13,
 'I-org': 14,
 'I-nat': 15,
 'B-art': 16,
 'Pad': 17}

In [9]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

Using TensorFlow backend.


'1.5.0'

In [10]:
MAX_LEN = 75
bs = 32

In [11]:
torch.cuda.is_available()

False

In [12]:
device = torch.device("cpu")

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case = False)

In [14]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []
    
    for word, label in zip(sentence, text_labels):
        #Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        
        #Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)
        
        #Add the same label to the new list of labels 'n_subwords' times
        labels.extend([label]*n_subwords)
        
    return tokenized_sentence, labels

In [15]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, labels)]

In [16]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [40]:
print(type(tokenized_texts[5][1]))

<class 'str'>


In [17]:
#Next we cut and pad the token and label sequ. to our des. length
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                         maxlen = MAX_LEN, dtype = 'long', truncating = "post", padding = "post")

In [18]:
print(input_ids)

[[26159  1104  8568 ...     0     0     0]
 [ 7239  3878  1474 ...     0     0     0]
 [ 1124  8031  4184 ...     0     0     0]
 ...
 [ 2485  3398   112 ...     0     0     0]
 [ 1967  1173   117 ...     0     0     0]
 [ 1109  1244  3854 ...     0     0     0]]


In [19]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                    maxlen = MAX_LEN, value = tag2idx["Pad"], padding = "post",
                    dtype='long', truncating = 'post')

In [41]:
print(tags)

[[10 10 10 ... 17 17 17]
 [ 8 10 10 ... 17 17 17]
 [10 10 10 ... 17 17 17]
 ...
 [10 11 10 ... 17 17 17]
 [10 10 10 ... 17 17 17]
 [10  5 14 ... 17 17 17]]


In [20]:
#These allow us to mask anything called a pad.
attention_masks = [[float(i != tag2idx['Pad']) for i in ii] for ii in input_ids]

In [21]:
#Now build our train test splits
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state = 2018, test_size = 0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                            random_state = 2018, test_size = 0.1)

In [22]:
#Ok, now convert to torch tensors for working in pytorch
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
val_tags = torch.tensor(val_tags)
tr_tags = torch.tensor(tr_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [34]:
print(tr_tags)

tensor([[10, 10, 10,  ..., 17, 17, 17],
        [ 5, 10, 10,  ..., 17, 17, 17],
        [ 5,  5,  5,  ..., 17, 17, 17],
        ...,
        [ 5, 10, 10,  ..., 17, 17, 17],
        [10, 10, 10,  ..., 17, 17, 17],
        [10,  8, 10,  ..., 17, 17, 17]], dtype=torch.int32)


In [23]:
print(tr_inputs)

tensor([[ 1335,  1655,  1421,  ...,     0,     0,     0],
        [15769,  1163,  1199,  ...,     0,     0,     0],
        [  138,  2315,  2430,  ...,     0,     0,     0],
        ...,
        [16228,  1144,   170,  ...,     0,     0,     0],
        [23077,   117,  1126,  ...,     0,     0,     0],
        [ 1109,  1938,  2078,  ...,     0,     0,     0]], dtype=torch.int32)


Ok, now the final step is to define the dataloaders. We shuffle the data at training time with the `RandomSampler` and at test time we just pass them sequentially using the `SequentialSampler` functions we imported above

In [24]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = bs)


valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler = valid_sampler, batch_size =  bs)


Ok, now we need to set up the BERT model and fine tune it on our dataset. 

Here's what the demo says:
The transformer package provides a BertForTokenClassification class for token-level predictions. BertForTokenClassification is a fine-tuning model that wraps BertModel and adds token-level classifier on top of the BertModel. The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. We load the pre-trained bert-base-cased model and provide the number of possible labels.

In [25]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__

'2.8.0'

In [26]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




In [27]:
#model.cpu()

Ok, before we can setup the fine-tuning training of the BERT model, we need to define optimizers and add parameters for the training to update. A common choice is the AdamW optimizer (IDK WHY). 

We'll add weight_decay as regularization to the main weight matrices -- This is a way to prevent overfitting - If we reduce the metaphorical "weight" with which we are descending in a gradient descent algorithm, we essentially slow down the rate as we get closer and closer to the metaphorical bottom of our potential space. 

It says we can also just try to train a linear classifier on top of BERT if we are low on computational resources. In other words, keep BERT's values fixed, but train one more layer that reads in the output of BERT instead. 

In [28]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,        
    lr=3e-5,
    eps=1e-8
)

We can then add a scheduler that reduces the learning rate as it goes through epochs of training.

In [29]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    
    num_training_steps=total_steps
)

Ok, now it's finally time to fit BERT for our NER task!

We'll start by defining some metrics which we'll keep track of while training. We'll keep it simnple and use the f1_score which is a binary classification scorer ---IDK IF WE CAN USE THIS SINCE WE'RE NOT CLASSIFYING BINARIES, BUT WE CAN LOOK INTO OPTIONS ---- We'll use simple accuracy on a token level.

In [30]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat)/len(labels_flat)

God damn, ok, for real for real, lets train the model. The original BERT paper I think says 3-4 epochs of training.

In [42]:
#Store the average loss after each epoch to plot
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    #Perform one full pass over the training set
    
    #Put the model in training mode:
    model.train()
        # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        print(b_input_ids)
        print(b_input_mask)
        
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()    
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
        
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    
    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        eval_accuracy += flat_accuracy(logits, label_ids)
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    
    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tag_values[p_i] for p in predictions for p_i in p]
    valid_tags = [tag_values[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

tensor([[ 5572,   112,   188,  ...,     0,     0,     0],
        [ 4258, 10233,  1163,  ...,     0,     0,     0],
        [19597,   112,   188,  ...,     0,     0,     0],
        ...,
        [ 1135,  1108, 15833,  ...,     0,     0,     0],
        [ 1109,  1735,  1913,  ...,     0,     0,     0],
        [16409, 17786,  1116,  ...,     0,     0,     0]], dtype=torch.int32)
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.IntTensor instead (while checking arguments for embedding)