In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

# Read the token-level training table (columns shown below: Sentence #, Word,
# POS, Tag) and forward-fill missing cells.
# NOTE(review): presumably "Sentence #" is only written on the first token of
# each sentence in the CSV, which is why a forward fill is needed — confirm.
# `.ffill()` replaces `fillna(method="ffill")`, which is deprecated in pandas 2.x.
data = pd.read_csv("training_spacy.csv", encoding="latin1").ffill()
data.tail(10)


Unnamed: 0,Sentence #,Word,POS,Tag
2580034,Sentence: 18896,achieve,VB,O
2580035,Sentence: 18896,better,JJR,O
2580036,Sentence: 18896,urban,JJ,O
2580037,Sentence: 18896,management,NN,O
2580038,Sentence: 18896,and,CC,O
2580039,Sentence: 18896,developmental,JJ,O
2580040,Sentence: 18896,programs,NNS,O
2580041,Sentence: 18896,for,IN,O
2580042,Sentence: 18896,Kathmandu,NNP,A
2580043,Sentence: 18896,.,.,O


In [2]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of triples.

    The frame must have the columns "Sentence #", "Word", "POS" and "Tag".
    Rows sharing the same "Sentence #" value form one sentence, represented
    as a list of ``(word, pos, tag)`` tuples.

    Attributes:
        n_sent: 1-based counter of the next sentence returned by `get_next`.
        data: the DataFrame passed to the constructor.
        empty: always False here; kept for backward compatibility.
        grouped: Series indexed by "Sentence #" whose values are the triples.
        sentences: list of all sentences, in the group order of `grouped`.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Zip the three columns of one sentence group into triples.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence keyed "Sentence: <n_sent>" and advance the counter.

        Returns:
            The list of ``(word, pos, tag)`` triples, or None when no group
            with that key exists (the counter is then left unchanged).
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # (or a KeyboardInterrupt) must propagate instead of being hidden.
            return None
        self.n_sent += 1
        return s


In [3]:
# Build the per-sentence view of `data`; the grouping is done in the constructor.
getter = SentenceGetter(data)

In [4]:
# Keep only the word (first field) of every (word, POS, tag) triple.
sentences = [[word for word, _pos, _tag in sent] for sent in getter.sentences]
sentences[0]

['Architecturally',
 ',',
 'the',
 'school',
 'has',
 'a',
 'Catholic',
 'character',
 '.',
 'Atop',
 'the',
 'Main',
 'Building',
 "'s",
 'gold',
 'dome',
 'is',
 'a',
 'golden',
 'statue',
 'of',
 'the',
 'Virgin',
 'Mary',
 '.',
 'Immediately',
 'in',
 'front',
 'of',
 'the',
 'Main',
 'Building',
 'and',
 'facing',
 'it',
 ',',
 'is',
 'a',
 'copper',
 'statue',
 'of',
 'Christ',
 'with',
 'arms',
 'upraised',
 'with',
 'the',
 'legend',
 '"',
 'Venite',
 'Ad',
 'Me',
 'Omnes',
 '"',
 '.',
 'Next',
 'to',
 'the',
 'Main',
 'Building',
 'is',
 'the',
 'Basilica',
 'of',
 'the',
 'Sacred',
 'Heart',
 '.',
 'Immediately',
 'behind',
 'the',
 'basilica',
 'is',
 'the',
 'Grotto',
 ',',
 'a',
 'Marian',
 'place',
 'of',
 'prayer',
 'and',
 'reflection',
 '.',
 'It',
 'is',
 'a',
 'replica',
 'of',
 'the',
 'grotto',
 'at',
 'Lourdes',
 ',',
 'France',
 'where',
 'the',
 'Virgin',
 'Mary',
 'reputedly',
 'appeared',
 'to',
 'Saint',
 'Bernadette',
 'Soubirous',
 'in',
 '1858',
 '.',
 'At

In [5]:
# Keep only the tag (third field) of every (word, POS, tag) triple.
labels = [[tag for _word, _pos, tag in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'A', 'O', 'O', 'O', 'A', 'A', 'A', 'A', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'A', 'A', 'A', 'O', 'O', 'O', 'O', 'O', 'A', 'A', 'A', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'A', 'A', 'A', 'A', 'O', 'O', 'O', 'O', 'A', 'A', 'A', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'A', 'O', 'O', 'A', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'A', 'O', 'A', 'A', 'A', 'O', 'O', 'O', 'A', 'A', 'A', 'O', 'A', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'A', 'O', 'O', 'A', 'A', 'A', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [6]:
# Build the tag vocabulary plus a "PAD" label for padded positions.
# The iteration order of a set of strings is not stable across interpreter
# runs, so sort the list: the tag -> index mapping must be reproducible and
# must match the sorted vocabulary rebuilt in the prediction cell, otherwise
# the trained model's label ids are decoded to the wrong tags.
tag_values = list(set(data["Tag"].values))
tag_values.append("PAD")
tag_values.sort()
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [7]:
# PyTorch and the dataset / loader / sampler classes used to batch the data.
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# BERT tokenizer and config classes from Hugging Face transformers.
from transformers import BertTokenizer, BertConfig

# Sequence padding helper (Keras) and the train/validation splitter (scikit-learn).
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Display the installed PyTorch version.
torch.__version__

'1.7.1+cu110'

In [8]:
# Length every token-id and tag sequence is padded or truncated to.
MAX_LEN = 75
# Batch size used by the training and validation DataLoaders.
bs = 32

In [9]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show the device in use. `get_device_name(0)` raises without a CUDA device,
# so only query it when CUDA is actually available.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'GeForce GTX 1080 Ti'

In [10]:
# Load the cased BERT WordPiece tokenizer; lower-casing is disabled so the
# tokens keep their original capitalisation.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [11]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Tokenize a sentence word by word, repeating each label per sub-token.

    Every word is passed to the module-level `tokenizer`; the word's label is
    duplicated once for each sub-token the tokenizer returns, so the two
    output lists always have the same length.

    Args:
        sentence: iterable of word strings.
        text_labels: iterable of labels, aligned with `sentence`.

    Returns:
        A ``(tokens, labels)`` tuple of two flat lists.
    """
    pieces_out, labels_out = [], []
    for word, label in zip(sentence, text_labels):
        pieces = tokenizer.tokenize(word)
        pieces_out += pieces
        # One copy of the word-level label for each sub-token.
        labels_out += [label] * len(pieces)
    return pieces_out, labels_out

In [12]:
# Sub-word tokenize every sentence together with its labels; each element is
# the (tokens, labels) pair returned by tokenize_and_preserve_labels.
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [13]:
# Split the (tokens, labels) pairs into two parallel lists; `labels` is
# rebound to the sub-token-level labels.
tokenized_texts = [toks for toks, _labs in tokenized_texts_and_labels]
labels = [labs for _toks, labs in tokenized_texts_and_labels]

In [14]:
# Convert each token list to vocabulary ids, then pad with 0 / truncate at the
# end ("post") so every row has exactly MAX_LEN entries.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [15]:
# Map every label to its index and pad / truncate at the end to MAX_LEN,
# filling padded positions with the index of the "PAD" tag.
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [16]:
# Attention mask: 1.0 where the token id is non-zero, 0.0 where it is 0
# (0 is the value used for padding in `input_ids`).
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]


In [17]:
# Hold out 10% of the sentences for validation.
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
# Split the masks with the same random_state and test_size so they are meant to
# line up with the inputs above; `input_ids` is passed only as a second array
# and its halves are discarded.
# NOTE(review): relies on train_test_split producing the same permutation for
# an identical seed and length — confirm.
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [18]:
# Convert every split (inputs, tags, masks) into torch tensors.
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [19]:
# Training batches: (input ids, attention mask, tags), drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# Validation batches: same tuple layout, iterated in a fixed sequential order.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

#### Set up the BERT model for fine-tuning

In [7]:
# Token-classification BERT head and the AdamW optimizer shipped with transformers.
import transformers
from transformers import BertForTokenClassification, AdamW

# Display the installed transformers version.
transformers.__version__


'2.6.0'

In [None]:
# Load pretrained cased BERT with a token-classification head that has one
# output per entry of tag2idx (this includes the "PAD" tag). Attention maps
# and hidden states are not returned.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [24]:
# Move the model to the device chosen earlier (GPU when available, else CPU)
# instead of hard-coding CUDA, so the notebook also runs on CPU-only machines.
# The trailing semicolon suppresses the module repr in the cell output.
model.to(device);

In [25]:
# True: fine-tune every weight of the model; False: train only the classifier head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are excluded from weight decay. The PyTorch
    # parameters are named "...LayerNorm.weight" / "...bias"; the TensorFlow-era
    # names 'gamma' / 'beta' never match them.
    no_decay = ['bias', 'LayerNorm.weight']
    # The option key must be 'weight_decay': AdamW does not read
    # 'weight_decay_rate', so with that key no decay was applied at all.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [26]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set.
epochs = 3
# Gradient-norm ceiling used for clipping in the training loop.
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (no warmup steps; it is stepped once per
# batch in the training loop).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

### Fit BERT for named entity recognition

In [27]:
from seqeval.metrics import f1_score, accuracy_score
from sklearn import metrics

In [30]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    pred ,val= [],[]
    for a,b in zip(pred_tags,valid_tags):
        if a == 'A':
            pred.append(1)
        else:
            pred.append(0)
        if b == 'A':
            val.append(1)
        else:
            val.append(0)
    print("pred_tags = ",len(pred_tags))
    print("valid_tags = ",len(valid_tags))
    print("Precision score:{}".format(metrics.precision_score(val,pred)))
    print("Recall score :{}".format(metrics.recall_score(val,pred)))
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Average train loss: 0.13756529138864657
Validation loss: 0.1079263641188542
pred_tags =  139019
valid_tags =  139019
Precision score:0.845022335673261


Epoch:  33%|███▎      | 1/3 [02:05<04:11, 125.58s/it]

Recall score :0.9467681967681968
Validation Accuracy: 0.954351563455355

Average train loss: 0.0941822258265395
Validation loss: 0.10190932614107927
pred_tags =  139019
valid_tags =  139019
Precision score:0.8495690614763223


Epoch:  67%|██████▋   | 2/3 [04:12<02:06, 126.28s/it]

Recall score :0.954990704990705
Validation Accuracy: 0.9569195577582921

Average train loss: 0.07862806848173302
Validation loss: 0.10676337679227194
pred_tags =  139019
valid_tags =  139019
Precision score:0.8622517423027845


Epoch: 100%|██████████| 3/3 [06:20<00:00, 126.71s/it]

Recall score :0.9421206921206922
Validation Accuracy: 0.9580704795747343






In [44]:
# Toy sanity check of the precision / recall computation: every true tag is
# 'A' and half of the predictions are 'A'.
pred_tags, valid_tags = ['A', 'O', 'A', 'O'], ['A', 'A', 'A', 'A']
pred, val = [], []
for a, b in zip(pred_tags, valid_tags):
    # 'A' is the positive class (1); any other tag counts as 0.
    pred.append(1 if a == 'A' else 0)
    val.append(1 if b == 'A' else 0)
print("pred = ",pred)
print("val = ",val)
print("Precision score:{}".format(metrics.precision_score(val,pred)))
print("Recall score :{}".format(metrics.recall_score(val,pred)))

pred =  [1, 0, 1, 0]
val =  [1, 1, 1, 1]
Precision score:1.0
Recall score :0.5


### Evaluate

In [1]:
# Imports for the evaluation section.
# NOTE(review): the execution counter restarts at 1 here, so this section
# appears to have been run in a fresh kernel, separate from training.
import json
import csv
import spacy
from spacy.tokenizer import Tokenizer
import re
from tqdm import tqdm
import pandas as pd

# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

In [2]:
def search(tokens, t):
    """Return the index of the first occurrence of the sequence `t` in `tokens`.

    Args:
        tokens: sequence of items to search in.
        t: sequence of items to look for (compared element by element).

    Returns:
        The 0-based start index of the first full match, or -1 when `t` does
        not occur in `tokens` or `t` is empty.
    """
    width = len(t)
    if width == 0:
        # Nothing to look for; previously this crashed on t[0].
        return -1
    # The last valid start is len(tokens) - width, inclusive: a match that ends
    # exactly on the final token must be found (the old `<` test skipped it).
    for index in range(len(tokens) - width + 1):
        if all(tokens[index + j] == t[j] for j in range(width)):
            return index
    return -1

In [3]:
import json
import csv
import spacy
from tqdm import tqdm
from spacy.tokenizer import Tokenizer
import pandas as pd
from pandas.core.frame import DataFrame
import string

def _tag_phrases(tokens, phrases, tokenize):
    """Mark every token covered by one of `phrases` with 'A', all others with 'O'.

    Phrases are located left to right: each phrase is searched for starting at
    the position where the previous phrase was found, and the first exact
    token-level match wins. A phrase that cannot be found (or that tokenizes
    to nothing) is skipped and leaves the tags and the search position
    untouched, so it can no longer mis-tag the start of the paragraph.

    Args:
        tokens: list of token strings for the whole paragraph.
        phrases: iterable of raw phrase strings, in document order.
        tokenize: callable mapping a phrase string to its list of token strings.

    Returns:
        A list of 'A'/'O' strings with the same length as `tokens`.
    """
    tags = ['O'] * len(tokens)
    start = 0
    for phrase in phrases:
        phrase_tokens = tokenize(phrase)
        width = len(phrase_tokens)
        if width == 0:
            continue
        # Bounding the range keeps every slice inside `tokens`.
        match = next(
            (i for i in range(start, len(tokens) - width + 1)
             if tokens[i:i + width] == phrase_tokens),
            None,
        )
        if match is None:
            continue
        tags[match:match + width] = ['A'] * width
        start = match
    return tags


def _spacy_tokens(text):
    """Tokenize `text` with the module-level spaCy pipeline; return token strings."""
    return [t.text for t in nlp(text)]


nlp = spacy.load("en_core_web_sm")
with open('dev-v1.1.json','r',encoding="utf-8") as readfile:
    article = json.load(readfile)

sentence_num = 0
valid_tokens = []
valid_labels = []
# Only the first article of the dev set is processed.
for d in tqdm(article['data'][:1]):
    for para in d['paragraphs']:
        sentence_num += 1
        doc = nlp(para['context'])
        tokens = [t0.text for t0 in doc]  # tokenization of the paragraph
        pos = [token.tag_ for token in doc]  # NOTE(review): not used in this cell

        # SQuAD answers: keep the longest answer text per start offset.
        # (Named so that the builtins `dict` and `tuple` are not shadowed,
        # which would break later cells that call tuple(...).)
        answers_by_start = {}
        for qa in para['qas']:
            for ans in qa['answers']:
                offset = ans['answer_start']
                text = ans['text']
                if (offset not in answers_by_start
                        or len(answers_by_start[offset]) < len(text)):
                    answers_by_start[offset] = text
        squad_phrases = [answers_by_start[offset] for offset in sorted(answers_by_start)]
        # Whether each token belongs to a SQuAD answer.
        tags = _tag_phrases(tokens, squad_phrases, _spacy_tokens)

        # spaCy named entities, treated as additional answer candidates.
        ents = [e.text for e in doc.ents]
        tags2 = _tag_phrases(tokens, ents, _spacy_tokens)

        # Combine: a token is 'A' when either source marked it.
        tag_final = ['A' if a == 'A' or b == 'A' else 'O'
                     for a, b in zip(tags, tags2)]
        valid_labels.extend(tag_final)
        valid_tokens.extend(tokens)
z = {"tokens":valid_tokens,"tags":valid_labels}
df = pd.DataFrame(z)
pd.set_option("display.max_rows", None, "display.max_columns", None)
# print(df)
print(len(valid_labels),len(valid_tokens))


100%|██████████| 1/1 [00:05<00:00,  5.04s/it]

6656 6656





### Predict

In [5]:
import spacy     
import string 
import csv                                            
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from time import sleep
import nltk
import argparse
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig, BertModel
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import joblib
import json

parser = argparse.ArgumentParser()  # NOTE(review): never used in this cell

# Rebuild the tag vocabulary exactly as for training (sorted, so the index ->
# tag mapping is deterministic). `.ffill()` replaces the deprecated
# `fillna(method="ffill")`.
data = pd.read_csv("training_spacy.csv", encoding="latin1").ffill()
tag_values = list(set(data["Tag"].values))
tag_values.append("PAD")
tag_values.sort()
tag2idx = {t: i for i, t in enumerate(tag_values)}
print(tag_values)
print(tag2idx)



MAX_LEN = 75
bs = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises without a CUDA device, so guard it.
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
filename = 'finalized_model.sav'
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files from a trusted source.
# NOTE(review): a pickled model also needs the same transformers version it was
# saved with (see the ModuleNotFoundError in this cell's output).
model = joblib.load(filename)
model.to(device)
model.eval()
sen_num = 1
result = {}
result['data'] = []
predict_tokens = []
predict_labels = []
with open('dev-v1.1.json' ,'r',encoding = 'utf-8') as readfile:
    article = json.load(readfile)

sentence_num = 0
# Only the first article of the dev set is processed.
for p in tqdm(article['data'][:1]):
    for ar in p['paragraphs']:
        temp = {}
        temp_tokens = []
        temp_labels = []
        sentence_num += 1
        context = ar['context']
        sentence = nltk.sent_tokenize(context)
        temp['context'] = ar['context']

        for s in sentence:
            new_tokens, new_labels = [], []
            tokenized_sentence = tokenizer.encode(s)
            # Use the selected device instead of hard-coding .cuda(), so the
            # input always lives on the same device as the model.
            input_ids = torch.tensor([tokenized_sentence]).to(device)
            with torch.no_grad():
                output = model(input_ids)
            label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
            # Join WordPiece continuations ("##...") back onto the previous
            # token; the merged word keeps the label of its first piece.
            tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
            for token, label_idx in zip(tokens, label_indices[0]):
                if token.startswith("##"):
                    new_tokens[-1] = new_tokens[-1] + token[2:]
                else:
                    new_labels.append(tag_values[label_idx])
                    new_tokens.append(token)
            # Drop the first and last entries.
            # NOTE(review): presumably the [CLS] / [SEP] markers added by
            # tokenizer.encode — confirm.
            new_tokens = new_tokens[1:-1]
            new_labels = new_labels[1:-1]
            temp_tokens.extend(new_tokens)
            temp_labels.extend(new_labels)
        predict_tokens.extend(temp_tokens)
        predict_labels.extend(temp_labels)
z = {"tokens":predict_tokens,"tags":predict_labels}
df = pd.DataFrame(z)
pd.set_option("display.max_rows", None, "display.max_columns", None)
# print(df)
print(len(predict_tokens),len(predict_labels))

['A', 'O', 'PAD']
{'A': 0, 'O': 1, 'PAD': 2}
GeForce GTX 1080 Ti


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

ModuleNotFoundError: No module named 'transformers.models'

#### Visualize the training loss

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve: the per-epoch average training and validation
# losses collected by the training loop.
plt.plot(loss_values, 'b-o', label="training loss")
plt.plot(validation_loss_values, 'r-o', label="validation loss")

# Label the plot.
plt.title("Learning curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()