In [190]:
# !pip install torch

In [191]:
import torch

In [192]:
torch.cuda.is_available()

True

In [193]:
# !pip install transformers
# !pip install accelerate
# !pip install evaluate
# !pip install seqeval

# # upgrade
# !pip install accelerate --upgrade

In [194]:
# !pip install datasets
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification

In [195]:
import tensorflow as tf
from tensorflow import keras

In [196]:
import numpy as np

In [197]:
# !pip install --upgrade gensim

word2vec-google-news-300 is a pretrained model. Here are the key details about it:
- Model Type: Word2Vec
- Training Data: Google News dataset
- Vector Dimensions: 300
- Training Algorithm: Word2Vec (Continuous Bag of Words (CBOW) and Skip-gram models)

In [198]:
import codecs

In [199]:
# import gensim.downloader
# # Show all available models in gensim-data
# print(list(gensim.downloader.info()['models'].keys()))


In [200]:
# Download the "glove-twitter-25" embeddings
# gnews_vec = gensim.downloader.load('word2vec-google-news-300')

In [201]:
# #take a look at the word embedding of some words
# gnews_vec['king']

In [202]:
def load_dataset(file):
    with open (file, 'r') as file_read:
        lines = file_read.readlines()

    sentences = []
    sentence = []

    for line in lines:

        if line == "\n":
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            sentence.append(word)
    return sentences

In [203]:
train_data = load_dataset("eng.train")
val_data = load_dataset("eng.testa")
test_data= load_dataset("eng.testb")

In [204]:
#sort out all different types of BIO tags
tags_list = []
for sentence in train_data:
  for word in sentence:
    if word != '\n':
        tags_list.append(word[-1])

In [205]:
label_set = list(set(tags_list))

In [206]:
label_set

['I-MISC', 'B-LOC', 'I-PER', 'B-MISC', 'I-ORG', 'B-ORG', 'I-LOC', 'O']

In [207]:
label_to_id_dict = {label:i for i, label in enumerate(label_set)}
id_to_label_dict = {value:key for key, value in label_to_id_dict.items()}

In [208]:
id_to_label_dict

{0: 'I-MISC',
 1: 'B-LOC',
 2: 'I-PER',
 3: 'B-MISC',
 4: 'I-ORG',
 5: 'B-ORG',
 6: 'I-LOC',
 7: 'O'}

In [209]:
#from text file, extract tokens and labels (converted into nums)
def extract_tokens_labels (content, label_dict):
    extract_dict = {"tokens":[], "labels":[]}
    # a single sentence as a value
    for sentence in content:
        sentence_words = []
        sent_label = []
        for word in sentence:
            sentence_words.append(word[0])
            sent_label.append(label_dict[word[-1]])
        extract_dict["tokens"].append(sentence_words)
        extract_dict["labels"].append(sent_label)
    return Dataset.from_dict(extract_dict)

In [210]:
train_dataset = extract_tokens_labels(train_data, label_to_id_dict)
val_dataset = extract_tokens_labels(val_data, label_to_id_dict)
test_dataset = extract_tokens_labels(test_data, label_to_id_dict)

In [211]:
datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

In [212]:
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_set))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [213]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [214]:
#Here, for the sub tokens that come from the same word, we just assign -100, so that they don't contribute heavily to loss
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    #after tokenizing, there might be more tokens than before tokenizing. Here, we align tokens and labels

    labels = []
    #the labels corresponding to the tokens
    for i, label in enumerate(examples["labels"]):
        #word_ids return the position of the words in the original sentence.multiple tokens could come from the same position after being tokenized into sub tokens
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # meaning that it could be a special token: [CLS] (start of sentence), [SEP] (end of sentence), [PAD]
            if word_idx is None:
                label_ids.append(-100)
            #the sub-token doesn't come from the the same word as the previous sub-token, then we just take the corresponding label (look at the position of the word)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            #the sub-token comes from the the same word as the previous sub-token, append -100 as label_id to label
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [215]:
datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [216]:
metric = evaluate.load("seqeval")

In [217]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_set[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_set[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [218]:
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer, padding = 'longest', return_tensors = 'pt')

In [219]:
train_dataset

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 14041
})

In [220]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [221]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)



In [222]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0448,0.036302,0.934005,0.936048,0.935026,0.989954
2,0.0195,0.036507,0.948993,0.951868,0.950428,0.991881
3,0.0115,0.035312,0.949431,0.954224,0.951821,0.992232


TrainOutput(global_step=5268, training_loss=0.03771038791313258, metrics={'train_runtime': 1171.8069, 'train_samples_per_second': 35.947, 'train_steps_per_second': 4.496, 'total_flos': 2694549004634928.0, 'train_loss': 0.03771038791313258, 'epoch': 3.0})

In [223]:
trainer.push_to_hub(commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/jh-hoo/bert-finetuned-ner/commit/4d995e16b7fa4f629b5ac6ba5f8b8335a0f62afd', commit_message='Training complete', commit_description='', oid='4d995e16b7fa4f629b5ac6ba5f8b8335a0f62afd', pr_url=None, pr_revision=None, pr_num=None)

In [237]:
trainer.evaluate(eval_dataset = datasets["test"])

{'eval_loss': 0.13545821607112885,
 'eval_precision': 0.9064082416623014,
 'eval_recall': 0.9190864022662889,
 'eval_f1': 0.9127032967032966,
 'eval_accuracy': 0.9830300419941854,
 'eval_runtime': 27.3561,
 'eval_samples_per_second': 126.224,
 'eval_steps_per_second': 15.792,
 'epoch': 3.0}

In [235]:
# #Using evaluator to evaluate on test set, there might be a certain format for test dataset to follow
# from evaluate import evaluator
# task_evaluator = evaluator("token-classification")

# results = task_evaluator.compute(
#     model_or_pipeline="jh-hoo/bert-finetuned-ner",
#     data = test_eval,
#     metric="seqeval",
# )

NotImplementedError: References provided as integers, but the reference column is not a Sequence of ClassLabels.

In [224]:
# dealing with BERT tokenizer `
# https://discuss.huggingface.co/t/how-to-deal-with-differences-between-conll-2003-dataset-tokenisation-and-ber-tokeniser-when-fine-tuning-ner-model/11129

In [225]:
# labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
# #reffering to labels array, for those filled with num != -100, it will return the content of the label array
# #for filled with num == -100, it will return -100 back

In [226]:
# tokenizer

In [227]:
# print(tokenizer.cls_token)

**metrics: look into individual class and interpret**