In [1]:
import sys
# Extend the module search path so that the project-local imports in the next
# cell can be resolved from these sibling directories.
sys.path.append('../util')
sys.path.append('../experiments')

import os
# These variables are set before the next cell imports transformers, so they
# are already in place when the library is loaded.
# Disable weights and biases (if installed)
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Number GPUs by PCI bus id and expose only the device with index 2 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
from pathlib import Path
import transformers
import datasets
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline, DataCollatorForTokenClassification, EarlyStoppingCallback, trainer_utils
from huggingface_utils import load_custom_dataset, LabelAligner, compute_metrics, eval_on_test_set
from run_experiment import get_train_args
from convert_annotations import entity_values

In [3]:
# Quieten library logging for the rest of the notebook:
# detach the default handler of the transformers logger ...
transformers.logging.disable_default_handler()
# ... and restrict the datasets library to error-level messages.
datasets.logging.set_verbosity_error()

# Parameters

In [4]:
# Entity granularity: switch to 'coarse' to look at high-level entity classes only.
level = "fine"
# Span variant: switch to 'short' to consider short spans ignoring specifications.
spans = "long"

In [5]:
# Maps each (level, spans) combination to the name of its experiment config file.
config_files = {
    (granularity, span_variant): file_name
    for granularity, span_variant, file_name in [
        ('coarse', 'short', '01_ggponc_coarse_short.yaml'),
        ('fine', 'short', '02_ggponc_fine_short.yaml'),
        ('coarse', 'long', '03_ggponc_coarse_long.yaml'),
        ('fine', 'long', '04_ggponc_fine_long.yaml'),
    ]
}

In [6]:
import hydra
from hydra import compose, initialize

# Clear the global Hydra instance first (presumably so that re-running this
# cell does not trip over an earlier initialisation).
hydra.core.global_hydra.GlobalHydra.instance().clear()
initialize(
    job_name='foo',
    config_path=Path('..') / 'experiments',
)
# Compose the config selected by the (level, spans) parameters above.
config = compose(
    overrides=['cuda=0', 'link=false'],
    config_name=config_files[level, spans],
)

In [7]:
# Read the three dataset file entries from the composed config.
train_file, dev_file, test_file = (
    config[key] for key in ('train_dataset', 'dev_dataset', 'test_dataset')
)

# Set up IOB-encoded dataset with train / dev / test splits

In [8]:
# Load the three splits together with the tag list for the configured task.
dataset, tags = load_custom_dataset(
    train=train_file,
    dev=dev_file,
    test=test_file,
    tag_strings=config['task'],
)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
# Tokenizer that belongs to the configured base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    config['base_model_checkpoint'],
)
# The rest of the notebook requires a "fast" tokenizer; fail early otherwise.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [10]:
# Helper from huggingface_utils; its tokenize_and_align_labels method is
# applied to the dataset in the next cell.
label_aligner = LabelAligner(tokenizer)

In [11]:
# Tokenize every split in batches and align the labels with the resulting tokens.
dataset = dataset.map(
    lambda batch: label_aligner.tokenize_and_align_labels(batch, config['label_all_tokens']),
    batched=True,
)

  0%|          | 0/60 [00:00<?, ?ba/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

In [12]:
# Index -> tag string lookup, in the order given by `tags`.
id2label = {index: tag for index, tag in enumerate(tags)}
id2label

{0: 'O',
 1: 'B-Other_Finding',
 2: 'I-Other_Finding',
 3: 'B-Diagnosis_or_Pathology',
 4: 'I-Diagnosis_or_Pathology',
 5: 'B-Therapeutic',
 6: 'I-Therapeutic',
 7: 'B-Diagnostic',
 8: 'I-Diagnostic',
 9: 'B-Nutrient_or_Body_Substance',
 10: 'I-Nutrient_or_Body_Substance',
 11: 'B-External_Substance',
 12: 'I-External_Substance',
 13: 'B-Clinical_Drug',
 14: 'I-Clinical_Drug'}

In [13]:
# Display the DatasetDict to inspect the splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['_tags', 'attention_mask', 'fname', 'input_ids', 'labels', 'offset_mapping', 'sentence_id', 'special_tokens_mask', 'tags', 'token_type_ids', 'tokens'],
        num_rows: 59515
    })
    dev: Dataset({
        features: ['_tags', 'attention_mask', 'fname', 'input_ids', 'labels', 'offset_mapping', 'sentence_id', 'special_tokens_mask', 'tags', 'token_type_ids', 'tokens'],
        num_rows: 12770
    })
    test: Dataset({
        features: ['_tags', 'attention_mask', 'fname', 'input_ids', 'labels', 'offset_mapping', 'sentence_id', 'special_tokens_mask', 'tags', 'token_type_ids', 'tokens'],
        num_rows: 13714
    })
})

# Configure and train 🤗 token classification model

In [14]:
from run_experiment import get_train_args

In [15]:
# Override the number of training epochs for a shorter run.
# To train with the configured default instead (100 epochs according to the
# original note), skip this cell AND the next one: the next cell reads
# `num_train_epochs` and fails with a NameError if only this line is removed.
num_train_epochs = 10

In [16]:
# Store the epoch override in the config, which is later unpacked into
# get_train_args via **config.
config['num_train_epochs'] = num_train_epochs

In [17]:
# Build the training arguments from fixed notebook settings plus every entry
# of the composed config.
training_args = get_train_args(
    cp_path='../ner_results',
    run_name='ner_baseline',
    report_to=[],
    **config,
    resume_from_checkpoint=None,
)

INFO:run_experiment:ner_baseline


In [18]:
def model_init():
    """Create a fresh token-classification model from the configured base checkpoint.

    Passed to ``Trainer`` as ``model_init`` so that the trainer can instantiate
    the model itself.

    Returns:
        The model returned by ``AutoModelForTokenClassification.from_pretrained``
        with one output label per entry in ``tags``.
    """
    # Pass the inverse mapping as well: with only `id2label` given, the model
    # config keeps a placeholder `label2id` that disagrees with `id2label`.
    label2id = {label: index for index, label in id2label.items()}
    return AutoModelForTokenClassification.from_pretrained(
        config['base_model_checkpoint'],
        num_labels=len(tags),
        id2label=id2label,
        label2id=label2id,
    )

# Collator that assembles token-classification batches using the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer: trains on the "train" split and evaluates on the "dev" split.
tr = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["dev"],
    compute_metrics=compute_metrics(tags, True),
)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gb

In [19]:
# Run the training loop; the per-epoch metrics are shown in the table below.
train_result = tr.train()

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gb

Epoch,Training Loss,Validation Loss,Clinical Drug Precision,Clinical Drug Recall,Clinical Drug F1,Clinical Drug Number,Diagnosis Or Pathology Precision,Diagnosis Or Pathology Recall,Diagnosis Or Pathology F1,Diagnosis Or Pathology Number,Diagnostic Precision,Diagnostic Recall,Diagnostic F1,Diagnostic Number,External Substance Precision,External Substance Recall,External Substance F1,External Substance Number,Nutrient Or Body Substance Precision,Nutrient Or Body Substance Recall,Nutrient Or Body Substance F1,Nutrient Or Body Substance Number,Other Finding Precision,Other Finding Recall,Other Finding F1,Other Finding Number,Therapeutic Precision,Therapeutic Recall,Therapeutic F1,Therapeutic Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.2846,0.271811,0.635674,0.705804,0.668906,2257,0.69331,0.753041,0.721942,9042,0.632189,0.688016,0.658922,4122,0.422819,0.366279,0.392523,172,0.543046,0.449726,0.492,547,0.525882,0.55279,0.539001,6469,0.648824,0.743763,0.693057,7415,0.630842,0.687383,0.6579,0.906396
2,0.2114,0.274428,0.640379,0.719539,0.677655,2257,0.710551,0.766423,0.73743,9042,0.629046,0.716642,0.669993,4122,0.556962,0.255814,0.350598,172,0.476812,0.601463,0.531932,547,0.547244,0.584634,0.565321,6469,0.662788,0.760486,0.708284,7415,0.642149,0.709499,0.674146,0.90805
3,0.1546,0.290193,0.662226,0.682765,0.672339,2257,0.730948,0.749945,0.740324,9042,0.621566,0.729985,0.671427,4122,0.514851,0.302326,0.380952,172,0.528571,0.541133,0.534779,547,0.547465,0.600866,0.572924,6469,0.680358,0.759541,0.717772,7415,0.652567,0.706035,0.678249,0.908383
4,0.1103,0.34045,0.654647,0.72397,0.687566,2257,0.735212,0.753263,0.744128,9042,0.64641,0.725133,0.683512,4122,0.444444,0.325581,0.375839,172,0.481013,0.555759,0.515691,547,0.546799,0.628536,0.584826,6469,0.679458,0.771274,0.722461,7415,0.654008,0.718725,0.684841,0.908707
5,0.0816,0.378975,0.622974,0.766504,0.687326,2257,0.723618,0.764433,0.743466,9042,0.653846,0.713489,0.682367,4122,0.396825,0.290698,0.33557,172,0.535406,0.566728,0.550622,547,0.548034,0.62699,0.584859,6469,0.70356,0.732974,0.717966,7415,0.656317,0.713896,0.683896,0.907379
6,0.0593,0.405382,0.653743,0.754541,0.700535,2257,0.722981,0.772395,0.746872,9042,0.691792,0.701359,0.696543,4122,0.444444,0.372093,0.405063,172,0.479109,0.628885,0.543874,547,0.555172,0.622198,0.586777,6469,0.698942,0.757384,0.72699,7415,0.663588,0.720324,0.690793,0.910435
7,0.0436,0.447875,0.644928,0.749225,0.693175,2257,0.725029,0.773059,0.748274,9042,0.669247,0.713246,0.690546,4122,0.410256,0.27907,0.33218,172,0.542744,0.499086,0.52,547,0.555601,0.62328,0.587498,6469,0.696446,0.76116,0.727366,7415,0.662377,0.720024,0.689999,0.909418
8,0.0336,0.484566,0.662072,0.744794,0.701001,2257,0.733375,0.784229,0.75795,9042,0.676491,0.718341,0.696788,4122,0.433071,0.319767,0.367893,172,0.529412,0.559415,0.544,547,0.561578,0.618179,0.588521,6469,0.708806,0.757653,0.732416,7415,0.671398,0.723122,0.696301,0.911331
9,0.0249,0.517462,0.664671,0.737705,0.699286,2257,0.729002,0.784229,0.755608,9042,0.664149,0.725376,0.693414,4122,0.460177,0.302326,0.364912,172,0.539898,0.581353,0.559859,547,0.572106,0.615087,0.592819,6469,0.70606,0.757384,0.730822,7415,0.671232,0.723122,0.696211,0.910789
10,0.0178,0.543252,0.663473,0.736376,0.698026,2257,0.737875,0.777372,0.757109,9042,0.667189,0.722707,0.69384,4122,0.460177,0.302326,0.364912,172,0.534031,0.559415,0.546429,547,0.564251,0.619725,0.590688,6469,0.707669,0.760351,0.733065,7415,0.67201,0.721922,0.696072,0.910819


# Evaluate Model

In [20]:
# Keep a handle on the trainer's model; it is passed to the NER pipeline below.
model = tr.model

In [21]:
from transformers.pipelines.token_classification import AggregationStrategy

In [22]:
# Evaluate on the held-out test split; the metric keys carry the "test" prefix.
test_metrics = eval_on_test_set(
    dataset["test"],
    tr,
    tokenizer,
    "test",
)

13714it [00:05, 2515.14it/s]


In [23]:
# Print overall F1 / precision / recall on the test set, framed by blank lines.
print(
    "",
    f'F1: {test_metrics["test/overall_f1"]:.2f}',
    f' P: {test_metrics["test/overall_precision"]:.2f}',
    f' R: {test_metrics["test/overall_recall"]:.2f}',
    "",
    sep="\n",
)


F1: 0.71
 P: 0.69
 R: 0.73



### Detailed analysis of model performance

See notebook: [03_NER_Analysis](03_NER_Analysis.ipynb)

In [24]:
# Wrap the trained model in an NER pipeline on device 0, aggregating
# token-level predictions with the FIRST strategy.
p = pipeline(
    "ner",
    model,
    aggregation_strategy=AggregationStrategy.FIRST,
    device=0,
    tokenizer=tokenizer,
)

In [29]:
# Run the pipeline on a single German example sentence; the detected entity
# groups with scores and character offsets are displayed below.
p('Im perioperativen Outcome (Blutverlust, stationärer Aufenthalt, lokale Tumorkontrolle (allerdings kurzes Follow-up von 1 Jahr) wurden keine Unterschiede festgestellt.')

[{'entity_group': 'Other_Finding',
  'score': 0.961157,
  'word': 'perioperativen Outcome',
  'start': 3,
  'end': 25},
 {'entity_group': 'Diagnosis_or_Pathology',
  'score': 0.9992925,
  'word': 'Blutverlust',
  'start': 27,
  'end': 38},
 {'entity_group': 'Other_Finding',
  'score': 0.7595823,
  'word': 'lokale Tumorkontrolle',
  'start': 64,
  'end': 85},
 {'entity_group': 'Other_Finding',
  'score': 0.6436763,
  'word': '-',
  'start': 111,
  'end': 112}]