In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
from data_preprocessing import Get_and_process_data
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments, 
                          Trainer)

In [2]:

# Run configuration shared by the cells below.
model_checkpoint = "allenai/scibert_scivocab_uncased"
# Alternative backbone, kept for reference:
# model_checkpoint = "giacomomiolo/electramed_base_scivocab_1M"
task = "ner"  # Should be one of "ner", "pos" or "chunk"
batch_size = 64  # used as the per-device batch size for training and evaluation

# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
# Build the dataset splits and the label vocabulary with the project's loader.
# NOTE(review): train_split=0.95 presumably keeps 5% of the labelled data for
# validation — confirm in data_preprocessing.
data_loader = Get_and_process_data(tokenizer, train_split=0.95, add_unlabeled=True)
D = data_loader.get_dataset()  # indexed below with "train", "val" and "test"
label_list = data_loader.get_label_list()  # label strings; position = class id

Loading raw text: 100%|██████████| 170/170 [00:05<00:00, 31.92it/s]
Processing raw text: 170it [00:00, 208.56it/s]
Adding unlabeled lines: 8454it [00:00, 10016.73it/s]
Formatting dataset: 16525it [00:02, 6059.84it/s]
Using custom data configuration default-958a72c76e9cc4be


Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-958a72c76e9cc4be/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-958a72c76e9cc4be/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15593 [00:00<?, ?ex/s]

  0%|          | 0/821 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/json/default-958a72c76e9cc4be/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-97e9e0a61cbfadce.arrow
Loading raw text for test: 100%|██████████| 128/128 [00:08<00:00, 14.79it/s]
Formatting test data: 14146it [00:00, 20424.61it/s]
Using custom data configuration default-c62dbd7f379fcfdc


Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-c62dbd7f379fcfdc/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-c62dbd7f379fcfdc/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/14146 [00:00<?, ?ex/s]

In [4]:
# model = AutoModelForTokenClassification.from_pretrained("logs/model_60_warmup_epochs")

In [5]:
# Pretrained encoder plus a token-classification head with one output per label.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

### Training

In [6]:
# Name the run after the checkpoint (without its hub namespace) and the task.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"training_logs/{model_name}-finetuned-{task}",
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.05,
    warmup_ratio=0.1,
    logging_steps=5,
)

# Batches are assembled by the token-classification collator for this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Sequence-labelling metric used by compute_metrics and the evaluation cells.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores whose class axis is axis 2, ``labels`` holds the gold class
            ids, with ``-100`` marking positions to ignore.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries of the metric result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real label (drop the ignored index -100).
        kept_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [7]:
# Wire the model, hyper-parameters, data splits and metric function together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=D["train"],
    eval_dataset=D["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [8]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: treatment, conditional_indices_end, associated_with_someone_else, absent, possible_indices_start, filename, hypothetical_indices_start, present, test_indices_start, associated_with_someone_else_indices_start, offset_mapping, treatment_indices_start, absent_indices_end, hypothetical_indices_end, treatment_indices_end, associated_with_someone_else_indices_end, possible, row, hypothetical, present_indices_start, present_indices_end, test_indices_end, test, absent_indices_start, conditional, text, conditional_indices_start, possible_indices_end.
***** Running training *****
  Num examples = 15593
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 4880


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5184,0.405161,0.577297,0.581066,0.579176,0.881732
2,0.2606,0.1928,0.701736,0.747552,0.72392,0.939671
3,0.1319,0.15253,0.769866,0.811752,0.790254,0.953509
4,0.0899,0.148893,0.793031,0.817193,0.80493,0.953808
5,0.0817,0.138764,0.825532,0.844396,0.834857,0.959881
6,0.0745,0.142105,0.827623,0.841132,0.834323,0.960279
7,0.0912,0.157457,0.825212,0.847661,0.836286,0.960876
8,0.0364,0.160283,0.829268,0.850925,0.839957,0.960677
9,0.0483,0.171127,0.829268,0.850925,0.839957,0.961872
10,0.0416,0.172824,0.841151,0.858542,0.849758,0.962867


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: treatment, conditional_indices_end, associated_with_someone_else, absent, possible_indices_start, filename, hypothetical_indices_start, present, test_indices_start, associated_with_someone_else_indices_start, offset_mapping, treatment_indices_start, absent_indices_end, hypothetical_indices_end, treatment_indices_end, associated_with_someone_else_indices_end, possible, row, hypothetical, present_indices_start, present_indices_end, test_indices_end, test, absent_indices_start, conditional, text, conditional_indices_start, possible_indices_end.
***** Running Evaluation *****
  Num examples = 821
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: treatment, conditional_indices_end, ass

TrainOutput(global_step=4880, training_loss=0.12683728897531868, metrics={'train_runtime': 1215.6668, 'train_samples_per_second': 256.534, 'train_steps_per_second': 4.014, 'total_flos': 9950340615467940.0, 'train_loss': 0.12683728897531868, 'epoch': 20.0})

In [9]:
# trainer.model = AutoModelForTokenClassification.from_pretrained("./training_logs/electramed_base_scivocab_1M-finetuned-ner/checkpoint-2500", local_files_only=True)
# trainer.model.to("cuda")

### Performance on validation

In [10]:
# Per-class and overall seqeval scores on the validation split.
predictions, labels, _ = trainer.predict(D["val"])
predictions = predictions.argmax(axis=2)  # class scores -> predicted class ids

# Positions labelled -100 are ignored; everything else is mapped to label strings.
true_predictions = [
    [
        label_list[predicted_id]
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_list[gold_id] for gold_id in gold_row if gold_id != -100]
    for _, gold_row in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: treatment, conditional_indices_end, associated_with_someone_else, absent, possible_indices_start, filename, hypothetical_indices_start, present, test_indices_start, associated_with_someone_else_indices_start, offset_mapping, treatment_indices_start, absent_indices_end, hypothetical_indices_end, treatment_indices_end, associated_with_someone_else_indices_end, possible, row, hypothetical, present_indices_start, present_indices_end, test_indices_end, test, absent_indices_start, conditional, text, conditional_indices_start, possible_indices_end.
***** Running Prediction *****
  Num examples = 821
  Batch size = 64


{'ABSENT': {'precision': 0.9452054794520548,
  'recall': 0.9078947368421053,
  'f1': 0.9261744966442953,
  'number': 76},
 'CONDITIONAL': {'precision': 0.6666666666666666,
  'recall': 1.0,
  'f1': 0.8,
  'number': 2},
 'HYPOTHETICAL': {'precision': 1.0,
  'recall': 0.8181818181818182,
  'f1': 0.9,
  'number': 11},
 'POSSIBLE': {'precision': 0.6666666666666666,
  'recall': 0.4444444444444444,
  'f1': 0.5333333333333333,
  'number': 18},
 'PRESENT': {'precision': 0.7803030303030303,
  'recall': 0.8408163265306122,
  'f1': 0.8094302554027505,
  'number': 245},
 'TEST': {'precision': 0.9066666666666666,
  'recall': 0.85,
  'f1': 0.8774193548387097,
  'number': 320},
 'TREATMENT': {'precision': 0.8215613382899628,
  'recall': 0.8947368421052632,
  'f1': 0.8565891472868217,
  'number': 247},
 'overall_precision': 0.8462365591397849,
 'overall_recall': 0.8563656147986942,
 'overall_f1': 0.8512709572742022,
 'overall_accuracy': 0.9611747137879542}

In [11]:
# Persist the fine-tuned model under logs/ so it can be reloaded with from_pretrained.
trainer.save_model("logs/scibert_20_epochs_64_batch_95_train_split")

Saving model checkpoint to logs/scibert_20_epochs_64_batch_95_train_split
Configuration saved in logs/scibert_20_epochs_64_batch_95_train_split/config.json
Model weights saved in logs/scibert_20_epochs_64_batch_95_train_split/pytorch_model.bin
tokenizer config file saved in logs/scibert_20_epochs_64_batch_95_train_split/tokenizer_config.json
Special tokens file saved in logs/scibert_20_epochs_64_batch_95_train_split/special_tokens_map.json


# Reformat to the initial format

In [12]:
# Predict class ids for every token of the test split.
test_data = D["test"]
prediction, _, _ = trainer.predict(test_data)
prediction = prediction.argmax(axis=2)  # class scores -> predicted class ids

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: offset_mapping, row, filename, text.
***** Running Prediction *****
  Num examples = 14146
  Batch size = 64


In [13]:
# Strip everything up to the last "-" from each tag, keeping only its final part.
token_to_label = {tag: tag.rsplit("-", 1)[-1] for tag in label_list}
# Class id -> lower-cased final tag part.
token_id_to_label = {
    class_id: token_to_label[tag].lower() for class_id, tag in enumerate(label_list)
}
token_id_to_label

{0: 'o',
 1: 'test',
 2: 'test',
 3: 'treatment',
 4: 'treatment',
 5: 'present',
 6: 'present',
 7: 'absent',
 8: 'absent',
 9: 'possible',
 10: 'possible',
 11: 'conditional',
 12: 'conditional',
 13: 'hypothetical',
 14: 'hypothetical',
 15: 'associated_with_someone_else',
 16: 'associated_with_someone_else'}

In [14]:
# Sanity check on the test-set predictions that are about to be exported:
# the largest predicted class id must be a valid index into label_list.
# (`predictions`, plural, is the validation array from the evaluation cell above.)
prediction.max()

14

In [15]:
# Maps each label name to its concept: "test" and "treatment" map to themselves,
# every other label listed here maps to "problem".
ast_to_concept = {
    **{concept: concept for concept in ("test", "treatment")},
    **dict.fromkeys(
        (
            "present",
            "absent",
            "possible",
            "conditional",
            "hypothetical",
            "associated_with_someone_else",
        ),
        "problem",
    ),
}

In [16]:
from utils.save_predictions import save_predictions

# Export the test-set predictions (class ids per token) via the project helper.
save_predictions(test_data, prediction)

### Viz

In [32]:
# Wrap the fine-tuned model in a token-classification pipeline for ad-hoc inference.
# NOTE(review): device=0 presumably targets the first GPU — confirm one is available.
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)

In [34]:
# Smoke test on one training sentence; entities come back as "LABEL_<class id>".
effect_ner_model(D["train"][4]["text"])

[{'entity': 'LABEL_1',
  'score': 0.99998295,
  'index': 1,
  'word': 'her',
  'start': 0,
  'end': 3},
 {'entity': 'LABEL_2',
  'score': 0.99998784,
  'index': 2,
  'word': 'coagulation',
  'start': 4,
  'end': 15},
 {'entity': 'LABEL_2',
  'score': 0.9999907,
  'index': 3,
  'word': 'parameters',
  'start': 16,
  'end': 26},
 {'entity': 'LABEL_2',
  'score': 0.9999912,
  'index': 4,
  'word': 'were',
  'start': 27,
  'end': 31},
 {'entity': 'LABEL_2',
  'score': 0.99999094,
  'index': 5,
  'word': 'normal',
  'start': 32,
  'end': 38},
 {'entity': 'LABEL_2',
  'score': 0.9999906,
  'index': 6,
  'word': '.',
  'start': 39,
  'end': 40}]

In [39]:
from spacy import displacy
def visualize_entities(sentence):
    """Render the model's token-level predictions for ``sentence`` with displaCy.

    The sentence is run through ``effect_ner_model``; every token whose
    predicted class id is not 0 is annotated with the matching entry of
    ``label_list`` and drawn as an entity span. The rendering is displayed in
    the notebook (``jupyter=True``); nothing is returned.

    Args:
        sentence: raw text to tag and display.
    """
    entities = []
    for token in effect_ner_model(sentence):
        # The pipeline reports entities as "LABEL_<id>". Parse the whole numeric
        # suffix: taking only the last character would turn ids >= 10 into
        # 0-6 (e.g. "LABEL_10" -> 0, silently dropped as the outside class).
        label = int(token["entity"].rsplit("_", 1)[-1])
        if label != 0:
            token["label"] = label_list[label]
            entities.append(token)

    params = [{"text": sentence, "ents": entities, "title": None}]

    # NOTE(review): the colour keys must match entries of label_list exactly;
    # labels without an entry presumably fall back to displaCy's default colour.
    displacy.render(
        params,
        style="ent",
        manual=True,
        jupyter=True,
        options={
            "colors": {
                "B-PROBLEM": "#f08080",
                "I-PROBLEM": "#f08080",
                "B-TEST": "#9bddff",
                "I-TEST": "#9bddff",
                "B-TREATMENT": "#ffdab9",
                "I-TREATMENT": "#ffdab9",
            },
        },
    )


In [44]:
# Show 5 random sentences from the training split: the rendered model
# predictions followed by the annotations stored with the example.
for i in range(5):
    index = np.random.randint(0, len(D["train"]))
    example = D["train"][index]  # fetch the row once instead of once per field
    visualize_entities(example["text"])
    print(f"Text: {example['text']}")
    print(f"Tests: {example['test']}")
    print(f"Treatments: {example['treatment']}")
    print(f"Present: {example['present']}")
    print(f"{'*' * 50}\n")

Text: at osh , sputum cultures grew pseudomonas and mrsa , and he was restarted on zosyn ( 08-08 ) and vancomycin ( 08-09 ).
Tests: ['sputum cultures']
Treatments: ['vancomycin', 'zosyn']
Predent: ['mrsa', 'pseudomonas']
**************************************************



Text: the liver was 11 cm by percussion .
Tests: ['percussion']
Treatments: []
Predent: []
**************************************************



Text: hepatitis 35 years ago .
Tests: []
Treatments: []
Predent: ['hepatitis']
**************************************************



Text: Fluid , electrolytes and nutritions :
Tests: []
Treatments: []
Predent: []
**************************************************



Text: toprol xl 25 mg qday
Tests: []
Treatments: ['toprol xl']
Predent: []
**************************************************

