In [2]:
import os
# Capture the current working directory through the IPython `%pwd` magic.
path = %pwd
# If the kernel was started inside a folder named `notebooks`, move one level
# up so that relative paths and local imports used by later cells resolve
# from the parent folder (presumably the project root — confirm).
if path.split(os.sep)[-1] == 'notebooks':
    %cd ..

In [3]:
%reload_ext autoreload
%autoreload 2

import numpy as np
from data_preprocessing import Get_and_process_data
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments, 
                          Trainer)

In [4]:

# Run configuration. In the visible cells `task` is only used to name the
# training output directory.
task = "ner" # Should be one of "ner", "pos" or "chunk"
# Hub checkpoint used for both the tokenizer and the model weights.
model_checkpoint = "allenai/scibert_scivocab_uncased"
# Alternative checkpoint kept for reference:
# model_checkpoint = "giacomomiolo/electramed_base_scivocab_1M"
# Used as both the per-device train and eval batch size in TrainingArguments.
batch_size = 64
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [28]:
# Build the data with the project-local helper `Get_and_process_data`.
# NOTE(review): the meaning of `train_split` and `add_unlabeled` is defined in
# data_preprocessing, which is not visible here — confirm before changing them.
data_loader = Get_and_process_data(tokenizer, train_split=0.99, add_unlabeled=True)
# `D` is indexed with "train", "val" and "test" by the cells below.
D = data_loader.get_dataset()
# Label names indexed by integer class id; its length sets the model's num_labels.
label_list = data_loader.get_label_list()

Loading raw text: 100%|██████████| 170/170 [00:05<00:00, 32.07it/s]
Processing raw text: 170it [00:00, 205.25it/s]
Adding unlabeled lines: 8454it [00:00, 9591.17it/s] 
Formatting dataset: 16525it [00:02, 5901.93it/s]
Using custom data configuration default-72f20bdf7ea795cf


Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-72f20bdf7ea795cf/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-72f20bdf7ea795cf/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/16249 [00:00<?, ?ex/s]

  0%|          | 0/165 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/json/default-72f20bdf7ea795cf/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-16858c9a1d555ced.arrow
Loading raw text for test: 100%|██████████| 128/128 [00:08<00:00, 14.28it/s]
Formatting test data: 14146it [00:00, 19796.17it/s]
Using custom data configuration default-9f8c2ce736b7b09f


Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-9f8c2ce736b7b09f/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-9f8c2ce736b7b09f/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/14146 [00:00<?, ?ex/s]

In [29]:
# model = AutoModelForTokenClassification.from_pretrained("logs/model_60_warmup_epochs")

In [30]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per entry of `label_list`.
# NOTE(review): only `num_labels` is passed, so the model config carries the
# generic names LABEL_0..LABEL_<n-1> (see the config dump below); the
# visualization code further down relies on that naming.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

loading configuration file https://huggingface.co/allenai/scibert_scivocab_uncased/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/858852fd2471ce39075378592ddc87f5a6551e64c6825d1b92c8dab9318e0fc3.03ff9e9f998b9a9d40647a2148a202e3fb3d568dc0f170dda9dda194bab4d5dd
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_

### Training

In [31]:
# Name the output directory after the checkpoint's short name and the task.
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"training_logs/{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",  # the training log below reports one evaluation row per epoch
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.05,
    logging_steps=5,

    warmup_ratio=0.1,
)
# Collator passed to the Trainer below to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)


# seqeval metric object; `compute_metrics` and the validation cell call
# `metric.compute` on it.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Convert raw evaluation output into overall sequence-labelling scores.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to obtain one class id per position; ``labels`` holds the
            reference class ids, where ``-100`` marks positions to skip.

    Returns:
        dict: ``precision``, ``recall``, ``f1`` and ``accuracy``, copied from
        the ``overall_*`` entries returned by ``metric.compute``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions whose reference label is not the ignore index.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # (reported key, key in the metric result), in the order they are reported.
    key_map = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {out_key: results[src_key] for out_key, src_key in key_map}

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [32]:
# Bundle the model, training arguments, train/validation splits, collator,
# tokenizer and metric function into a single Trainer.
trainer = Trainer(
    model,
    args,
    train_dataset=D["train"],
    eval_dataset=D["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

In [33]:
# Fine-tune the model; checkpoints are written under the `training_logs/...`
# directory configured in `args` (see the log below).
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: hypothetical_indices_end, test_indices_end, absent_indices_end, hypothetical_indices_start, associated_with_someone_else_indices_end, text, filename, test_indices_start, present, associated_with_someone_else_indices_start, treatment_indices_end, treatment_indices_start, possible_indices_start, test, hypothetical, absent_indices_start, present_indices_start, absent, row, conditional_indices_start, associated_with_someone_else, treatment, present_indices_end, possible_indices_end, conditional, conditional_indices_end, possible, offset_mapping.
***** Running training *****
  Num examples = 16249
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 5080


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5095,0.434549,0.512953,0.55,0.530831,0.864234
2,0.2319,0.226715,0.733333,0.733333,0.733333,0.924574
3,0.1694,0.182616,0.760638,0.794444,0.777174,0.943066
4,0.1153,0.18448,0.805405,0.827778,0.816438,0.947932
5,0.1009,0.181408,0.828877,0.861111,0.844687,0.950852
6,0.0723,0.190531,0.798942,0.838889,0.818428,0.945985
7,0.0451,0.194995,0.861111,0.861111,0.861111,0.953771
8,0.0702,0.21588,0.836158,0.822222,0.829132,0.952311
9,0.0266,0.211023,0.838889,0.838889,0.838889,0.952798
10,0.0393,0.230061,0.844444,0.844444,0.844444,0.952311


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: hypothetical_indices_end, test_indices_end, absent_indices_end, hypothetical_indices_start, associated_with_someone_else_indices_end, text, filename, test_indices_start, present, associated_with_someone_else_indices_start, treatment_indices_end, treatment_indices_start, possible_indices_start, test, hypothetical, absent_indices_start, present_indices_start, absent, row, conditional_indices_start, associated_with_someone_else, treatment, present_indices_end, possible_indices_end, conditional, conditional_indices_end, possible, offset_mapping.
***** Running Evaluation *****
  Num examples = 165
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to training_logs/scibert_scivocab_uncased-finetuned-ner/checkpoint-500
Configuration saved in training_logs/scibert_scivocab_uncased-finetuned-ner/checkpoint-5

TrainOutput(global_step=5080, training_loss=0.11468148054775056, metrics={'train_runtime': 1254.063, 'train_samples_per_second': 259.142, 'train_steps_per_second': 4.051, 'total_flos': 1.0417602575221668e+16, 'train_loss': 0.11468148054775056, 'epoch': 20.0})

In [34]:
# trainer.model = AutoModelForTokenClassification.from_pretrained("./training_logs/electramed_base_scivocab_1M-finetuned-ner/checkpoint-2500", local_files_only=True)
# trainer.model.to("cuda")

### Performance on validation

In [35]:
# Run the trained model on the validation split and reduce the raw scores to
# one class id per position (arg-max over axis 2).
predictions, labels, _ = trainer.predict(D["val"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Unlike `compute_metrics`, keep the whole result dict so the per-entity-type
# scores are displayed alongside the overall_* values.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: hypothetical_indices_end, test_indices_end, absent_indices_end, hypothetical_indices_start, associated_with_someone_else_indices_end, text, filename, test_indices_start, present, associated_with_someone_else_indices_start, treatment_indices_end, treatment_indices_start, possible_indices_start, test, hypothetical, absent_indices_start, present_indices_start, absent, row, conditional_indices_start, associated_with_someone_else, treatment, present_indices_end, possible_indices_end, conditional, conditional_indices_end, possible, offset_mapping.
***** Running Prediction *****
  Num examples = 165
  Batch size = 64


{'ABSENT': {'precision': 0.8571428571428571,
  'recall': 0.9,
  'f1': 0.8780487804878048,
  'number': 20},
 'CONDITIONAL': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'POSSIBLE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'PRESENT': {'precision': 0.8679245283018868,
  'recall': 0.7419354838709677,
  'f1': 0.8,
  'number': 62},
 'TEST': {'precision': 0.9473684210526315,
  'recall': 0.9642857142857143,
  'f1': 0.9557522123893805,
  'number': 56},
 'TREATMENT': {'precision': 0.8222222222222222,
  'recall': 0.9487179487179487,
  'f1': 0.8809523809523809,
  'number': 39},
 'overall_precision': 0.88268156424581,
 'overall_recall': 0.8777777777777778,
 'overall_f1': 0.8802228412256267,
 'overall_accuracy': 0.9547445255474453}

In [36]:
# Persist the fine-tuned model to disk; the log below shows that the config,
# weights and tokenizer files are written to this directory.
trainer.save_model("logs/scibert_20_epochs_64_batch_99_train_split")

Saving model checkpoint to logs/scibert_20_epochs_64_batch_99_train_split
Configuration saved in logs/scibert_20_epochs_64_batch_99_train_split/config.json
Model weights saved in logs/scibert_20_epochs_64_batch_99_train_split/pytorch_model.bin
tokenizer config file saved in logs/scibert_20_epochs_64_batch_99_train_split/tokenizer_config.json
Special tokens file saved in logs/scibert_20_epochs_64_batch_99_train_split/special_tokens_map.json


# Reformat to the initial format

In [37]:
# Earlier experiments, kept for reference (loading saved predictions from disk):
# prediction = np.load("../data/prediction.npy")
# print("min :", min(prediction.reshape(-1)))
# print("max :", max(prediction.reshape(-1)))
# prediction = predictions_2
# Predict on the test split and keep one class id per position (arg-max over
# axis 2). The other two values returned by `predict` are discarded.
test_data = D["test"]
prediction, _, _ = trainer.predict(D["test"])
prediction = np.argmax(prediction, axis=2)

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text, row, offset_mapping, filename.
***** Running Prediction *****
  Num examples = 14146
  Batch size = 64


In [38]:
label_list  # bare expression: not displayed, since it is not the cell's last line
# Drop everything up to the last "-" of each label name (names without a "-"
# are kept whole).
token_to_label = {token: token.split("-")[-1] for token in label_list}
# Class id -> lower-cased stripped label name; as the output below shows,
# consecutive ids share a name.
token_id_to_label = {i: token_to_label[token].lower() for i, token in enumerate(label_list)}
token_id_to_label

{0: 'o',
 1: 'test',
 2: 'test',
 3: 'treatment',
 4: 'treatment',
 5: 'present',
 6: 'present',
 7: 'absent',
 8: 'absent',
 9: 'possible',
 10: 'possible',
 11: 'conditional',
 12: 'conditional',
 13: 'hypothetical',
 14: 'hypothetical',
 15: 'associated_with_someone_else',
 16: 'associated_with_someone_else'}

In [39]:
# Largest class id that was predicted.
# NOTE(review): this reads `predictions` (validation split, computed in the
# "validation" section), not `prediction` (test split, computed just above) —
# confirm which one this sanity check is meant to inspect.
max(predictions.reshape(-1))

12

In [40]:
# Maps each lower-cased label name to a coarser concept: "test" and
# "treatment" map to themselves, every other label maps to "problem".
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
ast_to_concept = {
    "test" : "test",
    "treatment" : "treatment",
    "present" : "problem",
    "absent" : "problem",
    "possible" : "problem",
    "conditional" : "problem",
    "hypothetical" : "problem",
    "associated_with_someone_else" : "problem"
}

In [41]:
from utils.save_predictions import save_predictions

# Hand the test split and its predicted class ids to the project helper.
# NOTE(review): where and in what format the output is written is defined in
# utils.save_predictions, which is not visible here.
save_predictions(test_data, prediction)

### Visualization

In [21]:
# Wrap the fine-tuned model and tokenizer in an inference pipeline.
# NOTE(review): `device=0` presumably pins inference to the first GPU, so this
# cell would need adjusting on a CPU-only machine — confirm.
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)

In [34]:
# Smoke test on one training sentence; each returned item carries an `entity`
# tag of the form LABEL_<id>, a score, and character offsets (see output).
effect_ner_model(D["train"][4]["text"])

[{'entity': 'LABEL_1',
  'score': 0.99998295,
  'index': 1,
  'word': 'her',
  'start': 0,
  'end': 3},
 {'entity': 'LABEL_2',
  'score': 0.99998784,
  'index': 2,
  'word': 'coagulation',
  'start': 4,
  'end': 15},
 {'entity': 'LABEL_2',
  'score': 0.9999907,
  'index': 3,
  'word': 'parameters',
  'start': 16,
  'end': 26},
 {'entity': 'LABEL_2',
  'score': 0.9999912,
  'index': 4,
  'word': 'were',
  'start': 27,
  'end': 31},
 {'entity': 'LABEL_2',
  'score': 0.99999094,
  'index': 5,
  'word': 'normal',
  'start': 32,
  'end': 38},
 {'entity': 'LABEL_2',
  'score': 0.9999906,
  'index': 6,
  'word': '.',
  'start': 39,
  'end': 40}]

In [39]:
from spacy import displacy
def visualize_entities(sentence):
    """Render the NER pipeline's predictions for ``sentence`` with displaCy.

    Runs ``effect_ner_model`` on the sentence, converts each predicted
    ``LABEL_<id>`` tag to the matching name in ``label_list``, drops tokens
    predicted as class 0, and renders the rest in displaCy's manual entity
    mode.

    Args:
        sentence: Raw text to tag and display.

    Returns:
        None. The rendering is displayed in the notebook.
    """
    tokens = effect_ner_model(sentence)
    entities = []

    for token in tokens:
        # Tags look like "LABEL_<id>". Parse the whole numeric suffix: reading
        # only the last character would turn "LABEL_10" into 0 (silently
        # dropped) and "LABEL_12" into 2 (wrong label).
        label = int(token["entity"].rsplit("_", 1)[-1])
        if label != 0:
            token["label"] = label_list[label]
            entities.append(token)

    params = [{"text": sentence, "ents": entities, "title": None}]

    # NOTE(review): the label names printed earlier in this notebook contain no
    # PROBLEM class, so the two PROBLEM colours below likely never match —
    # confirm and add colours for the other labels if desired.
    displacy.render(
        params,
        style="ent",
        manual=True,
        jupyter=True,
        options={
            "colors": {
                "B-PROBLEM": "#f08080",
                "I-PROBLEM": "#f08080",
                "B-TEST": "#9bddff",
                "I-TEST": "#9bddff",
                "B-TREATMENT": "#ffdab9",
                "I-TREATMENT": "#ffdab9",
            },
        },
    )


In [44]:
# Pick 5 random sentences from the training split, render the model's
# predictions for each, and print the values stored in the dataset columns
# next to them.
for i in range(5):
    index = np.random.randint(0, len(D["train"]))
    # Fetch the row once instead of re-indexing the dataset for every field.
    sample = D["train"][index]
    visualize_entities(sample["text"])
    print(f"Text: {sample['text']}")
    print(f"Tests: {sample['test']}")
    print(f"Treatments: {sample['treatment']}")
    print(f"Present: {sample['present']}")
    print(f"{'*' * 50}\n")

Text: at osh , sputum cultures grew pseudomonas and mrsa , and he was restarted on zosyn ( 08-08 ) and vancomycin ( 08-09 ).
Tests: ['sputum cultures']
Treatments: ['vancomycin', 'zosyn']
Predent: ['mrsa', 'pseudomonas']
**************************************************



Text: the liver was 11 cm by percussion .
Tests: ['percussion']
Treatments: []
Predent: []
**************************************************



Text: hepatitis 35 years ago .
Tests: []
Treatments: []
Predent: ['hepatitis']
**************************************************



Text: Fluid , electrolytes and nutritions :
Tests: []
Treatments: []
Predent: []
**************************************************



Text: toprol xl 25 mg qday
Tests: []
Treatments: ['toprol xl']
Predent: []
**************************************************

