In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
from data_preprocessing import Get_and_process_data
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments, 
                          Trainer)

In [2]:

# Task tag; in the cells shown here it is only used to name the training output directory.
task = "ner" # Should be one of "ner", "pos" or "chunk"
# Alternative checkpoint (disabled):
# model_checkpoint = "allenai/scibert_scivocab_uncased"
# Hugging Face Hub id of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = "giacomomiolo/electramed_base_scivocab_1M"
# Per-device batch size, passed to TrainingArguments for both training and evaluation.
batch_size = 32
# Tokenizer matching the checkpoint; shared by the data loader, the collator, the Trainer
# and the inference pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
# Build the datasets with the project's loader. Judging by this cell's progress logs it reads
# the raw text, adds unlabeled lines and formats everything as Hugging Face datasets.
# NOTE(review): train_split=0.95 is presumably the train fraction (the run shows 15593 train /
# 821 validation examples) — confirm in data_preprocessing.
data_loader = Get_and_process_data(tokenizer, train_split=0.95, add_unlabeled=True)
# Dataset container indexed by split name; "train", "val" and "test" are used below.
D = data_loader.get_dataset()
# Label names indexed by label id (used as label_list[id] throughout the notebook).
label_list = data_loader.get_label_list()

Loading raw text: 100%|██████████| 170/170 [00:07<00:00, 23.46it/s]
Processing raw text: 170it [00:01, 156.88it/s]
Adding unlabeled lines: 8454it [00:00, 10414.01it/s]
Formatting dataset: 16525it [00:03, 5369.16it/s]
Using custom data configuration default-710d6deeeb2c3cc3


Downloading and preparing dataset json/default to /home/m5u9s00/.cache/huggingface/datasets/json/default-710d6deeeb2c3cc3/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/m5u9s00/.cache/huggingface/datasets/json/default-710d6deeeb2c3cc3/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15593 [00:00<?, ?ex/s]

  0%|          | 0/821 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/m5u9s00/.cache/huggingface/datasets/json/default-710d6deeeb2c3cc3/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-3d9ceab58bec6d3a.arrow
Loading raw text for test: 100%|██████████| 128/128 [00:11<00:00, 10.89it/s]
Formatting test data: 13617it [00:00, 21519.79it/s]
Using custom data configuration default-7d8433eb049b5d4a


Downloading and preparing dataset json/default to /home/m5u9s00/.cache/huggingface/datasets/json/default-7d8433eb049b5d4a/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/m5u9s00/.cache/huggingface/datasets/json/default-7d8433eb049b5d4a/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/13617 [00:00<?, ?ex/s]

In [4]:
# model = AutoModelForTokenClassification.from_pretrained("logs/model_60_warmup_epochs")

In [9]:
# Load the pretrained checkpoint as a token-classification model with one output per label.
# NOTE(review): no id2label/label2id is passed, so the config (and the pipeline further down)
# reports generic "LABEL_<id>" names instead of the entries of label_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

loading configuration file https://huggingface.co/giacomomiolo/electramed_base_scivocab_1M/resolve/main/config.json from cache at /home/m5u9s00/.cache/huggingface/transformers/c482a39d4ee91eb23933abaff8eba8ae535cd80563e3b66caa41934b37d52138.1c949c4a68e30225f35ee7db82195ffff60db502f0c9b37ed1cc866f8708614a
Model config ElectraConfig {
  "_name_or_path": "giacomomiolo/electramed_base_scivocab_1M",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16"
  },
  "i

### Training

In [10]:
# Name the run after the checkpoint (the part after the last "/" of the Hub id).
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"training_logs/{model_name}-finetuned-{task}",  # output directory of the run
    evaluation_strategy = "epoch",  # evaluate on the eval dataset after every epoch
    # learning_rate=1e-5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    weight_decay=0.05,
    logging_steps=5,  # log the training loss every 5 update steps

    warmup_ratio=0.1,  # share of the total training steps spent on learning-rate warmup
)
# Batch collator for token classification, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# def masking(l):
#     if l >= 5:
#         if l % 2 == 0:
#             return 6
#         else :
#             return 5
#     else :
#         return l
        
# seqeval metric; used below for the overall and per-entity precision/recall/F1 scores.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    ``p`` unpacks into ``(predictions, labels)``. The arg-max over axis 2 of
    ``predictions`` gives one label id per position. Positions whose gold label
    is ``-100`` are dropped, the remaining ids are converted to names through
    ``label_list`` and both sequences are handed to ``metric``.

    Returns:
        dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    predicted_tags, gold_tags = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (-100 marks ignored ones).
        kept = [(guess, gold) for guess, gold in zip(predicted_row, gold_row) if gold != -100]
        predicted_tags.append([label_list[guess] for guess, _ in kept])
        gold_tags.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=predicted_tags, references=gold_tags)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [11]:
# Assemble the Trainer: it fine-tunes `model` on the "train" split and evaluates on the "val"
# split with compute_metrics, following the schedule configured in `args`.
trainer = Trainer(
    model,
    args,
    train_dataset=D["train"],
    eval_dataset=D["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

In [12]:
# Run the fine-tuning; the returned TrainOutput (global step, training loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: associated with someone else_indices_start, treatment, conditional_indices_start, absent_indices_start, conditional_indices_end, hypothetical_indices_start, offset_mapping, test_indices_end, hypothetical, present_indices_end, text, possible_indices_start, absent, associated with someone else_indices_end, associated with someone else, possible, present_indices_start, test_indices_start, test, treatment_indices_end, possible_indices_end, conditional, row, absent_indices_end, present, treatment_indices_start, filename, hypothetical_indices_end.
***** Running training *****
  Num examples = 15593
  Num Epochs = 15
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3660


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5158,0.47472,0.556845,0.583942,0.570071,0.882645
2,0.2414,0.229068,0.703125,0.766423,0.733411,0.938635
3,0.1771,0.202665,0.739679,0.784672,0.761511,0.942793
4,0.1408,0.185424,0.77278,0.815085,0.793369,0.949488
5,0.0754,0.182688,0.796253,0.827251,0.811456,0.953342
6,0.0975,0.180355,0.786866,0.8309,0.808284,0.954559
7,0.0473,0.192984,0.786127,0.827251,0.806165,0.952936
8,0.076,0.201601,0.801854,0.841849,0.821365,0.953951
9,0.0482,0.204951,0.821262,0.855231,0.837902,0.956385
10,0.0596,0.199271,0.804907,0.8382,0.821216,0.955776


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: associated with someone else_indices_start, treatment, conditional_indices_start, absent_indices_start, conditional_indices_end, hypothetical_indices_start, offset_mapping, test_indices_end, hypothetical, present_indices_end, text, possible_indices_start, absent, associated with someone else_indices_end, associated with someone else, possible, present_indices_start, test_indices_start, test, treatment_indices_end, possible_indices_end, conditional, row, absent_indices_end, present, treatment_indices_start, filename, hypothetical_indices_end.
***** Running Evaluation *****
  Num examples = 821
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: associated with someone else_indices_start, treatment, conditional_indices_start, absent

TrainOutput(global_step=3660, training_loss=0.17354840086416795, metrics={'train_runtime': 1695.2192, 'train_samples_per_second': 137.973, 'train_steps_per_second': 2.159, 'total_flos': 7518366901071588.0, 'train_loss': 0.17354840086416795, 'epoch': 15.0})

In [None]:
# trainer.model = AutoModelForTokenClassification.from_pretrained("./training_logs/electramed_base_scivocab_1M-finetuned-ner/checkpoint-2500", local_files_only=True)
# trainer.model.to("cuda")

### Performance on validation

In [37]:
# Score the fine-tuned model on the validation split and keep seqeval's full report
# (per-entity scores as well as the overall ones).
predictions, labels, _ = trainer.predict(D["val"])
# Most likely label id at every position.
predictions = np.argmax(predictions, axis=-1)

# Positions whose gold label is -100 are ignored (special tokens); all other
# positions are mapped from label id to label name.
true_predictions = [
    [label_list[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_list[gold_id] for gold_id in gold_row if gold_id != -100]
    for gold_row in labels
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: associated with someone else_indices_start, treatment, conditional_indices_start, absent_indices_start, conditional_indices_end, hypothetical_indices_start, offset_mapping, test_indices_end, hypothetical, present_indices_end, text, possible_indices_start, absent, associated with someone else_indices_end, associated with someone else, possible, present_indices_start, test_indices_start, test, treatment_indices_end, possible_indices_end, conditional, row, absent_indices_end, present, treatment_indices_start, filename, hypothetical_indices_end.
***** Running Prediction *****
  Num examples = 821
  Batch size = 64


{'ABSENT': {'precision': 0.819672131147541,
  'recall': 0.8620689655172413,
  'f1': 0.8403361344537814,
  'number': 58},
 'CONDITIONAL': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3},
 'HYPOTHETICAL': {'precision': 0.625,
  'recall': 1.0,
  'f1': 0.7692307692307693,
  'number': 5},
 'POSSIBLE': {'precision': 0.6451612903225806,
  'recall': 0.8,
  'f1': 0.7142857142857142,
  'number': 25},
 'PRESENT': {'precision': 0.8294573643410853,
  'recall': 0.8492063492063492,
  'f1': 0.8392156862745098,
  'number': 252},
 'TEST': {'precision': 0.8084291187739464,
  'recall': 0.8791666666666667,
  'f1': 0.8423153692614771,
  'number': 240},
 'TREATMENT': {'precision': 0.8367346938775511,
  'recall': 0.8577405857740585,
  'f1': 0.8471074380165289,
  'number': 239},
 'overall_precision': 0.815028901734104,
 'overall_recall': 0.8576642335766423,
 'overall_f1': 0.8358032009484292,
 'overall_accuracy': 0.9560807384116036}

In [38]:
# Persist the fine-tuned model; per the log below, the config, weights and tokenizer files
# are all written to this directory.
trainer.save_model("logs/electramed_15_epochs")

Saving model checkpoint to logs/electramed_15_epochs
Configuration saved in logs/electramed_15_epochs/config.json
Model weights saved in logs/electramed_15_epochs/pytorch_model.bin
tokenizer config file saved in logs/electramed_15_epochs/tokenizer_config.json
Special tokens file saved in logs/electramed_15_epochs/special_tokens_map.json


# Reformat to the initial format

In [39]:
# prediction = np.load("../data/prediction.npy")
# print("min :", min(prediction.reshape(-1)))
# print("max :", max(prediction.reshape(-1)))
# prediction = predictions_2
# Predict on the test split and keep only the arg-max label id per position;
# the other two values returned by predict() are not used.
test_data = D["test"]
prediction, _, _ = trainer.predict(test_data)
prediction = np.argmax(prediction, axis=-1)

The following columns in the test set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: offset_mapping, filename, row, text.
***** Running Prediction *****
  Num examples = 13617
  Batch size = 64


In [40]:
# Strip the tagging prefix: keep what follows the last "-" of each label name.
token_to_label = {name: name.rsplit("-", 1)[-1] for name in label_list}
# Lower-cased class name for every label id.
token_id_to_label = dict(enumerate(token_to_label[name].lower() for name in label_list))
token_id_to_label

{0: 'o',
 1: 'test',
 2: 'test',
 3: 'treatment',
 4: 'treatment',
 5: 'present',
 6: 'present',
 7: 'absent',
 8: 'absent',
 9: 'possible',
 10: 'possible',
 11: 'conditional',
 12: 'conditional',
 13: 'hypothetical',
 14: 'hypothetical',
 15: 'associated with someone else',
 16: 'associated with someone else'}

In [41]:
# Class name -> concept type: "test" and "treatment" map to themselves, every
# assertion status maps to "problem".
ast_to_concept = {
    "test": "test",
    "treatment": "treatment",
    **dict.fromkeys(
        (
            "present",
            "absent",
            "possible",
            "conditional",
            "hypothetical",
            "associated with someone else",
        ),
        "problem",
    ),
}

In [42]:
from utils.save_predictions import save_predictions

# Hand the test split and its predicted label ids to the project helper.
# NOTE(review): output location and format are defined in utils.save_predictions, which is
# not visible here — presumably the original annotation format, per the section title.
save_predictions(test_data, prediction)

### Visualization

In [32]:
# Wrap the fine-tuned model and tokenizer in a "ner" (token-classification) pipeline for the
# visual checks below. NOTE(review): device=0 presumably targets the first GPU — confirm for
# CPU-only environments.
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)

In [34]:
# Sanity check: tag one training sentence and display the raw per-token pipeline output.
effect_ner_model(D["train"][4]["text"])

[{'entity': 'LABEL_1',
  'score': 0.99998295,
  'index': 1,
  'word': 'her',
  'start': 0,
  'end': 3},
 {'entity': 'LABEL_2',
  'score': 0.99998784,
  'index': 2,
  'word': 'coagulation',
  'start': 4,
  'end': 15},
 {'entity': 'LABEL_2',
  'score': 0.9999907,
  'index': 3,
  'word': 'parameters',
  'start': 16,
  'end': 26},
 {'entity': 'LABEL_2',
  'score': 0.9999912,
  'index': 4,
  'word': 'were',
  'start': 27,
  'end': 31},
 {'entity': 'LABEL_2',
  'score': 0.99999094,
  'index': 5,
  'word': 'normal',
  'start': 32,
  'end': 38},
 {'entity': 'LABEL_2',
  'score': 0.9999906,
  'index': 6,
  'word': '.',
  'start': 39,
  'end': 40}]

In [39]:
from spacy import displacy
def visualize_entities(sentence):
    """Render the entities the NER pipeline finds in ``sentence`` with displaCy.

    Runs ``effect_ner_model`` on ``sentence``, skips every token predicted as
    label id 0 and shows the remaining tokens as displaCy entity spans labelled
    with the matching ``label_list`` entry. Nothing is returned; the markup is
    displayed in the notebook (``jupyter=True``).

    Args:
        sentence: Raw text to tag and display.
    """
    entities = []
    for token in effect_ner_model(sentence):
        # The pipeline reports classes as "LABEL_<id>". Parse the whole numeric
        # suffix: reading only the last character would turn LABEL_10 into 0
        # (entity silently dropped) and LABEL_16 into 6 (wrong label).
        label_id = int(token["entity"].rsplit("_", 1)[-1])
        if label_id != 0:
            # Copy the token so the pipeline's own output is left untouched.
            entities.append({**token, "label": label_list[label_id]})

    params = [{"text": sentence, "ents": entities, "title": None}]

    # NOTE(review): label_list appears to hold assertion classes (present, absent, ...)
    # rather than PROBLEM, so the *-PROBLEM colors below likely never match —
    # confirm and extend the palette if needed.
    displacy.render(
        params,
        style="ent",
        manual=True,
        jupyter=True,
        options={
            "colors": {
                "B-PROBLEM": "#f08080",
                "I-PROBLEM": "#f08080",
                "B-TEST": "#9bddff",
                "I-TEST": "#9bddff",
                "B-TREATMENT": "#ffdab9",
                "I-TREATMENT": "#ffdab9",
            },
        },
    )


In [44]:
# Show 5 randomly drawn sentences from the training split: the model's rendered
# predictions first, then the gold annotations stored with the example.
for i in range(5):
    index = np.random.randint(0, len(D["train"]))
    example = D["train"][index]
    visualize_entities(example["text"])
    print(f"Text: {example['text']}")
    # Problems are stored per assertion class (present, absent, ...); only the
    # "present" ones are printed here.
    print(f"Tests: {example['test']}")
    print(f"Treatments: {example['treatment']}")
    print(f"Predent: {example['present']}")
    print(f"{'*' * 50}\n")

Text: at osh , sputum cultures grew pseudomonas and mrsa , and he was restarted on zosyn ( 08-08 ) and vancomycin ( 08-09 ).
Tests: ['sputum cultures']
Treatments: ['vancomycin', 'zosyn']
Predent: ['mrsa', 'pseudomonas']
**************************************************



Text: the liver was 11 cm by percussion .
Tests: ['percussion']
Treatments: []
Predent: []
**************************************************



Text: hepatitis 35 years ago .
Tests: []
Treatments: []
Predent: ['hepatitis']
**************************************************



Text: Fluid , electrolytes and nutritions :
Tests: []
Treatments: []
Predent: []
**************************************************



Text: toprol xl 25 mg qday
Tests: []
Treatments: ['toprol xl']
Predent: []
**************************************************

