In [39]:
%reload_ext autoreload
%autoreload 2

import numpy as np
from data_preprocessing import Get_and_process_data
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments, 
                          Trainer)

In [40]:

task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "allenai/scibert_scivocab_uncased"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [195]:
data_loader = Get_and_process_data(tokenizer, train_split=0.95, add_unlabeled=False)
D = data_loader.get_dataset()
label_list = data_loader.get_label_list()

Loading raw text: 100%|██████████| 170/170 [00:09<00:00, 18.73it/s]
Processing raw text: 170it [00:00, 192.46it/s]
Adding unlabeled lines: 8454it [00:01, 4922.55it/s]
Formatting dataset: 16525it [00:02, 7264.53it/s]
Using custom data configuration default-468b556b8c574403


Downloading and preparing dataset json/default to /home/mus5900/.cache/huggingface/datasets/json/default-468b556b8c574403/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/mus5900/.cache/huggingface/datasets/json/default-468b556b8c574403/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15593 [00:00<?, ?ex/s]

  0%|          | 0/821 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/mus5900/.cache/huggingface/datasets/json/default-468b556b8c574403/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-5814df5346ec4747.arrow
Loading raw text for test: 100%|██████████| 128/128 [00:09<00:00, 13.02it/s]
Formatting test data: 13617it [00:00, 19259.50it/s]
Using custom data configuration default-f5596ef569de00d0


Downloading and preparing dataset json/default to /home/mus5900/.cache/huggingface/datasets/json/default-f5596ef569de00d0/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/mus5900/.cache/huggingface/datasets/json/default-f5596ef569de00d0/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/13617 [00:00<?, ?ex/s]

In [24]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

loading configuration file https://huggingface.co/allenai/scibert_scivocab_uncased/resolve/main/config.json from cache at /home/m5u9s00/.cache/huggingface/transformers/858852fd2471ce39075378592ddc87f5a6551e64c6825d1b92c8dab9318e0fc3.03ff9e9f998b9a9d40647a2148a202e3fb3d568dc0f170dda9dda194bab4d5dd
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL

In [25]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"training_logs/{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    # learning_rate=1e-5,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.05,
    logging_steps=5,

    warmup_ratio=0.1,
)
data_collator = DataCollatorForTokenClassification(tokenizer)

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
trainer = Trainer(
    model,
    args,
    train_dataset=D["train"],
    eval_dataset=D["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

In [27]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: absent_indices_end, hypothetical_indices_start, possible, hypothetical, offset_mapping, conditional_indices_start, associated with someone else_indices_end, test, present_indices_end, present_indices_start, test_indices_start, associated with someone else, absent_indices_start, treatment_indices_start, possible_indices_end, hypothetical_indices_end, text, treatment_indices_end, associated with someone else_indices_start, conditional_indices_end, possible_indices_start, treatment, conditional, present, absent, test_indices_end.
***** Running training *****
  Num examples = 15593
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9760


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6006,0.684513,0.269297,0.300191,0.283906,0.793004
2,0.3685,0.410699,0.449927,0.592734,0.511551,0.870241
3,0.3042,0.339691,0.536474,0.674952,0.597798,0.894957
4,0.2356,0.277114,0.605307,0.697897,0.648313,0.912896
5,0.0892,0.295672,0.648148,0.736138,0.689346,0.913992
6,0.0586,0.326391,0.633224,0.736138,0.680813,0.91459
7,0.0325,0.326181,0.666667,0.760994,0.710714,0.924357
8,0.0206,0.35294,0.673469,0.75717,0.712871,0.922962
9,0.0742,0.378488,0.708042,0.774379,0.739726,0.926251
10,0.0591,0.393227,0.684838,0.768642,0.724324,0.925653


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: absent_indices_end, hypothetical_indices_start, possible, hypothetical, offset_mapping, conditional_indices_start, associated with someone else_indices_end, test, present_indices_end, present_indices_start, test_indices_start, associated with someone else, absent_indices_start, treatment_indices_start, possible_indices_end, hypothetical_indices_end, text, treatment_indices_end, associated with someone else_indices_start, conditional_indices_end, possible_indices_start, treatment, conditional, present, absent, test_indices_end.
***** Running Evaluation *****
  Num examples = 821
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to training_logs/scibert_scivocab_uncased-finetuned-ner/checkpoint-500
Configuration saved in training_logs/scibert_scivocab_uncased-finetuned-ner/checkpoint-500/config.json


TrainOutput(global_step=9760, training_loss=0.15571206227638168, metrics={'train_runtime': 2881.2935, 'train_samples_per_second': 108.236, 'train_steps_per_second': 3.387, 'total_flos': 8168461354174668.0, 'train_loss': 0.15571206227638168, 'epoch': 20.0})

In [28]:
predictions, labels, _ = trainer.predict(D["val"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: absent_indices_end, hypothetical_indices_start, possible, hypothetical, offset_mapping, conditional_indices_start, associated with someone else_indices_end, test, present_indices_end, present_indices_start, test_indices_start, associated with someone else, absent_indices_start, treatment_indices_start, possible_indices_end, hypothetical_indices_end, text, treatment_indices_end, associated with someone else_indices_start, conditional_indices_end, possible_indices_start, treatment, conditional, present, absent, test_indices_end.
***** Running Prediction *****
  Num examples = 821
  Batch size = 32


{'ABSENT': {'precision': 0.7391304347826086,
  'recall': 0.8225806451612904,
  'f1': 0.7786259541984734,
  'number': 62},
 'CONDITIONAL': {'precision': 0.14285714285714285,
  'recall': 0.25,
  'f1': 0.18181818181818182,
  'number': 4},
 'HYPOTHETICAL': {'precision': 0.4,
  'recall': 0.5,
  'f1': 0.4444444444444445,
  'number': 8},
 'POSSIBLE': {'precision': 0.375,
  'recall': 0.3333333333333333,
  'f1': 0.35294117647058826,
  'number': 9},
 'PRESENT': {'precision': 0.6534090909090909,
  'recall': 0.7467532467532467,
  'f1': 0.696969696969697,
  'number': 154},
 'TEST': {'precision': 0.7111111111111111,
  'recall': 0.7804878048780488,
  'f1': 0.7441860465116279,
  'number': 123},
 'TREATMENT': {'precision': 0.7965116279069767,
  'recall': 0.8404907975460123,
  'f1': 0.8179104477611939,
  'number': 163},
 'overall_precision': 0.7053726169844021,
 'overall_recall': 0.7782026768642447,
 'overall_f1': 0.7400000000000001,
 'overall_accuracy': 0.9248554913294798}

In [29]:
predictions_2, labels_2, _ = trainer.predict(D["test"])
predictions_2 = np.argmax(predictions_2, axis=2)

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: absent_indices_end, hypothetical_indices_start, possible, hypothetical, offset_mapping, conditional_indices_start, associated with someone else_indices_end, test, present_indices_end, present_indices_start, test_indices_start, associated with someone else, absent_indices_start, treatment_indices_start, possible_indices_end, hypothetical_indices_end, text, treatment_indices_end, associated with someone else_indices_start, conditional_indices_end, possible_indices_start, treatment, conditional, present, absent, test_indices_end.
***** Running Prediction *****
  Num examples = 16414
  Batch size = 32


In [30]:
x= 0
u = 0
for i in range(x, x+16000):
    if sum(predictions_2[i])>0:
        u+=1
        # print("i = ", i, end="\t -> ")
        # print(sum(predictions_2[i]))
print("u = ", u)

# 7510

u =  7481


In [61]:
predictions_2[194]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       0, 4, 0, 4, 4, 4, 4, 4, 0, 0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
# model.save_pretrained("logs/model_50_warmup_epochs")

Configuration saved in logs/model_50_warmup_epochs/config.json
Model weights saved in logs/model_50_warmup_epochs/pytorch_model.bin


# Reformat to the initial format

In [146]:
prediction = np.load("../data/prediction.npy")
print("min :", min(prediction.reshape(-1)))
print("max :", max(prediction.reshape(-1)))

min : 0
max : 14


In [166]:
label_list
token_to_label = {token: token.split("-")[-1] for token in label_list}
token_id_to_label = {i: token_to_label[token].lower() for i, token in enumerate(label_list)}
token_id_to_label

{0: 'o',
 1: 'test',
 2: 'test',
 3: 'treatment',
 4: 'treatment',
 5: 'present',
 6: 'present',
 7: 'absent',
 8: 'absent',
 9: 'possible',
 10: 'possible',
 11: 'conditional',
 12: 'conditional',
 13: 'hypothetical',
 14: 'hypothetical',
 15: 'associated with someone else',
 16: 'associated with someone else'}

In [169]:
ast_to_concept = {
    "test" : "test",
    "treatment" : "treatment",
    "present" : "problem",
    "absent" : "problem",
    "possible" : "problem",
    "conditional" : "problem",
    "hypothetical" : "problem",
    "associated with someone else" : "problem"
}

In [198]:
test_data = D["test"]
test_data

Dataset({
    features: ['text', 'row', 'filename', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 13617
})

In [149]:
for i,a in enumerate(D["train"]):
        print(a)
        print("len of labels :", len(a["labels"]))
        print("len of offset_mapping :", len(a["offset_mapping"]))
        print("len of input_ids :", len(a["input_ids"]))
        # print(prediction[i])
        break

{'text': 'the ventricles and sulci are mildly prominent , but stable in size and appearance .', 'row': 151, 'test': [], 'test_indices_start': [], 'test_indices_end': [], 'treatment': [], 'treatment_indices_start': [], 'treatment_indices_end': [], 'present': ['mildly prominent'], 'present_indices_start': [29], 'present_indices_end': [45], 'absent': [], 'absent_indices_start': [], 'absent_indices_end': [], 'possible': [], 'possible_indices_start': [], 'possible_indices_end': [], 'conditional': [], 'conditional_indices_start': [], 'conditional_indices_end': [], 'hypothetical': [], 'hypothetical_indices_start': [], 'hypothetical_indices_end': [], 'associated with someone else': [], 'associated with someone else_indices_start': [], 'associated with someone else_indices_end': [], 'input_ids': [102, 111, 15077, 30113, 137, 9271, 1644, 220, 28964, 9295, 422, 563, 3229, 121, 1243, 137, 6540, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask'

In [213]:
# from utils.save_predictions import save_predictions

# save_predictions(test_data, prediction)
for i,a in enumerate(test_data):
    if sum(prediction[i])==0 or i<2022:
        pass
    else:
        # print(a)
        pred = prediction[i][:len(a['input_ids'])] #remove padding zeros
        # print(pred)
        
        # -------------------------------- Real thing -------------------------------- #
        old_token = None
        splits = []
        for j,token in enumerate(pred):
            if old_token!=None and token_id_to_label[old_token] == token_id_to_label[token]: #continue same sequence
                splits[-1][1] = j
            else: #start a new sequence
                if len(splits) :
                    splits[-1][1] = j
                old_token = token
                splits.append([j, None, token])
        for split in splits :
            token = split[-1]
            if token == 0:
                continue
            mapping_list = a['offset_mapping'][split[0]:split[1]]
            mapping = [mapping_list[0][0], mapping_list[-1][1]] #by character
            # print(a["text"][mapping[0]:mapping[1]], end= "\t->\t")
            # print(token_id_to_label[token])
            
            # ------------------------------- word_mapping ------------------------------- #
            # print("mapping : ", mapping)
            word_mapping_0 = len(a['text'][:mapping[0]].strip().split(" "))
            word_mapping_1 = word_mapping_0 + len(a['text'][mapping[0]:mapping[1]].strip().split(" ")) - 1
            word_mapping = [word_mapping_0, word_mapping_1]
            
            # -------------------------------- file lines -------------------------------- #
            con_line = 'c="'+str(a["text"][mapping[0]:mapping[1]])
            con_line += '" ' 
            con_line += str(a["row"]) + ":" + str(word_mapping[0])
            con_line += ' '
            con_line += str(a["row"]) + ":" + str(word_mapping[1])
            con_line += '||t="'
            con_line += ast_to_concept[token_id_to_label[token]] + '"'
            # print("===>con_line :", con_line)
            if ast_to_concept[token_id_to_label[token]] == "problem":
                ast_line = con_line
                ast_line += '||a="'
                ast_line += token_id_to_label[token]
                ast_line += '"'
                print(ast_line)
            
            
            
            
        # print("len of offset_mapping :", len(a["offset_mapping"]))
        # print("len of input_ids :", len(a["input_ids"]))
        # break

KeyboardInterrupt: 

In [188]:
a = "doses were slowly tapered over her hospital course , and her Prograf level was adjusted to"
len(a.split())

16

In [64]:
len(test_data)

16414

TypeError: 'Dataset' object does not support item assignment