### This was run on Colab for GPU usage

In [None]:
pip install datasets transformers seqeval

In [1]:
import transformers
from transformers import AutoTokenizer

# Task tag; should be one of "ner", "pos" or "chunk". It is interpolated into
# the output directory name of the token-classification run.
task = "ner"

# Pretrained checkpoint to fine-tune. Other checkpoints that were tried:
#   "dmis-lab/biobert-v1.1"                                      (BioBERT)
#   "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"       (BlueBERT)
#   "NLP4H/ms_bert"
#   "distilbert-base-uncased"
#   "emilyalsentzer/Bio_ClinicalBERT"
model_checkpoint = "allenai/scibert_scivocab_uncased"

# Used as both the per-device train and eval batch size further down.
batch_size = 8

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Stop right away if the checkpoint did not yield a "fast" tokenizer.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
import numpy as np
from datasets import load_from_disk

# Load the already-tokenized dataset from disk. The path ends in ".json" but
# is read with `load_from_disk`. The "train", "validation" and "test" splits
# are accessed later in this notebook.
tokenized_datasets = load_from_disk('/content/umn_40_tokenized_dataset_SciBERT_addedtokens_0823_t1v1.json')
# Label names; a label id is used as an index into this array.
# NOTE: allow_pickle=True unpickles objects stored in the file, so only load
# label files that come from a trusted source.
label_list = np.load('/content/unique_labs_umn_40_0804.npy', allow_pickle=True)

## Text Classification

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Sequence-level classifier with one output per entry in `label_list`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Keep only the part after the last "/" of the checkpoint id for the run name.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"umn-{model_name}-finetuned-textclass",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

## Token Classification

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Token-level classifier with one output per entry in `label_list`.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)

# Keep only the part after the last "/" of the checkpoint id for the run name.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"umncrf-{model_name}-finetuned-{task}",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Wire together whichever model / args / collator were defined by the setup
# cell that was run last (text classification or token classification).
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Fine-tune the model using the TrainingArguments configured above.
trainer.train()

In [None]:
# Save the trained model. No path is passed, so the destination is the
# Trainer's default (presumably `args.output_dir` -- confirm).
trainer.save_model()

# On Colab: archive the saved model folder to Google Drive.
# NOTE(review): the folder below starts with "umnfull-", while the output
# directories configured above start with "umn-" / "umncrf-" -- adjust the
# path to the run that was actually trained.
#!zip -r /content/drive/MyDrive/umn_203_6ep_scibert_scivocab_uncased-finetuned-ner_0823.zip /content/umnfull-scibert_scivocab_uncased-finetuned-ner

In [None]:
# Run the model on the test split; the third returned element is discarded.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
# Index of the highest score along axis 2, i.e. one label id per position.
# assumes `predictions` is (examples, tokens, labels) -- TODO confirm
predictions_ = np.argmax(predictions, axis=2)

### Token classification performance functions

In [7]:
from datasets import load_dataset, load_metric
from seqeval.metrics import f1_score as seq_f1
from seqeval.metrics import precision_score, recall_score, classification_report

def get_AD_performance(predictions, labels):
    """Score per-token label predictions with seqeval.

    Args:
        predictions: per-example sequences of predicted label ids (indices
            into the notebook-level ``label_list``), aligned with ``labels``.
        labels: per-example sequences of gold label ids; positions whose
            gold id is ``-100`` are ignored for both gold and prediction.

    Returns:
        A pair ``(macro, weighted)``:

        * ``macro`` -- ``(f1, precision, recall)`` as computed by seqeval with
          ``average='macro'``, multiplied by 100 and rounded to 2 decimals.
        * ``weighted`` -- ``(f1, precision, recall)`` averaged over the
          per-class rows of seqeval's classification report, weighted by
          support, skipping the ``'A_word'`` row and the aggregate rows.
          These values are NOT scaled to percent or rounded.
    """
    # Import locally instead of relying on notebook globals: a later cell
    # imports sklearn functions named precision_score / recall_score /
    # classification_report, and calling this function after that cell would
    # otherwise hit the sklearn versions (which reject `scheme=`).
    from seqeval.metrics import classification_report, precision_score, recall_score
    from seqeval.metrics import f1_score as seq_f1

    # Decode ids to label names in one pass, dropping ignored positions.
    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
        true_predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[l] for _, l in kept])

    f1_actual = np.round(seq_f1(true_labels, true_predictions, average='macro', scheme='token') * 100, 2)
    pre_actual = np.round(precision_score(true_labels, true_predictions, average='macro', scheme='token') * 100, 2)
    rec_actual = np.round(recall_score(true_labels, true_predictions, average='macro', scheme='token') * 100, 2)

    class_report = classification_report(true_labels, true_predictions, output_dict=True)

    # Aggregate rows are skipped; 'A_word' is presumably how seqeval reports
    # the 'NA_word' label once its first character is taken as the tag prefix
    # -- verify against the report keys.
    skipped_rows = {'micro avg', 'macro avg', 'weighted avg', 'A_word'}
    f1s = []
    precs = []
    recs = []
    weights = []
    for lab, row in class_report.items():
        if lab in skipped_rows:
            continue
        f1s.append(row['f1-score'])
        precs.append(row['precision'])
        recs.append(row['recall'])
        weights.append(row['support'])

    macro = (f1_actual, pre_actual, rec_actual)
    weighted = (
        np.average(f1s, weights=weights),
        np.average(precs, weights=weights),
        np.average(recs, weights=weights),
    )
    return macro, weighted

In [8]:
def get_ID_performance(predictions, labels, word_label_id=203):
    """Score the binary "abbreviation vs. plain word" decision with seqeval.

    Every label id is collapsed to one of two tags before scoring: ids below
    ``word_label_id`` become ``'ABV'`` and the id equal to ``word_label_id``
    becomes ``'word'``.

    Args:
        predictions: per-example sequences of predicted label ids, aligned
            with ``labels``.
        labels: per-example sequences of gold label ids; positions whose
            gold id is ``-100`` are ignored.
        word_label_id: id of the "plain word" label. Defaults to 203, the
            value that used to be hard-coded; pass the right id when the
            label set has a different size.

    Returns:
        ``(f1, precision, recall)`` from seqeval with ``average=None``,
        multiplied by 100 and rounded to 2 decimals.

    Note:
        An id greater than ``word_label_id`` produces no tag (unchanged from
        the original behaviour), which leaves that example's prediction and
        gold lists with different lengths.
    """
    # Import locally instead of relying on notebook globals: a later cell
    # imports sklearn functions named precision_score / recall_score, and
    # calling this function after that cell would otherwise hit the sklearn
    # versions (which reject `scheme=`).
    from seqeval.metrics import precision_score, recall_score
    from seqeval.metrics import f1_score as seq_f1

    def to_tag(label_id):
        """Collapse a label id to 'ABV' / 'word', or None when out of range."""
        if label_id < word_label_id:
            return 'ABV'
        if label_id == word_label_id:
            return 'word'
        return None

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        preds = []
        labs = []
        for p, l in zip(prediction, label):
            if l == -100:
                continue
            pred_tag = to_tag(p)
            if pred_tag is not None:
                preds.append(pred_tag)
            gold_tag = to_tag(l)
            if gold_tag is not None:
                labs.append(gold_tag)
        true_predictions.append(preds)
        true_labels.append(labs)

    f1_actual_ID = np.round(seq_f1(true_labels, true_predictions, average=None, scheme='token') * 100, 2)
    pre_actual_ID = np.round(precision_score(true_labels, true_predictions, average=None, scheme='token') * 100, 2)
    rec_actual_ID = np.round(recall_score(true_labels, true_predictions, average=None, scheme='token') * 100, 2)

    return (f1_actual_ID, pre_actual_ID, rec_actual_ID)

In [None]:
# Score the arg-maxed label ids (`predictions_`), not the raw model scores:
# both helpers index `label_list` with / compare each per-token prediction,
# so they need one integer id per token.
macro_scores, weighted_scores = get_AD_performance(predictions_, labels)
id_perf = get_ID_performance(predictions_, labels)

## "Second-Best" post-processing of predictions

In [None]:
# Re-run inference on the test split to get the raw per-label scores that the
# post-processing step below ranks; the third returned element is discarded.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])

In [None]:
tr_set = tokenized_datasets['train']
tst_set = tokenized_datasets['test']

# Every token id that occurs anywhere in the training split.
unique_input_ids = {token_id for ids in tr_set['input_ids'] for token_id in ids}

# Look the 'NA_word' label index up once instead of once per token id.
na_word_id = list(label_list).index('NA_word')

# Map each training token id to the set of label ids considered possible for
# it. Each token id gets its own set, initially holding only 'NA_word'.
possible_labs_dict = {token_id: {na_word_id} for token_id in unique_input_ids}

In [1]:
def post_process_preds(labs_dict, preds, tst_set, *, seen_margin=0.03, unseen_margin=0.01):
    """Pick the best or second-best label per token using a score margin.

    For every token the two highest-scoring labels are compared. The
    runner-up replaces the best label when the two scores are close enough:

    * token id present in ``labs_dict``: the gap must be below
      ``seen_margin`` AND the runner-up must be in ``labs_dict[token_id]``;
    * token id absent from ``labs_dict``: the gap must be below
      ``unseen_margin``.

    Args:
        labs_dict: mapping from token id to a collection of label ids that
            are acceptable for that token.
        preds: per-example sequences of per-token score vectors (one score
            per label). Positions beyond the length of the matching
            ``input_ids`` entry are ignored.
        tst_set: object whose ``['input_ids']`` yields one token-id sequence
            per example, in the same order as ``preds``.
        seen_margin: score gap below which the runner-up may be chosen for a
            token id found in ``labs_dict`` (default 0.03).
        unseen_margin: score gap below which the runner-up is chosen for a
            token id not found in ``labs_dict`` (default 0.01).

    Returns:
        A list with one list of chosen label ids per example; each inner
        list is as long as that example's ``input_ids`` (or its score
        sequence, if that is shorter).

    Note:
        The margins are compared against raw score differences.
        NOTE(review): if ``preds`` holds logits rather than probabilities,
        the thresholds are in logit units -- confirm this is intended.
    """
    final_preds = []
    for token_ids, token_scores in zip(tst_set['input_ids'], preds):
        chosen = []
        # The slice drops score rows beyond the real tokens of this example.
        for token_id, scores in zip(token_ids, token_scores[:len(token_ids)]):
            # argsort is ascending, so the last two entries are the
            # second-best and the best label id, in that order.
            runner_up, best = np.argsort(scores)[-2:]
            gap = scores[best] - scores[runner_up]

            if token_id in labs_dict:
                take_runner_up = gap < seen_margin and runner_up in labs_dict[token_id]
            else:
                take_runner_up = gap < unseen_margin

            chosen.append(runner_up if take_runner_up else best)
        final_preds.append(chosen)

    return final_preds
    

In [None]:
# `post_process_preds` takes (labs_dict, preds, tst_set); the gold labels are
# not part of its signature, so they are not passed here.
post_preds = post_process_preds(possible_labs_dict, predictions, tst_set)

In [None]:
from keras_preprocessing.sequence import pad_sequences
# The per-example prediction lists differ in length; append 0s at the end of
# the shorter ones so they can be stacked into one array.
# NOTE(review): `keras_preprocessing` is not in the `pip install` line at the
# top of this notebook -- confirm it is available in the runtime.
post_preds = pad_sequences(post_preds, padding='post', value=0)

In [None]:
# Score the post-processed label ids; this overwrites the `macro_scores` /
# `weighted_scores` computed earlier for the plain arg-max predictions.
macro_scores, weighted_scores = get_AD_performance(post_preds, labels)

## Text classification Metrics

In [None]:

# Run the model on the test split; the third returned element is discarded.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
# Highest-scoring label id along the last axis.
# assumes `predictions` is (examples, labels) for the text-classification
# model -- TODO confirm
predictions_max = np.argmax(predictions, axis=-1)

In [None]:
# Translate gold and predicted label ids into their names from `label_list`.
true_labels = [label_list[i] for i in labels ]
true_predictions = [label_list[i] for i in predictions_max]

In [None]:
# NOTE: this import rebinds `precision_score`, `recall_score` and
# `classification_report`, which were imported from seqeval earlier in this
# notebook. From here on those global names refer to the sklearn functions.
from sklearn.metrics import precision_score, recall_score, classification_report, f1_score

# Macro-averaged scores in percent, rounded to 2 decimals. `zero_division=0`
# is passed to all three metrics so that classes with an undefined score are
# handled the same way (scored as 0, without a warning) for each of them.
f1_actual = np.round(f1_score(true_labels, true_predictions, average='macro', zero_division=0) * 100, 2)
pre_actual = np.round(precision_score(true_labels, true_predictions, average='macro', zero_division=0) * 100, 2)
rec_actual = np.round(recall_score(true_labels, true_predictions, average='macro', zero_division=0) * 100, 2)