In [None]:
#import spacy
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, Trainer
import pandas as pd
from multiprocessing import Pool
#from .drugs_condition_dictionary_finder import find_drugs_normalized_output, find_drugs_and_conditions_normalized_BIO_output
import re
import torch
from datasets import load_dataset
#from .bert_helper import get_label_list, tokenize_and_align_labels
import numpy as np


In [None]:
# Functions to help deal with Hugging Face BERT models
# taken from https://github.com/michiyasunaga/LinkBERT/blob/main/src/tokcls/run_ner.py#L30
# details on padding/truncation: https://huggingface.co/docs/transformers/pad_truncation
import numpy as np

def format_output_seqeval(results, return_format):
    """Reshape a seqeval result dictionary into one of several layouts.

    Adapted from
    https://github.com/michiyasunaga/LinkBERT/blob/main/src/tokcls/run_ner.py#L30

    Args:
        results: mapping as produced by the seqeval metric. Entity types map to
            dicts holding ``precision``/``recall``/``f1`` (and ``number``);
            the ``overall_*`` keys map to plain numbers.
        return_format: one of
            * ``"entity_level"`` - flatten nested dicts, e.g.
              ``{'PER': {'f1': 0.66}}`` becomes ``{'PER_f1': 0.66}``;
            * ``"macro"`` - unweighted mean of precision/recall/f1 over all
              keys that do not start with ``"overall"``;
            * ``"overall"`` - the four ``overall_*`` values without prefix;
            * anything else - ``results`` is returned untouched.

    Returns:
        A new dict in the requested layout, or ``results`` itself.
    """
    if return_format == "entity_level":
        flattened = {}
        for key, value in results.items():
            if not isinstance(value, dict):
                # Scalar entries (the overall_* scores) are copied as they are.
                flattened[key] = value
                continue
            for metric_name, metric_value in value.items():
                flattened[f"{key}_{metric_name}"] = metric_value
        return flattened

    if return_format == "macro":
        precisions, recalls, f1_scores = [], [], []
        for type_name, scores in results.items():
            # The aggregate overall_* entries are not entity types.
            if type_name.startswith("overall"):
                continue
            print('type_name', type_name)
            precisions.append(scores["precision"])
            recalls.append(scores["recall"])
            f1_scores.append(scores["f1"])
        macro_scores = {
            "macro_precision": np.mean(precisions),
            "macro_recall": np.mean(recalls),
            "macro_f1": np.mean(f1_scores),
        }
        return macro_scores

    if return_format == "overall":
        overall_scores = {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
        return overall_scores

    return results

def get_label_list(labels):
    """Return the sorted list of distinct labels found in ``labels``.

    Args:
        labels: iterable of label sequences (one sequence per example).

    Returns:
        A sorted list containing every distinct label exactly once.
    """
    print("*** returning unique labels ***")
    seen = set()
    for example_labels in labels:
        seen.update(example_labels)
    return sorted(seen)

# label_all_tokens: "Whether to put the label for one word on all tokens generated by that word
# or just on the first one (in which case the other tokens will have a padding index)."
def tokenize_and_align_labels(examples, tokenizer, text_column_name, label_column_name, label_to_id,
                              label_all_tokens=False, padding=True, max_length=512):
    """Tokenize pre-split words and project the word-level labels onto the tokens.

    Args:
        examples: batch mapping; ``examples[text_column_name]`` holds lists of
            words and ``examples[label_column_name]`` the matching label lists.
        tokenizer: callable tokenizer whose result offers ``word_ids`` and
            ``tokens`` lookups by ``batch_index`` and supports item assignment.
        text_column_name: key of the word lists in ``examples``.
        label_column_name: key of the label lists in ``examples``.
        label_to_id: mapping from label string to integer id.
        label_all_tokens: when true every token of a word receives the word's
            label; otherwise only the first token does and the rest get -100.
        padding: forwarded to the tokenizer.
        max_length: forwarded to the tokenizer (truncation is always enabled).

    Returns:
        The tokenizer output, extended with ``"labels"`` (aligned label ids,
        -100 for ignored positions) and ``"word_ids"``.
    """
    print("*** Tokenize and Align Labels ***")
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=max_length,
        # The texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )

    aligned_labels = []
    all_word_ids = []
    evaluated_tokens = []
    for example_idx, word_labels in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=example_idx)
        pieces = tokenized_inputs.tokens(batch_index=example_idx)
        all_word_ids.append(word_ids)

        label_ids = []
        # Tracks which tokens end up being evaluated once word-piece
        # tokenization is taken into account.
        kept_pieces = []
        previous_word_idx = None
        for piece_idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                # Special tokens have no word id; -100 makes the loss function ignore them.
                label_id, kept_piece = -100, -100
            elif word_idx != previous_word_idx or label_all_tokens:
                # First token of a word, or any token of it when label_all_tokens is set.
                label_id = label_to_id[word_labels[word_idx]]
                kept_piece = pieces[piece_idx]
            else:
                # Continuation piece of a word that is not labelled separately.
                label_id, kept_piece = -100, -100
            label_ids.append(label_id)
            kept_pieces.append(kept_piece)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)
        evaluated_tokens.append(kept_pieces)

    tokenized_inputs["labels"] = aligned_labels
    tokenized_inputs["word_ids"] = all_word_ids
    #tokenized_inputs["tokenized_tokens"] = evaluated_tokens # TODO: causes an ArrowInvalid error.
    return tokenized_inputs


In [25]:
class NERModel:
    """Uniform wrapper around the NER back-ends used in this notebook.

    ``model_type`` selects the back-end and must be ``"spacy"``,
    ``"huggingface"`` or ``"regex"``. The model is loaded eagerly by
    ``__init__`` through :meth:`load_model`.

    Attributes:
        model_name: full model identifier (used to load the tokenizer).
        model_name_short: second ``/``-separated component of ``model_name``
            when it contains a slash, otherwise ``model_name``; used in the
            names of the prediction columns.
        model_path: location the spaCy pipeline / token-classification model
            is loaded from.
        return_words_only: when true, entity texts instead of spans are
            returned by the spaCy branch and by ``combine_entity_subwords``.
        entity_class_names_dict: optional mapping from short to long entity
            class names.
        normalize_pred_representation: when true, Hugging Face pipeline output
            is reduced to ``(start, end, class, word)`` tuples.
    """

    # Sequence budget used when truncating over-long inputs: 512 positions,
    # two of which are reserved for the [CLS] and [SEP] special tokens.
    MAX_SEQUENCE_LENGTH = 512
    NUM_SPECIAL_TOKENS = 2

    def __init__(self, model_type, model_name, model_path=None, entity_class_names_dict=None):
        self.model_name = model_name
        if "/" in model_name:
            self.model_name_short = self.model_name.split("/")[1]
        else:
            self.model_name_short = self.model_name
        self.model_path = model_path
        self.model_type = model_type
        self.return_words_only = False
        self.entity_class_names_dict = entity_class_names_dict
        # Used for the Hugging Face models: strips the additional information
        # (e.g. prediction confidence) that the other models don't provide.
        self.normalize_pred_representation = True
        self.load_model()

    def load_model(self):
        """Load the back-end selected by ``self.model_type``.

        Raises:
            ValueError: if ``model_type`` is not one of the supported values.
        """
        if self.model_type == "spacy":
            # NOTE(review): `import spacy` is commented out at the top of this
            # notebook, so this branch raises NameError until it is restored.
            self.nlp = spacy.load(self.model_path)
        elif self.model_type == "huggingface":
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForTokenClassification.from_pretrained(self.model_path)
            self.config = AutoConfig.from_pretrained(self.model_path)
            # Set grouped_entities to False to analyze the tokenization of the models.
            self.nlp = pipeline("ner", model=self.model, tokenizer=self.tokenizer,
                                grouped_entities=True)
        elif self.model_type == "regex":
            pass
        else:
            raise ValueError("Wrong model type. Allowed values are spacy, regex, and huggingface.")

    def _truncate_to_model_limit(self, sent, verbose=False):
        """Return ``sent`` shortened so it fits the model together with [CLS]/[SEP].

        The sentence is returned unchanged when its word-piece count leaves
        room for the two special tokens; otherwise the surplus pieces are
        dropped and the remainder is converted back into a string.
        """
        pieces = self.tokenizer.tokenize(sent)
        budget = self.MAX_SEQUENCE_LENGTH - self.NUM_SPECIAL_TOKENS
        if len(pieces) <= budget:
            return sent
        if verbose:
            print("Number of tokens ({}) too large, truncating sentence: {}".format(len(pieces), sent[:50] + "[...]"))
        return self.tokenizer.convert_tokens_to_string(pieces[:budget])

    def bert_predict_bio_format(self, ds_path_train, ds_path_test, text_column_name, label_column_name,
                                label_all_tokens=False, padding=True, max_length=512):
        """Predict BIO tags for a JSON test set with the loaded Hugging Face model.

        Args:
            ds_path_train: JSON train split; only used to derive the label list.
            ds_path_test: JSON test split to predict on.
            text_column_name: column holding the lists of words.
            label_column_name: column holding the lists of labels.
            label_all_tokens: see ``tokenize_and_align_labels``.
            padding: padding strategy passed to the tokenizer.
            max_length: truncation length passed to the tokenizer.

        Returns:
            The tokenized test dataset with three extra columns:
            ``word_tokenized_input`` and, suffixed with ``model_name_short``,
            ``predictions_bio_mapped_*`` (label strings) and ``predictions_*``
            (label ids).
        """
        model = self.model
        data_files = {"train": ds_path_train,
                      "test": ds_path_test}
        raw_datasets = load_dataset("json", data_files=data_files)
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
        id_to_label = {i: l for i, l in enumerate(label_list)}

        predict_dataset = raw_datasets["test"]
        predict_dataset = predict_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            desc="Running tokenizer on prediction dataset",
            fn_kwargs={
                "tokenizer": self.tokenizer,
                "text_column_name": text_column_name,
                "label_column_name": label_column_name,
                "label_to_id": label_to_id,
                "label_all_tokens": label_all_tokens,
                "padding": padding,
                "max_length": max_length
            }
        )

        # Trainer may move the model to an accelerator. The "ner" pipeline
        # created in load_model shares this model object, so the device is put
        # back afterwards to keep the pipeline usable.
        # NOTE(review): presumably the cause of the "Placeholder storage has
        # not been allocated on MPS device!" error - confirm on an MPS machine.
        original_device = getattr(model, "device", None)
        try:
            trainer = Trainer(
                model=model,
                train_dataset=None,
                eval_dataset=None,
                tokenizer=self.tokenizer,
            )
            results = trainer.predict(predict_dataset)
        finally:
            if original_device is not None:
                model.to(original_device)

        predictions = results.predictions
        predictions = np.argmax(predictions, axis=2)
        mapped_list_of_predictions = [np.array([id_to_label[label_id] for label_id in arr])
                                      for arr in predictions]

        # Re-tokenize with the same column and settings that were used for the
        # prediction so the stored word pieces line up with the predictions.
        tokenized_inputs = self.tokenizer(
            predict_dataset[text_column_name],
            padding=padding,
            truncation=True,
            max_length=max_length,
            # The texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        predict_dataset = predict_dataset.add_column("word_tokenized_input",
                                                     [tokenized_inputs.tokens(batch_index=i) for i in
                                                      range(len(predict_dataset))])
        predict_dataset_with_pred = predict_dataset.add_column(
            "predictions_bio_mapped_{}".format(self.model_name_short), mapped_list_of_predictions)
        predict_dataset_with_pred = predict_dataset_with_pred.add_column("predictions_{}".format(self.model_name_short),
                                                                         list(predictions))

        return predict_dataset_with_pred

    def annotate(self, file_path, source_column, sep=","):
        """Read a CSV file and add one column with the NER predictions per row.

        The column is named ``ner_prediction_<short name>`` with a
        ``_normalized`` suffix when ``normalize_pred_representation`` is set.
        The row's ``tokens`` value (``None`` if the column is absent) is
        forwarded to ``infer_ner``; only the regex back-end uses it.
        """
        df = pd.read_csv(file_path, sep=sep)
        if self.normalize_pred_representation:
            predictions_col_name = "ner_prediction_{}_normalized".format(self.model_name_short)
        else:
            predictions_col_name = "ner_prediction_{}".format(self.model_name_short)
        # df[predictions_col_name] = df[source_column].apply(self.infer_ner)
        df[predictions_col_name] = df.apply(lambda row: self.infer_ner(row[source_column], row.get("tokens")),
                                            axis=1)
        return df

    def annotate_parallel(self, file_path, source_column, sep=","):
        """Like ``infer_ner_chunk`` on the whole CSV, split over a process pool.

        Returns:
            The concatenated annotated chunks; an empty file yields the empty
            frame unchanged.
        """
        df = pd.read_csv(file_path, sep=sep)
        if df.empty:
            return df

        num_processes = 10
        # Ceiling division, never below 1: with fewer rows than processes the
        # plain floor division would give a step of 0 and make range() fail.
        chunk_size = max(1, -(-len(df) // num_processes))
        # Copies, so that the workers do not write into views of `df`.
        chunks = [df.iloc[start:start + chunk_size].copy()
                  for start in range(0, len(df), chunk_size)]

        with Pool(min(num_processes, len(chunks))) as pool:
            # Apply the function to each chunk in parallel
            results = pool.starmap(self.infer_ner_chunk,
                                   [(chunk, source_column) for chunk in chunks])
        return pd.concat(results)

    def infer_ner_chunk(self, chunk, text_source_column):
        """Annotate ``chunk`` in place and return it.

        Adds ``ner_prediction_<short name>``. For Hugging Face models that
        column holds the raw pipeline output and a second column with the
        ``_normalized`` suffix holds the normalized tuples; the pipeline is run
        only once and ``normalize_pred_representation`` is left as it was.
        """
        raw_col_name = "ner_prediction_{}".format(self.model_name_short)
        if self.model_type != "huggingface":
            chunk[raw_col_name] = chunk[text_source_column].apply(self.infer_ner)
            return chunk

        previous_flag = self.normalize_pred_representation
        self.normalize_pred_representation = False  # the un-suffixed column holds raw output
        try:
            raw_predictions = chunk[text_source_column].apply(self.infer_ner)
        finally:
            self.normalize_pred_representation = previous_flag

        chunk[raw_col_name] = raw_predictions
        normalized_col_name = "ner_prediction_{}_normalized".format(self.model_name_short)
        chunk[normalized_col_name] = raw_predictions.apply(self.normalize_representation)
        # predictions_col_name = "ner_prediction_{}_bio".format(self.model_type)
        # chunk[predictions_col_name] = chunk[text_source_column].apply(self.ner_bert_bio_output)
        return chunk

    def ner_bert_bio_output(self, sentence):
        """Return one predicted label per (approximate) word of ``sentence``.

        Labels of continuation word pieces and of [CLS]/[SEP] are dropped,
        apart from a few hand-written exceptions that mimic the tokenization
        of the annotation tool.
        """
        # sentence = normalizer.normalize(sentence)

        model, tokenizer, labels = self.model, self.tokenizer, list(self.config.label2id.keys())
        # Keep the input within the model's length limit, special tokens included.
        sentence = self._truncate_to_model_limit(sentence)

        tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
        inputs = tokenizer.encode(sentence, return_tensors="pt")
        model_device = getattr(model, "device", None)
        if model_device is not None:
            inputs = inputs.to(model_device)  # the model may live on an accelerator
        with torch.no_grad():
            outputs = model(inputs)[0]
        predictions = torch.argmax(outputs, axis=2)
        predictions = [(token, labels[prediction])
                       for token, prediction in zip(tokens, predictions[0].cpu().numpy())]

        filtered_predictions = []
        prev_token = None

        # dealing with the misaligned tokenizations
        # TODO: seems too manual... trying to fit to the prodigy tokenization here
        for token, label in predictions:
            # spacy splits "cannot" into "can" and "not", two tokens
            if (prev_token == 'cann' and token == '##ot') or token == "##mg":
                filtered_predictions.append(label)
            # ignore the word pieces that would inflate the array
            elif token.startswith("##®") or (not token.startswith('##') and token not in ['[SEP]', '[CLS]']):
                filtered_predictions.append(label)
            prev_token = token

        return filtered_predictions

    def infer_ner(self, sent, tokenized_sent=None):
        """Run the configured back-end on a single sentence.

        Args:
            sent: raw sentence text (spaCy and Hugging Face back-ends).
            tokenized_sent: tokens of the sentence (regex back-end only).

        Returns:
            spaCy: de-duplicated list of entity texts (``return_words_only``)
            or ``(start_char, end_char, text)`` tuples of CHEMICAL entities.
            Hugging Face: pipeline output, normalized when
            ``normalize_pred_representation`` is set.
            regex: result of the dictionary finder.

        Raises:
            ValueError: for an unsupported ``model_type``.
        """
        if self.model_type == "spacy":
            doc = self.nlp(sent)
            entities = []
            for ent in doc.ents:
                if self.return_words_only:  # TODO where is the right place?
                    entities.append(ent.text)
                else:
                    ent_label = ent.label_
                    if ent_label == "CHEMICAL":  # TODO: handle multi-class case!
                        entities.append((ent.start_char, ent.end_char, ent.text))
            return list(set(entities))
        elif self.model_type == "huggingface":
            sent = self._truncate_to_model_limit(sent, verbose=True)
            ner_results = self.nlp(sent)
            # ner_results_combined = self.combine_entity_subwords(sent, ner_results) NOT NEEDED IF WE ARE USING THE GROUPED ENTITIES FLAG
            if self.normalize_pred_representation:
                return self.normalize_representation(ner_results)
            else:
                return ner_results
        elif self.model_type == "regex":
            tokens_cleaned = tokenized_sent
            # NOTE(review): the import of this helper is commented out at the
            # top of the notebook, so this branch raises NameError as is.
            return find_drugs_and_conditions_normalized_BIO_output(
                tokens_cleaned)  # returns a tuple drug_matches, {"entites":all_char_indices}
        else:
            raise ValueError("Wrong model type. Allowed values are spacy, regex, and huggingface.")

    def normalize_representation(self, ner_results_combined):
        """Reduce grouped pipeline entities to ``(start, end, class, word)`` tuples.

        The class is translated through ``entity_class_names_dict`` when that
        mapping is set, otherwise the ``entity_group`` value is used as is.
        """
        results = []
        for ent_dict in ner_results_combined:
            if self.entity_class_names_dict:
                entity_class_full_name = self.entity_class_names_dict[ent_dict['entity_group']]
            else:
                entity_class_full_name = ent_dict['entity_group']
            results.append((ent_dict['start'], ent_dict['end'], entity_class_full_name, ent_dict['word']))
        return results

    def combine_entity_subwords(self, sent, entities):
        """Merge token-level B-/I- predictions into whole entities.

        Args:
            sent: the source sentence (only used in a diagnostic message).
            entities: ungrouped pipeline output; each item has ``entity``,
                ``word``, ``start``, ``end`` and ``score``.

        Returns:
            A list of merged entity dicts, or a list of entity strings when
            ``return_words_only`` is set.
        """
        result = []
        current_entity = None
        merged_dict = {}
        if not entities:
            return []

        for entity in entities:
            tag = entity["entity"]
            if not (tag.startswith("B") or tag.startswith("I")):
                continue
            if current_entity is not None and tag.startswith("I"):
                # Continuation: glue the piece onto the entity being built.
                if entity['word'].startswith('##'):
                    current_entity['word'] += entity['word'][2:]
                elif current_entity['end'] == entity['start']:
                    current_entity['word'] += entity['word']  # no space needed
                else:
                    current_entity['word'] += " "
                    current_entity['word'] += entity['word']
                current_entity['end'] = entity['end']
                # average of the confidence
                current_entity['score'] = round((current_entity['score'] + entity['score']) / 2, 3)
            else:
                current_entity = entity.copy()
                # TODO: does the I- make sense?
                current_entity['entity'] = tag.replace("B-", "").replace("I-", "")
                if self.entity_class_names_dict:
                    current_entity['entity'] = self.entity_class_names_dict[current_entity['entity']]
                current_entity['word'] = entity['word'][2:] if entity['word'].startswith('##') else entity['word']
                result.append(current_entity)

        if self.return_words_only:
            current_entity_start = 0
            for i, entity in enumerate(result):

                if entity['entity'] == 'B':
                    merged_dict[i] = entity['word']
                    current_entity_start = i
                elif entity['entity'] == 'I':
                    if merged_dict:
                        merged_dict[current_entity_start] += ' ' + entity['word']
                    else:
                        print("could not save entity: {} from sentence {}. All entities found: {}".format(entity, sent,
                                                                                                          entities))

            return list(merged_dict.values())
        return result


In [9]:
#from core.models import NERModel
import datetime
import os

# Maps the short entity class names to the long names written to the output.
short_to_long_class_names_map = {
    "DRUG": "DRUG",
    "BEH": "BEHAVIOURAL",
    "SURG": "SURGICAL",
    "PHYS": "PHYSICAL",
    "RADIO": "RADIOTHERAPY",
    "OTHER": "OTHER",
    "COND": "CONDITION",
    "CTRL": "CONTROL"
}

# Date stamp (YYYYMMDD) embedded in the names of the prediction files.
current_date = datetime.datetime.now().strftime("%Y%m%d")

# Model switches; only the bert-base-uncased run is implemented in this cell.
run_linkbert = False
run_biobert = False
run_bert_base_uncased = True

# TODO: Error "Placeholder storage has not been allocated on MPS device!" when trying to run tuple and BIO annotations sequentially?
run_tuples_annotations = False
run_BIO_annotations = True

relevant_data_path = "../data/annotated_data/"
corpus_files_path_prefix = relevant_data_path + "data_splits/"
train_data_path = corpus_files_path_prefix + "ct_neuro_train_data_713.json"
test_data_path = corpus_files_path_prefix + "ct_neuro_test_data_90.json"
test_data_path_csv = corpus_files_path_prefix + "ct_neuro_test_merged_90.csv"
output_annotations_path_prefix = "./predictions/"

#### BERT BASE ####
if run_bert_base_uncased:
    print("Running BERT-BASE model_annotations.")
    hugging_face_model_name = "bert-base-uncased"
    hugging_face_model_path = "./bert/trained/bert-base-uncased/epochs_15_data_size_100_iter_4/"
    model_name_str = "bert-base-uncased"
    model = NERModel("huggingface", hugging_face_model_name, hugging_face_model_path, short_to_long_class_names_map)

    # Writing the CSV files fails when the target directory is missing, so
    # create it up front (no-op if it already exists).
    os.makedirs(output_annotations_path_prefix, exist_ok=True)

    ### ANNOTATE WITH BIO OUTPUT
    if run_BIO_annotations:
        predict_dataset_with_pred = model.bert_predict_bio_format(train_data_path, test_data_path, "tokens",
                                                                  "ner_tags")
        predict_dataset_with_pred.to_csv(
            output_annotations_path_prefix + "ct_neuro_test_annotated_{}_BIO_{}_test.csv".format(model_name_str,
                                                                                            current_date), sep=",")
    if run_tuples_annotations:
        ### ANNOTATE WITH TUPLE OUTPUT
        annotated_ds = model.annotate(test_data_path_csv, "text")
        annotated_ds.to_csv(
            output_annotations_path_prefix + "ct_neuro_test_annotated_{}_{}_test.csv".format(model_name_str,
                                                                                        current_date), sep=",")


Running BERT-BASE model_annotations.


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

*** returning unique labels ***


Running tokenizer on prediction dataset:   0%|          | 0/90 [00:00<?, ? examples/s]

*** Tokenize and Align Labels ***


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
from datasets import load_dataset, load_metric
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2


In [22]:
def evaluate_bert_bio(target_annotated_file_name, train_file_name, train_label_column_name,
                      return_format="all", target_labels_column="labels", predicted_labels_column="predictions"):
    """Score BIO predictions stored in a CSV file with seqeval.

    Args:
        target_annotated_file_name: CSV file with one row per example; the
            label and prediction cells hold bracketed integer lists as text.
        train_file_name: JSON train split, used to rebuild the label list that
            maps ids back to label strings.
        train_label_column_name: label column of the train split.
        return_format: layout passed on to ``format_output_seqeval``.
        target_labels_column: CSV column with the gold label ids (-100 marks
            positions to ignore).
        predicted_labels_column: CSV column with the predicted label ids.

    Returns:
        The seqeval results in the requested ``return_format``. Two
        classification reports (default and strict IOB2) are printed as well.
    """
    df = pd.read_csv(target_annotated_file_name)

    def convert_to_list(string):
        # Accepts both the numpy style "[1 2 -100]" (possibly wrapped over
        # several lines) and the list style "[1, 2, -100]".
        return [int(number) for number in re.findall(r"-?\d+", string)]

    # Plain lists of lists: the rows need not have equal length, and building
    # an ndarray from ragged rows is an error on current NumPy versions.
    predictions = df[predicted_labels_column].apply(convert_to_list).to_list()
    labels = df[target_labels_column].apply(convert_to_list).to_list()

    data_files = {"train": train_file_name}
    raw_datasets = load_dataset("json", data_files=data_files)
    label_list = get_label_list(raw_datasets["train"][train_label_column_name])

    print("predicted_labels_column: ", predicted_labels_column)
    print("len: ", len(predictions))
    print("target_labels_column: ", target_labels_column)
    print("len: ", len(labels))

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # NOTE(review): `load_metric` is deprecated in recent `datasets` releases
    # in favour of the separate `evaluate` package - confirm the pinned version.
    metric = load_metric("seqeval")
    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    # zero_division=0 matches the metric above and avoids the undefined-metric
    # warnings for classes without predictions or support.
    print("Classification Report")
    print(classification_report(true_labels, true_predictions, zero_division=0))
    print("Evaluation Mode STRICT")
    print(classification_report(true_labels, true_predictions, mode='strict', scheme=IOB2, zero_division=0))

    return format_output_seqeval(results, return_format)

In [23]:
# Locations of previously written prediction files (tuple and BIO output).
# The date part is fixed to "20240131" instead of current_date, so this cell
# points at the files of that run; output_annotations_path_prefix comes from
# the annotation cell above.
model_name_str = "bert-base-uncased"
annotated_data_path = output_annotations_path_prefix + "ct_neuro_test_annotated_{}_{}_test.csv".format(model_name_str, "20240131")
annotated_data_path_bio = output_annotations_path_prefix + "ct_neuro_test_annotated_{}_BIO_{}_test.csv".format(model_name_str, "20240131")

In [24]:
# Evaluate the BIO predictions file: gold ids are read from the "labels" column
# and predicted ids from "predictions_<model name>"; the train split is only
# used to rebuild the label list. return_format="all" prints the unmodified
# seqeval result dictionary.
print(evaluate_bert_bio(annotated_data_path_bio, "../data/annotated_data/data_splits/ct_neuro_train_data_713.json",
                                          "ner_tags", return_format="all", target_labels_column="labels",
                                          predicted_labels_column=f"predictions_{model_name_str}"))

*** returning unique labels ***
predicted_labels_column:  predictions_bert-base-uncased
len:  90
target_labels_column:  labels
len:  90
Classification Report
              precision    recall  f1-score   support

         BEH       0.23      0.15      0.18        48
        COND       0.77      0.78      0.78       468
        CTRL       0.54      0.50      0.52        38
        DRUG       0.72      0.88      0.79       128
       OTHER       0.33      0.41      0.37       134
        PHYS       0.51      0.39      0.44        66
       RADIO       0.00      0.00      0.00         1
        SURG       0.00      0.00      0.00         0

   micro avg       0.64      0.66      0.65       883
   macro avg       0.39      0.39      0.38       883
weighted avg       0.64      0.66      0.65       883

Evaluation Mode STRICT
              precision    recall  f1-score   support

         BEH       0.32      0.15      0.20        48
        COND       0.79      0.78      0.79       468
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
